/*
 * Copyright (c) 2013, Red Hat Inc.
 * Copyright (c) 1997, 2012, Oracle and/or its affiliates.
 * All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "interpreter/interpreter.hpp"

#include "compiler/disassembler.hpp"
#include "gc_interface/collectedHeap.inline.hpp"
#include "gc_implementation/shenandoah/shenandoahBrooksPointer.hpp"
#include "gc_implementation/shenandoah/shenandoahHeap.hpp"
#include "gc_implementation/shenandoah/shenandoahHeap.inline.hpp"
#include "gc_implementation/shenandoah/shenandoahHeapRegion.hpp"
#include "memory/resourceArea.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/sharedRuntime.hpp"

// #include "gc_interface/collectedHeap.inline.hpp"
// #include "interpreter/interpreter.hpp"
// #include "memory/cardTableModRefBS.hpp"
// #include "prims/methodHandles.hpp"
// #include "runtime/biasedLocking.hpp"
// #include "runtime/interfaceSupport.hpp"
// #include "runtime/objectMonitor.hpp"
// #include "runtime/os.hpp"
// #include "runtime/sharedRuntime.hpp"
// #include "runtime/stubRoutines.hpp"

#if INCLUDE_ALL_GCS
#include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
#include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
#include "gc_implementation/g1/heapRegion.hpp"
#endif

#ifdef COMPILER2
#include "opto/node.hpp"
#include "opto/compile.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
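  // Branch and load-literal offsets are encoded in units of 32-bit
  // instruction words, hence the scaling by 4 here; the PC-relative
  // (adr/adrp) case below recomputes a byte offset of its own.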
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target-branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                    21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                   Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
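    // adr/adrp split their 21-bit immediate: the low two bits (immlo) go
    // in bits 30..29, the remaining 19 bits (immhi) in bits 23..5.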
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
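    // The 48-bit constant is spread over movz/movk/movk, 16 bits apiece;
    // e.g. patching in the target 0x123456789abc produces
    //   movz Rd, #0x9abc
    //   movk Rd, #0x5678, lsl #16
    //   movk Rd, #0x1234, lsl #32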
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
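  // For example, a narrow OOP 0x12345678 is patched in as
  //   movz Rd, #0x1234, lsl #16
  //   movk Rd, #0x5678
  // leaving the zero-extended 32-bit value in Rd.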
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = oopDesc::encode_heap_oop((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
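      // An adrp target is 4K-page aligned: keep only the page bits here;
      // any offset within the page is recovered from the next instruction.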
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110  &&
               Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

void MacroAssembler::serialize_memory(Register thread, Register tmp) {
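  // A full-system data synchronization barrier is (conservatively)
  // sufficient here; the thread and tmp registers required by the shared
  // interface are unused on AArch64.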
  dsb(Assembler::SY);
}


void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp and resp of the last Java frame have to be
// recorded in the (thread-local) JavaThread object. When leaving C land,
// the last Java fp has to be reset to 0. This is required to allow proper
// stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  if (last_java_pc != NULL) {
    adr(scratch, last_java_pc);
  } else {
    // FIXME: This is almost never correct.  We should delete all
    // cases of set_last_Java_frame with last_java_pc=NULL and use the
    // correct return address instead.
    adr(scratch, pc());
  }

  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb (see the assert above).
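    // adrp forms the page-aligned part of entry's address in tmp and
    // returns the remaining within-page byte offset in 'offset', which the
    // add below folds back in.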
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb (see the assert above).
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
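  // tmp_reg is now zero (ignoring the masked-out age bits) iff the mark
  // word equals (prototype header | current thread), i.e. the object is
  // biased to this thread in the current epoch.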
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}


// added to make this compile

REGISTER_DEFINITION(Register, noreg);

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg ) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg ) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg ) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg ) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result, "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small,
// trampolines won't be emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  unsigned int start_offset = offset();
#ifdef COMPILER2
  // We need a trampoline if branches are far.
  if (far_branches()) {
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    bool in_scratch_emit_size =
      ((task != NULL) && is_c2_compile(task->comp_level())
       && Compile::current()->in_scratch_emit_size());
    if (! in_scratch_emit_size) {
      address stub = emit_trampoline_stub(start_offset, entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }
#endif

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
#ifdef COMPILER2
  if (!far_branches()) {
    bl(entry.target());
  } else {
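    // Emit a branch-and-link to self as a placeholder; when the call is
    // later resolved it is redirected, via the trampoline stub emitted
    // above, to the out-of-range destination.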
    bl(pc());
  }
#else
  bl(entry.target());
#endif
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
#ifdef COMPILER2
  address stub = start_a_stub(Compile::MAX_stubs_size/2);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
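  // The ldr below is a PC-relative literal load: it fetches the 64-bit
  // destination emitted just after the br, at the 'target' label.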
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub;
#else
  ShouldNotReachHere();
  return NULL;
#endif
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

address MacroAssembler::ic_call(address entry) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc());
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
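  // Pad with nops until the current code offset is a multiple of modulus.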
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


void MacroAssembler::notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler::notify(type);
    // reset_last_Java_frame(true);
  }
  else
    Assembler::notify(type);
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");

  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size() * wordSize;
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, InstanceKlass::vtable_length_offset() * wordSize));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);
  if (HeapWordsPerLong > 1) {
    // Round up to align_object_offset boundary
    // see code for instanceKlass::start_of_itable!
    round_to(scan_temp, BytesPerLong);
  }

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

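  // The scan loop is peeled once: on the first iteration a hit branches
  // forward to found_method; afterwards a miss branches back to search,
  // so a hit falls through into found_method.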
  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  if (return_method) {
    // Got a hit.
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = InstanceKlass::vtable_start_offset() * wordSize;
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                           Register super_klass,
                           Register temp_reg,
                           Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                        RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    // The offset is positive, so a 32-bit zero-extending load (ldrw)
    // does the right thing on LP64.
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    cmp(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer-sized words at [addr] for an occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// scans count 4-byte words at [addr] for an occurrence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  // Get super_klass value into r0 (even if it was in r5 or r2).
  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

#ifndef PRODUCT
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.  (A positive 32-bit load does the right thing on LP64.)
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  mov(r0, reg);
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    lea(r0, addr);
    ldr(r0, Address(r0, 4 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop_addr");
}

Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  if (arg_slot.is_constant()) {
    return Address(esp, arg_slot.as_constant() * stackElementSize
                   + offset);
  } else {
    add(rscratch1, esp, arg_slot.as_register(),
        ext::uxtx, exact_log2(stackElementSize));
    return Address(rscratch1, offset);
  }
}

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments,
                                       Label *retaddr) {
  call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
}

void MacroAssembler::call_VM_leaf_base1(address entry_point,
                                        int number_of_gp_arguments,
                                        int number_of_fp_arguments,
                                        ret_type type,
                                        Label *retaddr) {
  Label E, L;

  stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));

1360   // We add 1 to number_of_arguments because the thread in arg0 is
1361   // not counted
1362   mov(rscratch1, entry_point);
1363   blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1364   if (retaddr)
1365     bind(*retaddr);
1366 
1367   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1368   maybe_isb();
1369 }
1370 
1371 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1372   call_VM_leaf_base(entry_point, number_of_arguments);
1373 }
1374 
1375 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1376   pass_arg0(this, arg_0);
1377   call_VM_leaf_base(entry_point, 1);
1378 }
1379 
1380 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1381   pass_arg0(this, arg_0);
1382   pass_arg1(this, arg_1);
1383   call_VM_leaf_base(entry_point, 2);
1384 }
1385 
1386 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1387                                   Register arg_1, Register arg_2) {
1388   pass_arg0(this, arg_0);
1389   pass_arg1(this, arg_1);
1390   pass_arg2(this, arg_2);
1391   call_VM_leaf_base(entry_point, 3);
1392 }
1393 
1394 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1395   pass_arg0(this, arg_0);
1396   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1397 }
1398 
1399 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1400 
1401   assert(arg_0 != c_rarg1, "smashed arg");
1402   pass_arg1(this, arg_1);
1403   pass_arg0(this, arg_0);
1404   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1405 }
1406 
1407 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1408   assert(arg_0 != c_rarg2, "smashed arg");
1409   assert(arg_1 != c_rarg2, "smashed arg");
1410   pass_arg2(this, arg_2);
1411   assert(arg_0 != c_rarg1, "smashed arg");
1412   pass_arg1(this, arg_1);
1413   pass_arg0(this, arg_0);
1414   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1415 }
1416 
1417 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1418   assert(arg_0 != c_rarg3, "smashed arg");
1419   assert(arg_1 != c_rarg3, "smashed arg");
1420   assert(arg_2 != c_rarg3, "smashed arg");
1421   pass_arg3(this, arg_3);
1422   assert(arg_0 != c_rarg2, "smashed arg");
1423   assert(arg_1 != c_rarg2, "smashed arg");
1424   pass_arg2(this, arg_2);
1425   assert(arg_0 != c_rarg1, "smashed arg");
1426   pass_arg1(this, arg_1);
1427   pass_arg0(this, arg_0);
1428   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1429 }
1430 
1431 void MacroAssembler::null_check(Register reg, int offset) {
1432   if (needs_explicit_null_check(offset)) {
1433     // provoke OS NULL exception if reg = NULL by
1434     // accessing M[reg] w/o changing any registers
1435     // NOTE: this is plenty to provoke a segv
1436     ldr(zr, Address(reg));
1437   } else {
1438     // nothing to do, (later) access of M[reg + offset]
1439     // will provoke OS NULL exception if reg = NULL
1440   }
1441 }
1442 
1443 // MacroAssembler protected routines needed to implement
1444 // public methods
1445 
1446 void MacroAssembler::mov(Register r, Address dest) {
1447   code_section()->relocate(pc(), dest.rspec());
1448   u_int64_t imm64 = (u_int64_t)dest.target();
1449   movptr(r, imm64);
1450 }
1451 
1452 // Move a constant pointer into r.  In AArch64 mode the virtual
1453 // address space is 48 bits in size, so we only need three
1454 // instructions to create a patchable instruction sequence that can
1455 // reach anywhere.
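// For example (illustrative), imm64 == 0x123456789abc emits:
//   movz r, #0x9abc
//   movk r, #0x5678, lsl #16
//   movk r, #0x1234, lsl #32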
1456 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1457 #ifndef PRODUCT
1458   {
1459     char buffer[64];
1460     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1461     block_comment(buffer);
1462   }
1463 #endif
1464   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1465   movz(r, imm64 & 0xffff);
1466   imm64 >>= 16;
1467   movk(r, imm64 & 0xffff, 16);
1468   imm64 >>= 16;
1469   movk(r, imm64 & 0xffff, 32);
1470 }
1471 
1472 // Macro to mov replicated immediate to vector register.
1473 //  Vd will get the following values for different arrangements in T
1474 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1475 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1476 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1477 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1478 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1479 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1480 //   T1D/T2D: invalid
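//   e.g. (illustrative) T4S with imm32 == hex ffffff00: ~imm32 has fewer
//   non-zero bytes than imm32, so a single  mvni Vd.4S, #0xff  suffices.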
1481 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1482   assert(T != T1D && T != T2D, "invalid arrangement");
1483   if (T == T8B || T == T16B) {
1484     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1485     movi(Vd, T, imm32 & 0xff, 0);
1486     return;
1487   }
1488   u_int32_t nimm32 = ~imm32;
1489   if (T == T4H || T == T8H) {
1490     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1491     imm32 &= 0xffff;
1492     nimm32 &= 0xffff;
1493   }
1494   u_int32_t x = imm32;
1495   int movi_cnt = 0;
1496   int movn_cnt = 0;
1497   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1498   x = nimm32;
1499   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1500   if (movn_cnt < movi_cnt) imm32 = nimm32;
1501   unsigned lsl = 0;
1502   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1503   if (movn_cnt < movi_cnt)
1504     mvni(Vd, T, imm32 & 0xff, lsl);
1505   else
1506     movi(Vd, T, imm32 & 0xff, lsl);
1507   imm32 >>= 8; lsl += 8;
1508   while (imm32) {
1509     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1510     if (movn_cnt < movi_cnt)
1511       bici(Vd, T, imm32 & 0xff, lsl);
1512     else
1513       orri(Vd, T, imm32 & 0xff, lsl);
1514     lsl += 8; imm32 >>= 8;
1515   }
1516 }
1517 
1518 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1519 {
1520 #ifndef PRODUCT
1521   {
1522     char buffer[64];
1523     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1524     block_comment(buffer);
1525   }
1526 #endif
1527   if (operand_valid_for_logical_immediate(false, imm64)) {
1528     orr(dst, zr, imm64);
1529   } else {
1530     // we can use a combination of MOVZ or MOVN with
1531     // MOVK to build up the constant
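    // e.g. (illustrative) imm64 == 0xffff0000ffffffff has three 0xffff
    // halfwords, so a single  movn dst, #0xffff, lsl #32  suffices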
1532     u_int64_t imm_h[4];
1533     int zero_count = 0;
1534     int neg_count = 0;
1535     int i;
1536     for (i = 0; i < 4; i++) {
1537       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1538       if (imm_h[i] == 0) {
1539         zero_count++;
1540       } else if (imm_h[i] == 0xffffL) {
1541         neg_count++;
1542       }
1543     }
1544     if (zero_count == 4) {
1545       // one MOVZ will do
1546       movz(dst, 0);
1547     } else if (neg_count == 4) {
1548       // one MOVN will do
1549       movn(dst, 0);
1550     } else if (zero_count == 3) {
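      // one MOVZ will do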
1551       for (i = 0; i < 4; i++) {
1552         if (imm_h[i] != 0L) {
1553           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1554           break;
1555         }
1556       }
1557     } else if (neg_count == 3) {
1558       // one MOVN will do
1559       for (int i = 0; i < 4; i++) {
1560         if (imm_h[i] != 0xffffL) {
1561           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1562           break;
1563         }
1564       }
1565     } else if (zero_count == 2) {
1566       // one MOVZ and one MOVK will do
1567       for (i = 0; i < 3; i++) {
1568         if (imm_h[i] != 0L) {
1569           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1570           i++;
1571           break;
1572         }
1573       }
1574       for (;i < 4; i++) {
1575         if (imm_h[i] != 0L) {
1576           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1577         }
1578       }
1579     } else if (neg_count == 2) {
1580       // one MOVN and one MOVK will do
1581       for (i = 0; i < 4; i++) {
1582         if (imm_h[i] != 0xffffL) {
1583           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1584           i++;
1585           break;
1586         }
1587       }
1588       for (;i < 4; i++) {
1589         if (imm_h[i] != 0xffffL) {
1590           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1591         }
1592       }
1593     } else if (zero_count == 1) {
1594       // one MOVZ and two MOVKs will do
1595       for (i = 0; i < 4; i++) {
1596         if (imm_h[i] != 0L) {
1597           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1598           i++;
1599           break;
1600         }
1601       }
1602       for (;i < 4; i++) {
1603         if (imm_h[i] != 0x0L) {
1604           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1605         }
1606       }
1607     } else if (neg_count == 1) {
1608       // one MOVN and two MOVKs will do
1609       for (i = 0; i < 4; i++) {
1610         if (imm_h[i] != 0xffffL) {
1611           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1612           i++;
1613           break;
1614         }
1615       }
1616       for (;i < 4; i++) {
1617         if (imm_h[i] != 0xffffL) {
1618           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1619         }
1620       }
1621     } else {
1622       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1623       movz(dst, (u_int32_t)imm_h[0], 0);
1624       for (i = 1; i < 4; i++) {
1625         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1626       }
1627     }
1628   }
1629 }
1630 
1631 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1632 {
1633 #ifndef PRODUCT
1634   {
1635     char buffer[64];
1636     snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1637     block_comment(buffer);
1638   }
1639 #endif
1640   if (operand_valid_for_logical_immediate(true, imm32)) {
1641     orrw(dst, zr, imm32);
1642   } else {
1643     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1644     // constant
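    // e.g. (illustrative) imm32 == 0x12340000 becomes a single
    //   movzw dst, #0x1234, lsl #16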
1645     u_int32_t imm_h[2];
1646     imm_h[0] = imm32 & 0xffff;
1647     imm_h[1] = ((imm32 >> 16) & 0xffff);
1648     if (imm_h[0] == 0) {
1649       movzw(dst, imm_h[1], 16);
1650     } else if (imm_h[0] == 0xffff) {
1651       movnw(dst, imm_h[1] ^ 0xffff, 16);
1652     } else if (imm_h[1] == 0) {
1653       movzw(dst, imm_h[0], 0);
1654     } else if (imm_h[1] == 0xffff) {
1655       movnw(dst, imm_h[0] ^ 0xffff, 0);
1656     } else {
1657       // use a MOVZ and MOVK (makes it easier to debug)
1658       movzw(dst, imm_h[0], 0);
1659       movkw(dst, imm_h[1], 16);
1660     }
1661   }
1662 }
1663 
1664 void MacroAssembler::mov(Register dst, address addr) {
1665   assert(Universe::heap() == NULL
1666          || !Universe::heap()->is_in(addr), "use movptr for oop pointers");
1667   mov_immediate64(dst, (uintptr_t)addr);
1668 }
1669 
1670 // Form an address from base + offset in Rd.  Rd may or may
1671 // not actually be used: you must use the Address that is returned.
1672 // It is up to you to ensure that the shift provided matches the size
1673 // of your data.
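// e.g. (illustrative) base + 0x40010 with shift == 3 does not fit a scaled
// 12-bit immediate, so this emits  add Rd, base, #0x40000  and returns
// Address(Rd, 0x10).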
1674 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1675   if (Address::offset_ok_for_immed(byte_offset, shift))
1676     // It fits; no need for any heroics
1677     return Address(base, byte_offset);
1678 
1679   // Don't do anything clever with negative or misaligned offsets
1680   unsigned mask = (1 << shift) - 1;
1681   if (byte_offset < 0 || byte_offset & mask) {
1682     mov(Rd, byte_offset);
1683     add(Rd, base, Rd);
1684     return Address(Rd);
1685   }
1686 
1687   // See if we can do this with two 12-bit offsets
1688   {
1689     unsigned long word_offset = byte_offset >> shift;
1690     unsigned long masked_offset = word_offset & 0xfff000;
1691     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1692         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1693       add(Rd, base, masked_offset << shift);
1694       word_offset -= masked_offset;
1695       return Address(Rd, word_offset << shift);
1696     }
1697   }
1698 
1699   // Do it the hard way
1700   mov(Rd, byte_offset);
1701   add(Rd, base, Rd);
1702   return Address(Rd);
1703 }
1704 
1705 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1706   if (UseLSE) {
1707     mov(tmp, 1);
1708     ldadd(Assembler::word, tmp, zr, counter_addr);
1709     return;
1710   }
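  // LL/SC retry loop; semantically (a sketch):
  //   do { tmp = *counter_addr; } while (!store_exclusive(counter_addr, tmp + 1));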
1711   Label retry_load;
1712   if ((VM_Version::cpu_cpuFeatures() & VM_Version::CPU_STXR_PREFETCH))
1713     prfm(Address(counter_addr), PSTL1STRM);
1714   bind(retry_load);
1715   // flush and load exclusive from the memory location
1716   ldxrw(tmp, counter_addr);
1717   addw(tmp, tmp, 1);
1718   // if we store+flush with no intervening write tmp2 will be zero
1719   stxrw(tmp2, tmp, counter_addr);
1720   cbnzw(tmp2, retry_load);
1721 }
1722 
1723 
1724 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1725                                     bool want_remainder, Register scratch)
1726 {
1727   // Full implementation of Java idiv and irem.  The function
1728   // returns the (pc) offset of the div instruction - may be needed
1729   // for implicit exceptions.
1730   //
1731   // constraint : ra/rb =/= scratch
1732   //         normal case
1733   //
1734   // input : ra: dividend
1735   //         rb: divisor
1736   //
1737   // result: either
1738   //         quotient  (= ra idiv rb)
1739   //         remainder (= ra irem rb)
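  //
  // n.b. AArch64 sdivw already rounds toward zero and defines
  // MIN_VALUE / -1 == MIN_VALUE, matching Java semantics, so no fix-up
  // branches are needed; e.g. -7 / 2 == -3, and msubw then yields the
  // Java remainder -7 % 2 == -1 (ra - quotient * rb).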
1740 
1741   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1742 
1743   int idivl_offset = offset();
1744   if (! want_remainder) {
1745     sdivw(result, ra, rb);
1746   } else {
1747     sdivw(scratch, ra, rb);
1748     Assembler::msubw(result, scratch, rb, ra);
1749   }
1750 
1751   return idivl_offset;
1752 }
1753 
1754 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1755                                     bool want_remainder, Register scratch)
1756 {
1757   // Full implementation of Java ldiv and lrem.  The function
1758   // returns the (pc) offset of the div instruction - may be needed
1759   // for implicit exceptions.
1760   //
1761   // constraint : ra/rb =/= scratch
1762   //         normal case
1763   //
1764   // input : ra: dividend
1765   //         rb: divisor
1766   //
1767   // result: either
1768   //         quotient  (= ra idiv rb)
1769   //         remainder (= ra irem rb)
1770 
1771   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1772 
1773   int idivq_offset = offset();
1774   if (! want_remainder) {
1775     sdiv(result, ra, rb);
1776   } else {
1777     sdiv(scratch, ra, rb);
1778     Assembler::msub(result, scratch, rb, ra);
1779   }
1780 
1781   return idivq_offset;
1782 }
1783 
1784 // MacroAssembler routines we have actually found to be needed
1785 
1786 void MacroAssembler::push(Register src)
1787 {
1788   str(src, Address(pre(esp, -1 * wordSize)));
1789 }
1790 
1791 void MacroAssembler::pop(Register dst)
1792 {
1793   ldr(dst, Address(post(esp, 1 * wordSize)));
1794 }
1795 
1796 // Note: load_unsigned_short used to be called load_unsigned_word.
1797 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1798   int off = offset();
1799   ldrh(dst, src);
1800   return off;
1801 }
1802 
1803 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1804   int off = offset();
1805   ldrb(dst, src);
1806   return off;
1807 }
1808 
1809 int MacroAssembler::load_signed_short(Register dst, Address src) {
1810   int off = offset();
1811   ldrsh(dst, src);
1812   return off;
1813 }
1814 
1815 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1816   int off = offset();
1817   ldrsb(dst, src);
1818   return off;
1819 }
1820 
1821 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1822   int off = offset();
1823   ldrshw(dst, src);
1824   return off;
1825 }
1826 
1827 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1828   int off = offset();
1829   ldrsbw(dst, src);
1830   return off;
1831 }
1832 
1833 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1834   switch (size_in_bytes) {
1835   case  8:  ldr(dst, src); break;
1836   case  4:  ldrw(dst, src); break;
1837   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1838   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1839   default:  ShouldNotReachHere();
1840   }
1841 }
1842 
1843 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1844   switch (size_in_bytes) {
1845   case  8:  str(src, dst); break;
1846   case  4:  strw(src, dst); break;
1847   case  2:  strh(src, dst); break;
1848   case  1:  strb(src, dst); break;
1849   default:  ShouldNotReachHere();
1850   }
1851 }
1852 
1853 void MacroAssembler::decrementw(Register reg, int value)
1854 {
1855   if (value < 0)  { incrementw(reg, -value);      return; }
1856   if (value == 0) {                               return; }
1857   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1858   /* else */ {
1859     guarantee(reg != rscratch2, "invalid dst for register decrement");
1860     movw(rscratch2, (unsigned)value);
1861     subw(reg, reg, rscratch2);
1862   }
1863 }
1864 
1865 void MacroAssembler::decrement(Register reg, int value)
1866 {
1867   if (value < 0)  { increment(reg, -value);      return; }
1868   if (value == 0) {                              return; }
1869   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1870   /* else */ {
1871     assert(reg != rscratch2, "invalid dst for register decrement");
1872     mov(rscratch2, (unsigned long)value);
1873     sub(reg, reg, rscratch2);
1874   }
1875 }
1876 
1877 void MacroAssembler::decrementw(Address dst, int value)
1878 {
1879   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1880   ldrw(rscratch1, dst);
1881   decrementw(rscratch1, value);
1882   strw(rscratch1, dst);
1883 }
1884 
1885 void MacroAssembler::decrement(Address dst, int value)
1886 {
1887   assert(!dst.uses(rscratch1), "invalid address for decrement");
1888   ldr(rscratch1, dst);
1889   decrement(rscratch1, value);
1890   str(rscratch1, dst);
1891 }
1892 
1893 void MacroAssembler::incrementw(Register reg, int value)
1894 {
1895   if (value < 0)  { decrementw(reg, -value);      return; }
1896   if (value == 0) {                               return; }
1897   if (value < (1 << 12)) { addw(reg, reg, value); return; }
1898   /* else */ {
1899     assert(reg != rscratch2, "invalid dst for register increment");
1900     movw(rscratch2, (unsigned)value);
1901     addw(reg, reg, rscratch2);
1902   }
1903 }
1904 
1905 void MacroAssembler::increment(Register reg, int value)
1906 {
1907   if (value < 0)  { decrement(reg, -value);      return; }
1908   if (value == 0) {                              return; }
1909   if (value < (1 << 12)) { add(reg, reg, value); return; }
1910   /* else */ {
1911     assert(reg != rscratch2, "invalid dst for register increment");
1912     movw(rscratch2, (unsigned)value);
1913     add(reg, reg, rscratch2);
1914   }
1915 }
1916 
1917 void MacroAssembler::incrementw(Address dst, int value)
1918 {
1919   assert(!dst.uses(rscratch1), "invalid dst for address increment");
1920   ldrw(rscratch1, dst);
1921   incrementw(rscratch1, value);
1922   strw(rscratch1, dst);
1923 }
1924 
1925 void MacroAssembler::increment(Address dst, int value)
1926 {
1927   assert(!dst.uses(rscratch1), "invalid dst for address increment");
1928   ldr(rscratch1, dst);
1929   increment(rscratch1, value);
1930   str(rscratch1, dst);
1931 }
1932 
1933 
1934 void MacroAssembler::pusha() {
1935   push(0x7fffffff, sp);
1936 }
1937 
1938 void MacroAssembler::popa() {
1939   pop(0x7fffffff, sp);
1940 }
1941 
1942 // Push lots of registers in the bit set supplied.  Don't push sp.
1943 // Return the number of words pushed
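// e.g. (a sketch, with stack == sp) a bitset naming {r0, r1, r2} is padded
// with zr to an even count and emits:
//   stp r0, r1, [sp, #-32]!
//   stp r2, zr, [sp, #16]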
1944 int MacroAssembler::push(unsigned int bitset, Register stack) {
1945   int words_pushed = 0;
1946 
1947   // Scan bitset to accumulate register pairs
1948   unsigned char regs[32];
1949   int count = 0;
1950   for (int reg = 0; reg <= 30; reg++) {
1951     if (1 & bitset)
1952       regs[count++] = reg;
1953     bitset >>= 1;
1954   }
1955   regs[count++] = zr->encoding_nocheck();
1956   count &= ~1;  // Only push an even number of regs
1957 
1958   if (count) {
1959     stp(as_Register(regs[0]), as_Register(regs[1]),
1960        Address(pre(stack, -count * wordSize)));
1961     words_pushed += 2;
1962   }
1963   for (int i = 2; i < count; i += 2) {
1964     stp(as_Register(regs[i]), as_Register(regs[i+1]),
1965        Address(stack, i * wordSize));
1966     words_pushed += 2;
1967   }
1968 
1969   assert(words_pushed == count, "oops, pushed != count");
1970 
1971   return count;
1972 }
1973 
1974 int MacroAssembler::pop(unsigned int bitset, Register stack) {
1975   int words_pushed = 0;
1976 
1977   // Scan bitset to accumulate register pairs
1978   unsigned char regs[32];
1979   int count = 0;
1980   for (int reg = 0; reg <= 30; reg++) {
1981     if (1 & bitset)
1982       regs[count++] = reg;
1983     bitset >>= 1;
1984   }
1985   regs[count++] = zr->encoding_nocheck();
1986   count &= ~1;
1987 
1988   for (int i = 2; i < count; i += 2) {
1989     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
1990        Address(stack, i * wordSize));
1991     words_pushed += 2;
1992   }
1993   if (count) {
1994     ldp(as_Register(regs[0]), as_Register(regs[1]),
1995        Address(post(stack, count * wordSize)));
1996     words_pushed += 2;
1997   }
1998 
1999   assert(words_pushed == count, "oops, pushed != count");
2000 
2001   return count;
2002 }
2003 #ifdef ASSERT
2004 void MacroAssembler::verify_heapbase(const char* msg) {
2005 #if 0
2006   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2007   assert (Universe::heap() != NULL, "java heap should be initialized");
2008   if (CheckCompressedOops) {
2009     Label ok;
2010     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2011     cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2012     br(Assembler::EQ, ok);
2013     stop(msg);
2014     bind(ok);
2015     pop(1 << rscratch1->encoding(), sp);
2016   }
2017 #endif
2018 }
2019 #endif
2020 
2021 void MacroAssembler::stop(const char* msg) {
2022   address ip = pc();
2023   pusha();
2024   mov(c_rarg0, (address)msg);
2025   mov(c_rarg1, (address)ip);
2026   mov(c_rarg2, sp);
2027   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2028   // call(c_rarg3);
2029   blrt(c_rarg3, 3, 0, 1);
2030   hlt(0);
2031 }
2032 
2033 // If a constant does not fit in an immediate field, generate some
2034 // number of MOV instructions and then perform the operation.
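// e.g. (illustrative) an addition of 0x123456 does not fit a 12-bit
// immediate but splits into two that do:
//   add Rd, Rn, #0x123000
//   add Rd, Rd, #0x456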
2035 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2036                                            add_sub_imm_insn insn1,
2037                                            add_sub_reg_insn insn2) {
2038   assert(Rd != zr, "Rd = zr and not setting flags?");
2039   if (operand_valid_for_add_sub_immediate((int)imm)) {
2040     (this->*insn1)(Rd, Rn, imm);
2041   } else {
2042     if (uabs(imm) < (1 << 24)) {
2043        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2044        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2045     } else {
2046        assert_different_registers(Rd, Rn);
2047        mov(Rd, (uint64_t)imm);
2048        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2049     }
2050   }
2051 }
2052 
2053 // Separate version which sets the flags. Optimisations are more restricted
2054 // because we must set the flags correctly.
2055 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2056                                            add_sub_imm_insn insn1,
2057                                            add_sub_reg_insn insn2) {
2058   if (operand_valid_for_add_sub_immediate((int)imm)) {
2059     (this->*insn1)(Rd, Rn, imm);
2060   } else {
2061     assert_different_registers(Rd, Rn);
2062     assert(Rd != zr, "overflow in immediate operand");
2063     mov(Rd, (uint64_t)imm);
2064     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2065   }
2066 }
2067 
2068 
2069 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2070   if (increment.is_register()) {
2071     add(Rd, Rn, increment.as_register());
2072   } else {
2073     add(Rd, Rn, increment.as_constant());
2074   }
2075 }
2076 
2077 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2078   if (increment.is_register()) {
2079     addw(Rd, Rn, increment.as_register());
2080   } else {
2081     addw(Rd, Rn, increment.as_constant());
2082   }
2083 }
2084 
2085 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2086   if (decrement.is_register()) {
2087     sub(Rd, Rn, decrement.as_register());
2088   } else {
2089     sub(Rd, Rn, decrement.as_constant());
2090   }
2091 }
2092 
2093 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2094   if (decrement.is_register()) {
2095     subw(Rd, Rn, decrement.as_register());
2096   } else {
2097     subw(Rd, Rn, decrement.as_constant());
2098   }
2099 }
2100 
2101 void MacroAssembler::reinit_heapbase()
2102 {
2103   if (UseCompressedOops) {
2104     if (Universe::is_fully_initialized()) {
2105       mov(rheapbase, Universe::narrow_ptrs_base());
2106     } else {
2107       lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2108       ldr(rheapbase, Address(rheapbase));
2109     }
2110   }
2111 }
2112 
2113 // this simulates the behaviour of the x86 cmpxchg instruction using a
2114 // load linked/store conditional pair. we use the acquire/release
2115 // versions of these instructions so that we flush pending writes as
2116 // per Java semantics.
2117 
2118 // n.b. the x86 version assumes the old value to be compared against is
2119 // in rax and updates rax with the value located in memory if the
2120 // cmpxchg fails. we supply a register for the old value explicitly
2121 
2122 // the aarch64 load linked/store conditional instructions do not
2123 // accept an offset. so, unlike x86, we must provide a plain register
2124 // to identify the memory word to be compared/exchanged rather than a
2125 // register+offset Address.
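// the overall contract is (a sketch):
//   if (*addr == oldv) { *addr = newv; goto succeed; }
//   else               { oldv = *addr; if (fail) goto *fail; }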
2126 
2127 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2128                                 Label &succeed, Label *fail) {
2129   // oldv holds comparison value
2130   // newv holds value to write in exchange
2131   // addr identifies memory word to compare against/update
2132   if (UseLSE) {
2133     mov(tmp, oldv);
2134     casal(Assembler::xword, oldv, newv, addr);
2135     cmp(tmp, oldv);
2136     br(Assembler::EQ, succeed);
2137     membar(AnyAny);
2138   } else {
2139     Label retry_load, nope;
2140     if ((VM_Version::cpu_cpuFeatures() & VM_Version::CPU_STXR_PREFETCH))
2141       prfm(Address(addr), PSTL1STRM);
2142     bind(retry_load);
2143     // flush and load exclusive from the memory location
2144     // and fail if it is not what we expect
2145     ldaxr(tmp, addr);
2146     cmp(tmp, oldv);
2147     br(Assembler::NE, nope);
2148     // if we store+flush with no intervening write tmp will be zero
2149     stlxr(tmp, newv, addr);
2150     cbzw(tmp, succeed);
2151     // retry so we only ever return after a load fails to compare
2152     // ensures we don't return a stale value after a failed write.
2153     b(retry_load);
2154     // if the memory word differs we return it in oldv and signal a fail
2155     bind(nope);
2156     membar(AnyAny);
2157     mov(oldv, tmp);
2158   }
2159   if (fail)
2160     b(*fail);
2161 }
2162 
2163 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2164                                 Label &succeed, Label *fail) {
2165   // oldv holds comparison value
2166   // newv holds value to write in exchange
2167   // addr identifies memory word to compare against/update
2168   // tmp returns 0/1 for success/failure
2169   if (UseLSE) {
2170     mov(tmp, oldv);
2171     casal(Assembler::word, oldv, newv, addr);
2172     cmp(tmp, oldv);
2173     br(Assembler::EQ, succeed);
2174     membar(AnyAny);
2175   } else {
2176     Label retry_load, nope;
2177     if ((VM_Version::cpu_cpuFeatures() & VM_Version::CPU_STXR_PREFETCH))
2178       prfm(Address(addr), PSTL1STRM);
2179     bind(retry_load);
2180     // flush and load exclusive from the memory location
2181     // and fail if it is not what we expect
2182     ldaxrw(tmp, addr);
2183     cmp(tmp, oldv);
2184     br(Assembler::NE, nope);
2185     // if we store+flush with no intervening write tmp will be zero
2186     stlxrw(tmp, newv, addr);
2187     cbzw(tmp, succeed);
2188     // retry so we only ever return after a load fails to compare
2189     // ensures we don't return a stale value after a failed write.
2190     b(retry_load);
2191     // if the memory word differs we return it in oldv and signal a fail
2192     bind(nope);
2193     membar(AnyAny);
2194     mov(oldv, tmp);
2195   }
2196   if (fail)
2197     b(*fail);
2198 }
2199 
2200 // A generic CAS; success or failure is in the EQ flag.
2201 void MacroAssembler::cmpxchg(Register addr, Register expected,
2202                              Register new_val,
2203                              enum operand_size size,
2204                              bool acquire, bool release,
2205                              Register tmp) {
2206   if (UseLSE) {
2207     mov(tmp, expected);
2208     lse_cas(tmp, new_val, addr, size, acquire, release, /*not_pair*/ true);
2209     cmp(tmp, expected);
2210   } else {
2211     BLOCK_COMMENT("cmpxchg {");
2212     Label retry_load, done;
2213     if ((VM_Version::cpu_cpuFeatures() & VM_Version::CPU_STXR_PREFETCH))
2214       prfm(Address(addr), PSTL1STRM);
2215     bind(retry_load);
2216     load_exclusive(tmp, addr, size, acquire);
2217     if (size == xword)
2218       cmp(tmp, expected);
2219     else
2220       cmpw(tmp, expected);
2221     br(Assembler::NE, done);
2222     store_exclusive(tmp, new_val, addr, size, release);
2223     cbnzw(tmp, retry_load);
2224     bind(done);
2225     BLOCK_COMMENT("} cmpxchg");
2226   }
2227 }
2228 
2229 void MacroAssembler::cmpxchg_oop_shenandoah(Register addr, Register expected,
2230                                             Register new_val,
2231                                             enum operand_size size,
2232                                             bool acquire, bool release,
2233                                             bool weak,
2234                                             Register result, Register tmp2) {
2235   assert(UseShenandoahGC, "only for shenandoah");
2236   bool is_cae = (result != noreg);
2237   bool is_narrow = (size == word);
2238 
2239   if (! is_cae) result = rscratch1;
2240 
2241   assert_different_registers(addr, expected, new_val, result, tmp2);
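  // Under Shenandoah the CAS may fail spuriously when expected and the
  // memory word are different copies (from-space vs. to-space) of the
  // same object.  Sketch of the fallback below: resolve both through
  // their Brooks pointers; if they then match, retry the CAS with the
  // value we just loaded.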
2242 
2243   Label retry, done, fail;
2244 
2245   // CAS, using LL/SC pair.
2246   bind(retry);
2247   load_exclusive(result, addr, size, acquire);
2248   if (is_narrow) {
2249     cmpw(result, expected);
2250   } else {
2251     cmp(result, expected);
2252   }
2253   br(Assembler::NE, fail);
2254   store_exclusive(tmp2, new_val, addr, size, release);
2255   if (weak) {
2256     cmpw(tmp2, 0u); // If the store fails, return NE to our caller
2257   } else {
2258     cbnzw(tmp2, retry);
2259   }
2260   b(done);
2261 
2262   bind(fail);
2263   // Check if rb(expected)==rb(result)
2264   // Shuffle registers so that we have memory value ready for next expected.
2265   mov(tmp2, expected);
2266   mov(expected, result);
2267   if (is_narrow) {
2268     decode_heap_oop(result, result);
2269     decode_heap_oop(tmp2, tmp2);
2270   }
2271   oopDesc::bs()->interpreter_read_barrier(this, result);
2272   oopDesc::bs()->interpreter_read_barrier(this, tmp2);
2273   cmp(result, tmp2);
2274   // Retry with expected now being the value we just loaded from addr.
2275   br(Assembler::EQ, retry);
2276   if (is_narrow && is_cae) {
2277     // For cmp-and-exchange and narrow oops, we need to restore
2278     // the compressed old-value. We moved it to 'expected' a few lines up.
2279     mov(result, expected);
2280   }
2281   bind(done);
2282 }
2283 
2284 static bool different(Register a, RegisterOrConstant b, Register c) {
2285   if (b.is_constant())
2286     return a != c;
2287   else
2288     return a != b.as_register() && a != c && b.as_register() != c;
2289 }
2290 
2291 #define ATOMIC_OP(LDXR, OP, IOP, AOP, STXR, sz)                         \
2292 void MacroAssembler::atomic_##OP(Register prev, RegisterOrConstant incr, Register addr) { \
2293   if (UseLSE) {                                                         \
2294     prev = prev->is_valid() ? prev : zr;                                \
2295     if (incr.is_register()) {                                           \
2296       AOP(sz, incr.as_register(), prev, addr);                          \
2297     } else {                                                            \
2298       mov(rscratch2, incr.as_constant());                               \
2299       AOP(sz, rscratch2, prev, addr);                                   \
2300     }                                                                   \
2301     return;                                                             \
2302   }                                                                     \
2303   Register result = rscratch2;                                          \
2304   if (prev->is_valid())                                                      \
2305     result = different(prev, incr, addr) ? prev : rscratch2;            \
2306                                                                         \
2307   Label retry_load;                                                     \
2308   if ((VM_Version::cpu_cpuFeatures() & VM_Version::CPU_STXR_PREFETCH))         \
2309     prfm(Address(addr), PSTL1STRM);                                     \
2310   bind(retry_load);                                                     \
2311   LDXR(result, addr);                                                   \
2312   OP(rscratch1, result, incr);                                          \
2313   STXR(rscratch2, rscratch1, addr);                                     \
2314   cbnzw(rscratch2, retry_load);                                         \
2315   if (prev->is_valid() && prev != result) {                             \
2316     IOP(prev, rscratch1, incr);                                         \
2317   }                                                                     \
2318 }
2319 
2320 ATOMIC_OP(ldxr, add, sub, ldadd, stxr, Assembler::xword)
2321 ATOMIC_OP(ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
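// Each expansion performs, atomically (a sketch):
//   { old = *addr; *addr = old + incr; if (prev->is_valid()) prev = old; }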
2322 
2323 #undef ATOMIC_OP
2324 
2325 #define ATOMIC_XCHG(OP, LDXR, STXR, sz)                                 \
2326 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2327   if (UseLSE) {                                                         \
2328     prev = prev->is_valid() ? prev : zr;                                \
2329     swp(sz, newv, prev, addr);                                          \
2330     return;                                                             \
2331   }                                                                     \
2332   Register result = rscratch2;                                          \
2333   if (prev->is_valid())                                                      \
2334     result = different(prev, newv, addr) ? prev : rscratch2;            \
2335                                                                         \
2336   Label retry_load;                                                     \
2337   if ((VM_Version::cpu_cpuFeatures() & VM_Version::CPU_STXR_PREFETCH))         \
2338     prfm(Address(addr), PSTL1STRM);                                     \
2339   bind(retry_load);                                                     \
2340   LDXR(result, addr);                                                   \
2341   STXR(rscratch1, newv, addr);                                          \
2342   cbnzw(rscratch1, retry_load);                                         \
2343   if (prev->is_valid() && prev != result)                            \
2344     mov(prev, result);                                                  \
2345 }
2346 
2347 ATOMIC_XCHG(xchg, ldxr, stxr, Assembler::xword)
2348 ATOMIC_XCHG(xchgw, ldxrw, stxrw, Assembler::word)
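// Each expansion performs, atomically (a sketch):
//   { old = *addr; *addr = newv; if (prev->is_valid()) prev = old; }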
2349 
2350 #undef ATOMIC_XCHG
2351 
2352 void MacroAssembler::incr_allocated_bytes(Register thread,
2353                                           Register var_size_in_bytes,
2354                                           int con_size_in_bytes,
2355                                           Register t1) {
2356   if (!thread->is_valid()) {
2357     thread = rthread;
2358   }
2359   assert(t1->is_valid(), "need temp reg");
2360 
2361   ldr(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
2362   if (var_size_in_bytes->is_valid()) {
2363     add(t1, t1, var_size_in_bytes);
2364   } else {
2365     add(t1, t1, con_size_in_bytes);
2366   }
2367   str(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
2368 }
2369 
2370 #ifndef PRODUCT
2371 extern "C" void findpc(intptr_t x);
2372 #endif
2373 
2374 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2375 {
2376   // In order to get locks to work, we need to fake an in_VM state
2377   if (ShowMessageBoxOnError ) {
2378     JavaThread* thread = JavaThread::current();
2379     JavaThreadState saved_state = thread->thread_state();
2380     thread->set_thread_state(_thread_in_vm);
2381 #ifndef PRODUCT
2382     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2383       ttyLocker ttyl;
2384       BytecodeCounter::print();
2385     }
2386 #endif
2387     if (os::message_box(msg, "Execution stopped, print registers?")) {
2388       ttyLocker ttyl;
2389       tty->print_cr(" pc = 0x%016lx", pc);
2390 #ifndef PRODUCT
2391       tty->cr();
2392       findpc(pc);
2393       tty->cr();
2394 #endif
2395       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2396       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2397       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2398       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2399       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2400       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2401       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2402       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2403       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2404       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2405       tty->print_cr("r10 = 0x%016lx", regs[10]);
2406       tty->print_cr("r11 = 0x%016lx", regs[11]);
2407       tty->print_cr("r12 = 0x%016lx", regs[12]);
2408       tty->print_cr("r13 = 0x%016lx", regs[13]);
2409       tty->print_cr("r14 = 0x%016lx", regs[14]);
2410       tty->print_cr("r15 = 0x%016lx", regs[15]);
2411       tty->print_cr("r16 = 0x%016lx", regs[16]);
2412       tty->print_cr("r17 = 0x%016lx", regs[17]);
2413       tty->print_cr("r18 = 0x%016lx", regs[18]);
2414       tty->print_cr("r19 = 0x%016lx", regs[19]);
2415       tty->print_cr("r20 = 0x%016lx", regs[20]);
2416       tty->print_cr("r21 = 0x%016lx", regs[21]);
2417       tty->print_cr("r22 = 0x%016lx", regs[22]);
2418       tty->print_cr("r23 = 0x%016lx", regs[23]);
2419       tty->print_cr("r24 = 0x%016lx", regs[24]);
2420       tty->print_cr("r25 = 0x%016lx", regs[25]);
2421       tty->print_cr("r26 = 0x%016lx", regs[26]);
2422       tty->print_cr("r27 = 0x%016lx", regs[27]);
2423       tty->print_cr("r28 = 0x%016lx", regs[28]);
2424       tty->print_cr("r30 = 0x%016lx", regs[30]);
2425       tty->print_cr("r31 = 0x%016lx", regs[31]);
2426       BREAKPOINT;
2427     }
2428     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2429   } else {
2430     ttyLocker ttyl;
2431     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2432                     msg);
2433     assert(false, err_msg("DEBUG MESSAGE: %s", msg));
2434   }
2435 }
2436 
2437 #ifdef BUILTIN_SIM
2438 // routine to generate an x86 prolog for a stub function which
2439 // bootstraps into the generated ARM code which directly follows the
2440 // stub
2441 //
2442 // the argument encodes the number of general and fp registers
2443 // passed by the caller and the calling convention (currently just
2444 // the number of general registers and assumes C argument passing)
2445 
2446 extern "C" {
2447 int aarch64_stub_prolog_size();
2448 void aarch64_stub_prolog();
2449 void aarch64_prolog();
2450 }
2451 
2452 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
2453                                    address *prolog_ptr)
2454 {
2455   int calltype = (((ret_type & 0x3) << 8) |
2456                   ((fp_arg_count & 0xf) << 4) |
2457                   (gp_arg_count & 0xf));
2458 
2459   // the addresses for the x86 to ARM entry code we need to use
2460   address start = pc();
2461   // printf("start = %lx\n", start);
2462   int byteCount =  aarch64_stub_prolog_size();
2463   // printf("byteCount = %x\n", byteCount);
2464   int instructionCount = (byteCount + 3)/ 4;
2465   // printf("instructionCount = %x\n", instructionCount);
2466   for (int i = 0; i < instructionCount; i++) {
2467     nop();
2468   }
2469 
2470   memcpy(start, (void*)aarch64_stub_prolog, byteCount);
2471 
2472   // write the address of the setup routine and the call format into
2473   // the end of the copied code
2474   u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
2475   if (prolog_ptr)
2476     patch_end[-2] = (u_int64_t)prolog_ptr;
2477   patch_end[-1] = calltype;
2478 }
2479 #endif
2480 
2481 void MacroAssembler::push_call_clobbered_fp_registers() {
2482   // Push v0-v7, v16-v31.
2483   for (int i = 30; i >= 0; i -= 2) {
2484     if (i <= v7->encoding() || i >= v16->encoding()) {
2485       stpd(as_FloatRegister(i), as_FloatRegister(i+1),
2486            Address(pre(sp, -2 * wordSize)));
2487     }
2488   }
2489 }
2490 
2491 void MacroAssembler::pop_call_clobbered_fp_registers() {
2492 
2493   for (int i = 0; i < 32; i += 2) {
2494     if (i <= v7->encoding() || i >= v16->encoding()) {
2495       ldpd(as_FloatRegister(i), as_FloatRegister(i+1),
2496            Address(post(sp, 2 * wordSize)));
2497     }
2498   }
2499 }
2500 
2501 void MacroAssembler::push_call_clobbered_registers() {
2502   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2503 
2504   push_call_clobbered_fp_registers();
2505 }
2506 
2507 void MacroAssembler::pop_call_clobbered_registers() {
2508 
2509   pop_call_clobbered_fp_registers();
2510 
2511   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2512 }
2513 
2514 void MacroAssembler::push_CPU_state(bool save_vectors) {
2515   push(0x3fffffff, sp);         // integer registers except lr & sp
2516 
2517   if (!save_vectors) {
2518     for (int i = 30; i >= 0; i -= 2)
2519       stpd(as_FloatRegister(i), as_FloatRegister(i+1),
2520            Address(pre(sp, -2 * wordSize)));
2521   } else {
2522     for (int i = 30; i >= 0; i -= 2)
2523       stpq(as_FloatRegister(i), as_FloatRegister(i+1),
2524            Address(pre(sp, -4 * wordSize)));
2525   }
2526 }
2527 
2528 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2529   if (!restore_vectors) {
2530     for (int i = 0; i < 32; i += 2)
2531       ldpd(as_FloatRegister(i), as_FloatRegister(i+1),
2532            Address(post(sp, 2 * wordSize)));
2533   } else {
2534     for (int i = 0; i < 32; i += 2)
2535       ldpq(as_FloatRegister(i), as_FloatRegister(i+1),
2536            Address(post(sp, 4 * wordSize)));
2537   }
2538 
2539   pop(0x3fffffff, sp);         // integer registers except lr & sp
2540 }
2541 
2542 /**
2543  * Helpers for multiply_to_len().
2544  */
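// A sketch of the contract of add2_with_carry:
//   final_dest_hi:dest_lo = dest_hi:dest_lo + src1 + src2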
2545 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2546                                      Register src1, Register src2) {
2547   adds(dest_lo, dest_lo, src1);
2548   adc(dest_hi, dest_hi, zr);
2549   adds(dest_lo, dest_lo, src2);
2550   adc(final_dest_hi, dest_hi, zr);
2551 }
2552 
2553 // Generate an address from (r + r1 extend offset).  "size" is the
2554 // size of the operand.  The result may be in rscratch2.
2555 Address MacroAssembler::offsetted_address(Register r, Register r1,
2556                                           Address::extend ext, int offset, int size) {
2557   if (offset || (ext.shift() % size != 0)) {
2558     lea(rscratch2, Address(r, r1, ext));
2559     return Address(rscratch2, offset);
2560   } else {
2561     return Address(r, r1, ext);
2562   }
2563 }
2564 
2565 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2566 {
2567   assert(offset >= 0, "spill to negative address?");
2568   // Offset reachable ?
2569   //   Not aligned - 9 bits signed offset
2570   //   Aligned - 12 bits unsigned offset shifted
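  // e.g. (illustrative) size == 8, offset == 0x21008 is aligned but out
  // of range, so we rebase:  add tmp, sp, #0x21000  and return
  // Address(tmp, 8).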
2571   Register base = sp;
2572   if ((offset & (size-1)) && offset >= (1<<8)) {
2573     add(tmp, base, offset & ((1<<12)-1));
2574     base = tmp;
2575     offset &= -1<<12;
2576   }
2577 
2578   if (offset >= (1<<12) * size) {
2579     add(tmp, base, offset & (((1<<12)-1)<<12));
2580     base = tmp;
2581     offset &= ~(((1<<12)-1)<<12);
2582   }
2583 
2584   return Address(base, offset);
2585 }
2586 
2587 /**
2588  * Multiply 64 bit by 64 bit first loop.
2589  */
2590 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2591                                            Register y, Register y_idx, Register z,
2592                                            Register carry, Register product,
2593                                            Register idx, Register kdx) {
2594   //
2595   //  jlong carry, x[], y[], z[];
2596   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2597   //    huge_128 product = y[idx] * x[xstart] + carry;
2598   //    z[kdx] = (jlong)product;
2599   //    carry  = (jlong)(product >>> 64);
2600   //  }
2601   //  z[xstart] = carry;
2602   //
2603 
2604   Label L_first_loop, L_first_loop_exit;
2605   Label L_one_x, L_one_y, L_multiply;
2606 
2607   subsw(xstart, xstart, 1);
2608   br(Assembler::MI, L_one_x);
2609 
2610   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2611   ldr(x_xstart, Address(rscratch1));
2612   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2613 
2614   bind(L_first_loop);
2615   subsw(idx, idx, 1);
2616   br(Assembler::MI, L_first_loop_exit);
2617   subsw(idx, idx, 1);
2618   br(Assembler::MI, L_one_y);
2619   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2620   ldr(y_idx, Address(rscratch1));
2621   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2622   bind(L_multiply);
2623 
2624   // AArch64 has a multiply-accumulate instruction that we can't use
2625   // here because it has no way to process carries, so we have to use
2626   // separate add and adc instructions.  Bah.
2627   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2628   mul(product, x_xstart, y_idx);
2629   adds(product, product, carry);
2630   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2631 
2632   subw(kdx, kdx, 2);
2633   ror(product, product, 32); // back to big-endian
2634   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2635 
2636   b(L_first_loop);
2637 
2638   bind(L_one_y);
2639   ldrw(y_idx, Address(y,  0));
2640   b(L_multiply);
2641 
2642   bind(L_one_x);
2643   ldrw(x_xstart, Address(x,  0));
2644   b(L_first_loop);
2645 
2646   bind(L_first_loop_exit);
2647 }
2648 
2649 /**
2650  * Multiply 128 bit by 128 bit. Unrolled inner loop.
2651  *
2652  */
2653 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2654                                              Register carry, Register carry2,
2655                                              Register idx, Register jdx,
2656                                              Register yz_idx1, Register yz_idx2,
2657                                              Register tmp, Register tmp3, Register tmp4,
2658                                              Register tmp6, Register product_hi) {
2659 
2660   //   jlong carry, x[], y[], z[];
2661   //   int kdx = ystart+1;
2662   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2663   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2664   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2665   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2666   //     carry  = (jlong)(tmp4 >>> 64);
2667   //     z[kdx+idx+1] = (jlong)tmp3;
2668   //     z[kdx+idx] = (jlong)tmp4;
2669   //   }
2670   //   idx += 2;
2671   //   if (idx > 0) {
2672   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2673   //     z[kdx+idx] = (jlong)yz_idx1;
2674   //     carry  = (jlong)(yz_idx1 >>> 64);
2675   //   }
2676   //
2677 
2678   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2679 
2680   lsrw(jdx, idx, 2);
2681 
2682   bind(L_third_loop);
2683 
2684   subsw(jdx, jdx, 1);
2685   br(Assembler::MI, L_third_loop_exit);
2686   subw(idx, idx, 4);
2687 
2688   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2689 
2690   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2691 
2692   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2693 
2694   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2695   ror(yz_idx2, yz_idx2, 32);
2696 
2697   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2698 
2699   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2700   umulh(tmp4, product_hi, yz_idx1);
2701 
2702   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2703   ror(rscratch2, rscratch2, 32);
2704 
2705   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2706   umulh(carry2, product_hi, yz_idx2);
2707 
2708   // propagate sum of both multiplications into carry:tmp4:tmp3
2709   adds(tmp3, tmp3, carry);
2710   adc(tmp4, tmp4, zr);
2711   adds(tmp3, tmp3, rscratch1);
2712   adcs(tmp4, tmp4, tmp);
2713   adc(carry, carry2, zr);
2714   adds(tmp4, tmp4, rscratch2);
2715   adc(carry, carry, zr);
2716 
2717   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2718   ror(tmp4, tmp4, 32);
2719   stp(tmp4, tmp3, Address(tmp6, 0));
2720 
2721   b(L_third_loop);
2722   bind (L_third_loop_exit);
2723 
2724   andw (idx, idx, 0x3);
2725   cbz(idx, L_post_third_loop_done);
2726 
2727   Label L_check_1;
2728   subsw(idx, idx, 2);
2729   br(Assembler::MI, L_check_1);
2730 
2731   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2732   ldr(yz_idx1, Address(rscratch1, 0));
2733   ror(yz_idx1, yz_idx1, 32);
2734   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2735   umulh(tmp4, product_hi, yz_idx1);
2736   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2737   ldr(yz_idx2, Address(rscratch1, 0));
2738   ror(yz_idx2, yz_idx2, 32);
2739 
2740   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2741 
2742   ror(tmp3, tmp3, 32);
2743   str(tmp3, Address(rscratch1, 0));
2744 
2745   bind (L_check_1);
2746 
2747   andw (idx, idx, 0x1);
2748   subsw(idx, idx, 1);
2749   br(Assembler::MI, L_post_third_loop_done);
2750   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2751   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
2752   umulh(carry2, tmp4, product_hi);
2753   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2754 
2755   add2_with_carry(carry2, carry2, tmp3, tmp4, carry);
2756 
2757   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2758   extr(carry, carry2, tmp3, 32);
2759 
2760   bind(L_post_third_loop_done);
2761 }
2762 
2763 /**
2764  * Code for BigInteger::multiplyToLen() intrinsic.
2765  *
2766  * r0: x
2767  * r1: xlen
2768  * r2: y
2769  * r3: ylen
2770  * r4:  z
2771  * r5: zlen
2772  * r10: tmp1
2773  * r11: tmp2
2774  * r12: tmp3
2775  * r13: tmp4
2776  * r14: tmp5
2777  * r15: tmp6
2778  * r16: tmp7
2779  *
2780  */
2781 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
2782                                      Register z, Register zlen,
2783                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
2784                                      Register tmp5, Register tmp6, Register product_hi) {
2785 
2786   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2787 
2788   const Register idx = tmp1;
2789   const Register kdx = tmp2;
2790   const Register xstart = tmp3;
2791 
2792   const Register y_idx = tmp4;
2793   const Register carry = tmp5;
2794   const Register product  = xlen;
2795   const Register x_xstart = zlen;  // reuse register
2796 
2797   // First Loop.
2798   //
2799   //  final static long LONG_MASK = 0xffffffffL;
2800   //  int xstart = xlen - 1;
2801   //  int ystart = ylen - 1;
2802   //  long carry = 0;
2803   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2804   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
2805   //    z[kdx] = (int)product;
2806   //    carry = product >>> 32;
2807   //  }
2808   //  z[xstart] = (int)carry;
2809   //
2810 
2811   movw(idx, ylen);      // idx = ylen;
2812   movw(kdx, zlen);      // kdx = zlen = xlen+ylen;
2813   mov(carry, zr);       // carry = 0;
2814 
2815   Label L_done;
2816 
2817   movw(xstart, xlen);
2818   subsw(xstart, xstart, 1);
2819   br(Assembler::MI, L_done);
2820 
2821   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
2822 
2823   Label L_second_loop;
2824   cbzw(kdx, L_second_loop);
2825 
2826   Label L_carry;
2827   subw(kdx, kdx, 1);
2828   cbzw(kdx, L_carry);
2829 
2830   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
2831   lsr(carry, carry, 32);
2832   subw(kdx, kdx, 1);
2833 
2834   bind(L_carry);
2835   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
2836 
2837   // Second and third (nested) loops.
2838   //
2839   // for (int i = xstart-1; i >= 0; i--) { // Second loop
2840   //   carry = 0;
2841   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
2842   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
2843   //                    (z[k] & LONG_MASK) + carry;
2844   //     z[k] = (int)product;
2845   //     carry = product >>> 32;
2846   //   }
2847   //   z[i] = (int)carry;
2848   // }
2849   //
2850   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
2851 
2852   const Register jdx = tmp1;
2853 
2854   bind(L_second_loop);
2855   mov(carry, zr);                // carry = 0;
2856   movw(jdx, ylen);               // j = ystart+1
2857 
2858   subsw(xstart, xstart, 1);      // i = xstart-1;
2859   br(Assembler::MI, L_done);
2860 
2861   str(z, Address(pre(sp, -4 * wordSize)));
2862 
2863   Label L_last_x;
2864   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
2865   subsw(xstart, xstart, 1);       // i = xstart-1;
2866   br(Assembler::MI, L_last_x);
2867 
2868   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
2869   ldr(product_hi, Address(rscratch1));
2870   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
2871 
2872   Label L_third_loop_prologue;
2873   bind(L_third_loop_prologue);
2874 
2875   str(ylen, Address(sp, wordSize));
2876   stp(x, xstart, Address(sp, 2 * wordSize));
2877   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
2878                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
2879   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
2880   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
2881 
2882   addw(tmp3, xlen, 1);
2883   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
2884   subsw(tmp3, tmp3, 1);
2885   br(Assembler::MI, L_done);
2886 
2887   lsr(carry, carry, 32);
2888   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
2889   b(L_second_loop);
2890 
2891   // The following infrequently executed code is moved outside the loops.
2892   bind(L_last_x);
2893   ldrw(product_hi, Address(x,  0));
2894   b(L_third_loop_prologue);
2895 
2896   bind(L_done);
2897 }
2898 
2899 /**
2900  * Emits code to update CRC-32 with a byte value according to constants in table
2901  *
2902  * @param [in,out]crc   Register containing the crc.
2903  * @param [in]val       Register containing the byte to fold into the CRC.
2904  * @param [in]table     Register containing the table of crc constants.
2905  *
2906  * uint32_t crc;
2907  * val = crc_table[(val ^ crc) & 0xFF];
2908  * crc = val ^ (crc >> 8);
2909  *
2910  */
2911 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
2912   eor(val, val, crc);
2913   andr(val, val, 0xff);
2914   ldrw(val, Address(table, val, Address::lsl(2)));
2915   eor(crc, val, crc, Assembler::LSR, 8);
2916 }
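
     // For reference: the 256-entry table that 'table' points at is, by
     // convention, generated from the reflected CRC-32 polynomial 0xEDB88320
     // (the zlib polynomial).  A minimal sketch of that generation -- an
     // illustration only, not necessarily how the stub table is built:
     //
     //   uint32_t crc_table[256];
     //   for (int n = 0; n < 256; n++) {
     //     uint32_t c = (uint32_t)n;
     //     for (int k = 0; k < 8; k++)
     //       c = (c & 1) ? 0xEDB88320 ^ (c >> 1) : (c >> 1);
     //     crc_table[n] = c;
     //   }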
2917 
2918 /**
2919  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
2920  *
2921  * @param [in,out]crc   Register containing the crc.
2922  * @param [in]v         Register containing the 32-bit value to fold into the CRC.
2923  * @param [in]table0    Register containing table 0 of crc constants.
2924  * @param [in]table1    Register containing table 1 of crc constants.
2925  * @param [in]table2    Register containing table 2 of crc constants.
2926  * @param [in]table3    Register containing table 3 of crc constants.
2927  *
2928  * uint32_t crc;
2929  *   v = crc ^ v
2930  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
2931  *
2932  */
2933 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
2934         Register table0, Register table1, Register table2, Register table3,
2935         bool upper) {
2936   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
2937   uxtb(tmp, v);
2938   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
2939   ubfx(tmp, v, 8, 8);
2940   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
2941   eor(crc, crc, tmp);
2942   ubfx(tmp, v, 16, 8);
2943   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
2944   eor(crc, crc, tmp);
2945   ubfx(tmp, v, 24, 8);
2946   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
2947   eor(crc, crc, tmp);
2948 }
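
     // The four tables implement "slicing-by-4": table1..table3 pre-fold the
     // effect of pushing a byte one, two or three positions further through
     // the CRC register, so a whole 32-bit word can be folded per step.  A
     // sketch of the usual derivation from table0 (an assumption about the
     // stub table layout, shown for illustration):
     //
     //   for (int i = 0; i < 256; i++) {
     //     table1[i] = (table0[i] >> 8) ^ table0[table0[i] & 0xff];
     //     table2[i] = (table1[i] >> 8) ^ table0[table1[i] & 0xff];
     //     table3[i] = (table2[i] >> 8) ^ table0[table2[i] & 0xff];
     //   }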
2949 
2950 /**
2951  * @param crc   register containing existing CRC (32-bit)
2952  * @param buf   register pointing to input byte buffer (byte*)
2953  * @param len   register containing number of bytes
2954  * @param table0..table3 registers that will hold the addresses of the four CRC tables
2955  * @param tmp, tmp2, tmp3 scratch registers
2956  */
2957 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
2958         Register table0, Register table1, Register table2, Register table3,
2959         Register tmp, Register tmp2, Register tmp3) {
2960   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
2961   unsigned long offset;
2962 
2963     ornw(crc, zr, crc);
2964 
2965   if (UseCRC32) {
2966     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop;
2967 
2968       subs(len, len, 64);
2969       br(Assembler::GE, CRC_by64_loop);
2970       adds(len, len, 64-4);
2971       br(Assembler::GE, CRC_by4_loop);
2972       adds(len, len, 4);
2973       br(Assembler::GT, CRC_by1_loop);
2974       b(L_exit);
2975 
2976     BIND(CRC_by4_loop);
2977       ldrw(tmp, Address(post(buf, 4)));
2978       subs(len, len, 4);
2979       crc32w(crc, crc, tmp);
2980       br(Assembler::GE, CRC_by4_loop);
2981       adds(len, len, 4);
2982       br(Assembler::LE, L_exit);
2983     BIND(CRC_by1_loop);
2984       ldrb(tmp, Address(post(buf, 1)));
2985       subs(len, len, 1);
2986       crc32b(crc, crc, tmp);
2987       br(Assembler::GT, CRC_by1_loop);
2988       b(L_exit);
2989 
2990       align(CodeEntryAlignment);
2991     BIND(CRC_by64_loop);
2992       subs(len, len, 64);
2993       ldp(tmp, tmp3, Address(post(buf, 16)));
2994       crc32x(crc, crc, tmp);
2995       crc32x(crc, crc, tmp3);
2996       ldp(tmp, tmp3, Address(post(buf, 16)));
2997       crc32x(crc, crc, tmp);
2998       crc32x(crc, crc, tmp3);
2999       ldp(tmp, tmp3, Address(post(buf, 16)));
3000       crc32x(crc, crc, tmp);
3001       crc32x(crc, crc, tmp3);
3002       ldp(tmp, tmp3, Address(post(buf, 16)));
3003       crc32x(crc, crc, tmp);
3004       crc32x(crc, crc, tmp3);
3005       br(Assembler::GE, CRC_by64_loop);
3006       adds(len, len, 64-4);
3007       br(Assembler::GE, CRC_by4_loop);
3008       adds(len, len, 4);
3009       br(Assembler::GT, CRC_by1_loop);
3010     BIND(L_exit);
3011       ornw(crc, zr, crc);
3012       return;
3013   }
3014 
3015     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3016     if (offset) add(table0, table0, offset);
3017     add(table1, table0, 1*256*sizeof(juint));
3018     add(table2, table0, 2*256*sizeof(juint));
3019     add(table3, table0, 3*256*sizeof(juint));
3020 
3021   if (UseNeon) {
3022       cmp(len, 64);
3023       br(Assembler::LT, L_by16);
3024       eor(v16, T16B, v16, v16);
3025 
3026     Label L_fold;
3027 
3028       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3029 
3030       ld1(v0, v1, T2D, post(buf, 32));
3031       ld1r(v4, T2D, post(tmp, 8));
3032       ld1r(v5, T2D, post(tmp, 8));
3033       ld1r(v6, T2D, post(tmp, 8));
3034       ld1r(v7, T2D, post(tmp, 8));
3035       mov(v16, T4S, 0, crc);
3036 
3037       eor(v0, T16B, v0, v16);
3038       sub(len, len, 64);
3039 
3040     BIND(L_fold);
3041       pmull(v22, T8H, v0, v5, T8B);
3042       pmull(v20, T8H, v0, v7, T8B);
3043       pmull(v23, T8H, v0, v4, T8B);
3044       pmull(v21, T8H, v0, v6, T8B);
3045 
3046       pmull2(v18, T8H, v0, v5, T16B);
3047       pmull2(v16, T8H, v0, v7, T16B);
3048       pmull2(v19, T8H, v0, v4, T16B);
3049       pmull2(v17, T8H, v0, v6, T16B);
3050 
3051       uzp1(v24, v20, v22, T8H);
3052       uzp2(v25, v20, v22, T8H);
3053       eor(v20, T16B, v24, v25);
3054 
3055       uzp1(v26, v16, v18, T8H);
3056       uzp2(v27, v16, v18, T8H);
3057       eor(v16, T16B, v26, v27);
3058 
3059       ushll2(v22, T4S, v20, T8H, 8);
3060       ushll(v20, T4S, v20, T4H, 8);
3061 
3062       ushll2(v18, T4S, v16, T8H, 8);
3063       ushll(v16, T4S, v16, T4H, 8);
3064 
3065       eor(v22, T16B, v23, v22);
3066       eor(v18, T16B, v19, v18);
3067       eor(v20, T16B, v21, v20);
3068       eor(v16, T16B, v17, v16);
3069 
3070       uzp1(v17, v16, v20, T2D);
3071       uzp2(v21, v16, v20, T2D);
3072       eor(v17, T16B, v17, v21);
3073 
3074       ushll2(v20, T2D, v17, T4S, 16);
3075       ushll(v16, T2D, v17, T2S, 16);
3076 
3077       eor(v20, T16B, v20, v22);
3078       eor(v16, T16B, v16, v18);
3079 
3080       uzp1(v17, v20, v16, T2D);
3081       uzp2(v21, v20, v16, T2D);
3082       eor(v28, T16B, v17, v21);
3083 
3084       pmull(v22, T8H, v1, v5, T8B);
3085       pmull(v20, T8H, v1, v7, T8B);
3086       pmull(v23, T8H, v1, v4, T8B);
3087       pmull(v21, T8H, v1, v6, T8B);
3088 
3089       pmull2(v18, T8H, v1, v5, T16B);
3090       pmull2(v16, T8H, v1, v7, T16B);
3091       pmull2(v19, T8H, v1, v4, T16B);
3092       pmull2(v17, T8H, v1, v6, T16B);
3093 
3094       ld1(v0, v1, T2D, post(buf, 32));
3095 
3096       uzp1(v24, v20, v22, T8H);
3097       uzp2(v25, v20, v22, T8H);
3098       eor(v20, T16B, v24, v25);
3099 
3100       uzp1(v26, v16, v18, T8H);
3101       uzp2(v27, v16, v18, T8H);
3102       eor(v16, T16B, v26, v27);
3103 
3104       ushll2(v22, T4S, v20, T8H, 8);
3105       ushll(v20, T4S, v20, T4H, 8);
3106 
3107       ushll2(v18, T4S, v16, T8H, 8);
3108       ushll(v16, T4S, v16, T4H, 8);
3109 
3110       eor(v22, T16B, v23, v22);
3111       eor(v18, T16B, v19, v18);
3112       eor(v20, T16B, v21, v20);
3113       eor(v16, T16B, v17, v16);
3114 
3115       uzp1(v17, v16, v20, T2D);
3116       uzp2(v21, v16, v20, T2D);
3117       eor(v16, T16B, v17, v21);
3118 
3119       ushll2(v20, T2D, v16, T4S, 16);
3120       ushll(v16, T2D, v16, T2S, 16);
3121 
3122       eor(v20, T16B, v22, v20);
3123       eor(v16, T16B, v16, v18);
3124 
3125       uzp1(v17, v20, v16, T2D);
3126       uzp2(v21, v20, v16, T2D);
3127       eor(v20, T16B, v17, v21);
3128 
3129       shl(v16, T2D, v28, 1);
3130       shl(v17, T2D, v20, 1);
3131 
3132       eor(v0, T16B, v0, v16);
3133       eor(v1, T16B, v1, v17);
3134 
3135       subs(len, len, 32);
3136       br(Assembler::GE, L_fold);
3137 
3138       mov(crc, 0);
3139       mov(tmp, v0, T1D, 0);
3140       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3141       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3142       mov(tmp, v0, T1D, 1);
3143       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3144       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3145       mov(tmp, v1, T1D, 0);
3146       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3147       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3148       mov(tmp, v1, T1D, 1);
3149       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3150       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3151 
3152       add(len, len, 32);
3153   }
3154 
3155   BIND(L_by16);
3156     subs(len, len, 16);
3157     br(Assembler::GE, L_by16_loop);
3158     adds(len, len, 16-4);
3159     br(Assembler::GE, L_by4_loop);
3160     adds(len, len, 4);
3161     br(Assembler::GT, L_by1_loop);
3162     b(L_exit);
3163 
3164   BIND(L_by4_loop);
3165     ldrw(tmp, Address(post(buf, 4)));
3166     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3167     subs(len, len, 4);
3168     br(Assembler::GE, L_by4_loop);
3169     adds(len, len, 4);
3170     br(Assembler::LE, L_exit);
3171   BIND(L_by1_loop);
3172     subs(len, len, 1);
3173     ldrb(tmp, Address(post(buf, 1)));
3174     update_byte_crc32(crc, tmp, table0);
3175     br(Assembler::GT, L_by1_loop);
3176     b(L_exit);
3177 
3178     align(CodeEntryAlignment);
3179   BIND(L_by16_loop);
3180     subs(len, len, 16);
3181     ldp(tmp, tmp3, Address(post(buf, 16)));
3182     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3183     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3184     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3185     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3186     br(Assembler::GE, L_by16_loop);
3187     adds(len, len, 16-4);
3188     br(Assembler::GE, L_by4_loop);
3189     adds(len, len, 4);
3190     br(Assembler::GT, L_by1_loop);
3191   BIND(L_exit);
3192     ornw(crc, zr, crc);
3193 }
3194 
3195 SkipIfEqual::SkipIfEqual(
3196     MacroAssembler* masm, const bool* flag_addr, bool value) {
3197   _masm = masm;
3198   unsigned long offset;
3199   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3200   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3201   _masm->cbzw(rscratch1, _label);
3202 }
3203 
3204 SkipIfEqual::~SkipIfEqual() {
3205   _masm->bind(_label);
3206 }
3207 
3208 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3209   Address adr;
3210   switch(dst.getMode()) {
3211   case Address::base_plus_offset:
3212     // This is the expected mode, although we allow all the other
3213     // forms below.
3214     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3215     break;
3216   default:
3217     lea(rscratch2, dst);
3218     adr = Address(rscratch2);
3219     break;
3220   }
3221   ldr(rscratch1, adr);
3222   add(rscratch1, rscratch1, src);
3223   str(rscratch1, adr);
3224 }
3225 
3226 void MacroAssembler::cmpptr(Register src1, Address src2) {
3227   unsigned long offset;
3228   adrp(rscratch1, src2, offset);
3229   ldr(rscratch1, Address(rscratch1, offset));
3230   cmp(src1, rscratch1);
3231 }
3232 
3233 void MacroAssembler::store_check(Register obj) {
3234   // Does a store check for the oop in register obj. The content of
3235   // register obj is destroyed afterwards.
3236   store_check_part_1(obj);
3237   store_check_part_2(obj);
3238 }
3239 
3240 void MacroAssembler::cmpoops(Register src1, Register src2) {
3241   cmp(src1, src2);
3242   oopDesc::bs()->asm_acmp_barrier(this, src1, src2);
3243 }
3244 
3245 void MacroAssembler::store_check(Register obj, Address dst) {
3246   store_check(obj);
3247 }
3248 
3249 
3250 // Split the store check operation so that other instructions can be scheduled in between.
3251 void MacroAssembler::store_check_part_1(Register obj) {
3252   BarrierSet* bs = Universe::heap()->barrier_set();
3253   assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
3254   lsr(obj, obj, CardTableModRefBS::card_shift);
3255 }
3256 
3257 void MacroAssembler::store_check_part_2(Register obj) {
3258   BarrierSet* bs = Universe::heap()->barrier_set();
3259   assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
3260   CardTableModRefBS* ct = (CardTableModRefBS*)bs;
3261   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
3262 
3263   // The calculation for byte_map_base is as follows:
3264   // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
3265   // So this essentially converts an address to a displacement and
3266   // it will never need to be relocated.
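     //
     // In C terms the whole store check is simply (illustrative sketch):
     //
     //   byte_map_base[(uintptr_t)obj >> card_shift] = dirty_card_val(); // 0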
3267 
3268   // FIXME: It's not likely that disp will fit into an offset so we
3269   // don't bother to check, but it could save an instruction.
3270   intptr_t disp = (intptr_t) ct->byte_map_base;
3271   load_byte_map_base(rscratch1);
3272   strb(zr, Address(obj, rscratch1));
3273 }
3274 
3275 void MacroAssembler::load_klass(Register dst, Register src) {
3276   if (UseCompressedClassPointers) {
3277     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3278     decode_klass_not_null(dst);
3279   } else {
3280     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3281   }
3282 }
3283 
3284 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3285   if (UseCompressedClassPointers) {
3286     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3287     if (Universe::narrow_klass_base() == NULL) {
3288       cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
3289       return;
3290     } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3291                && Universe::narrow_klass_shift() == 0) {
3292       // Only the bottom 32 bits matter
3293       cmpw(trial_klass, tmp);
3294       return;
3295     }
3296     decode_klass_not_null(tmp);
3297   } else {
3298     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3299   }
3300   cmp(trial_klass, tmp);
3301 }
3302 
3303 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3304   load_klass(dst, src);
3305   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3306 }
3307 
3308 void MacroAssembler::store_klass(Register dst, Register src) {
3309   // FIXME: Should this be a store release?  Concurrent GCs assume the
3310   // klass length is valid if the klass field is not null.
3311   if (UseCompressedClassPointers) {
3312     encode_klass_not_null(src);
3313     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3314   } else {
3315     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3316   }
3317 }
3318 
3319 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3320   if (UseCompressedClassPointers) {
3321     // Store to klass gap in destination
3322     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3323   }
3324 }
3325 
3326 // Algorithm must match oop.inline.hpp encode_heap_oop.
3327 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3328 #ifdef ASSERT
3329   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3330 #endif
3331   verify_oop(s, "broken oop in encode_heap_oop");
3332   if (Universe::narrow_oop_base() == NULL) {
3333     if (Universe::narrow_oop_shift() != 0) {
3334       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3335       lsr(d, s, LogMinObjAlignmentInBytes);
3336     } else {
3337       mov(d, s);
3338     }
3339   } else {
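         //   d = (s == NULL) ? NULL : (s - base) >> shift
         // The subs/csel pair avoids a branch: NULL sits below the heap base,
         // so the subtraction borrows (leaving the C flag clear) and csel
         // picks zr instead of the difference.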
3340     subs(d, s, rheapbase);
3341     csel(d, d, zr, Assembler::HS);
3342     lsr(d, d, LogMinObjAlignmentInBytes);
3343 
3344     /*  Old algorithm: is this any worse?
3345     Label nonnull;
3346     cbnz(r, nonnull);
3347     sub(r, r, rheapbase);
3348     bind(nonnull);
3349     lsr(r, r, LogMinObjAlignmentInBytes);
3350     */
3351   }
3352 }
3353 
3354 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3355 #ifdef ASSERT
3356   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3357   if (CheckCompressedOops) {
3358     Label ok;
3359     cbnz(r, ok);
3360     stop("null oop passed to encode_heap_oop_not_null");
3361     bind(ok);
3362   }
3363 #endif
3364   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3365   if (Universe::narrow_oop_base() != NULL) {
3366     sub(r, r, rheapbase);
3367   }
3368   if (Universe::narrow_oop_shift() != 0) {
3369     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3370     lsr(r, r, LogMinObjAlignmentInBytes);
3371   }
3372 }
3373 
3374 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3375 #ifdef ASSERT
3376   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3377   if (CheckCompressedOops) {
3378     Label ok;
3379     cbnz(src, ok);
3380     stop("null oop passed to encode_heap_oop_not_null2");
3381     bind(ok);
3382   }
3383 #endif
3384   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3385 
3386   Register data = src;
3387   if (Universe::narrow_oop_base() != NULL) {
3388     sub(dst, src, rheapbase);
3389     data = dst;
3390   }
3391   if (Universe::narrow_oop_shift() != 0) {
3392     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3393     lsr(dst, data, LogMinObjAlignmentInBytes);
3394     data = dst;
3395   }
3396   if (data == src)
3397     mov(dst, src);
3398 }
3399 
3400 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3401 #ifdef ASSERT
3402   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3403 #endif
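     // Reference algorithm, for illustration:
     //   d = (s == 0) ? NULL : (oop)(base + ((uint64_t)s << shift));
     // with a NULL base the add collapses to a plain shift (or a move).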
3404   if (Universe::narrow_oop_base() == NULL) {
3405     if (Universe::narrow_oop_shift() != 0 || d != s) {
3406       lsl(d, s, Universe::narrow_oop_shift());
3407     }
3408   } else {
3409     Label done;
3410     if (d != s)
3411       mov(d, s);
3412     cbz(s, done);
3413     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3414     bind(done);
3415   }
3416   verify_oop(d, "broken oop in decode_heap_oop");
3417 }
3418 
3419 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3420   assert (UseCompressedOops, "should only be used for compressed headers");
3421   assert (Universe::heap() != NULL, "java heap should be initialized");
3422   // Cannot assert, unverified entry point counts instructions (see .ad file)
3423   // vtableStubs also counts instructions in pd_code_size_limit.
3424   // Also do not verify_oop as this is called by verify_oop.
3425   if (Universe::narrow_oop_shift() != 0) {
3426     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3427     if (Universe::narrow_oop_base() != NULL) {
3428       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3429     } else {
3430       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3431     }
3432   } else {
3433     assert (Universe::narrow_oop_base() == NULL, "sanity");
3434   }
3435 }
3436 
3437 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3438   assert (UseCompressedOops, "should only be used for compressed headers");
3439   assert (Universe::heap() != NULL, "java heap should be initialized");
3440   // Cannot assert, unverified entry point counts instructions (see .ad file)
3441   // vtableStubs also counts instructions in pd_code_size_limit.
3442   // Also do not verify_oop as this is called by verify_oop.
3443   if (Universe::narrow_oop_shift() != 0) {
3444     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3445     if (Universe::narrow_oop_base() != NULL) {
3446       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3447     } else {
3448       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3449     }
3450   } else {
3451     assert (Universe::narrow_oop_base() == NULL, "sanity");
3452     if (dst != src) {
3453       mov(dst, src);
3454     }
3455   }
3456 }
3457 
3458 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3459   if (Universe::narrow_klass_base() == NULL) {
3460     if (Universe::narrow_klass_shift() != 0) {
3461       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3462       lsr(dst, src, LogKlassAlignmentInBytes);
3463     } else {
3464       if (dst != src) mov(dst, src);
3465     }
3466     return;
3467   }
3468 
3469   if (use_XOR_for_compressed_class_base) {
3470     if (Universe::narrow_klass_shift() != 0) {
3471       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3472       lsr(dst, dst, LogKlassAlignmentInBytes);
3473     } else {
3474       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3475     }
3476     return;
3477   }
3478 
3479   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3480       && Universe::narrow_klass_shift() == 0) {
3481     movw(dst, src);
3482     return;
3483   }
3484 
3485 #ifdef ASSERT
3486   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3487 #endif
3488 
3489   Register rbase = dst;
3490   if (dst == src) rbase = rheapbase;
3491   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3492   sub(dst, src, rbase);
3493   if (Universe::narrow_klass_shift() != 0) {
3494     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3495     lsr(dst, dst, LogKlassAlignmentInBytes);
3496   }
3497   if (dst == src) reinit_heapbase();
3498 }
3499 
3500 void MacroAssembler::encode_klass_not_null(Register r) {
3501   encode_klass_not_null(r, r);
3502 }
3503 
3504 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3505   Register rbase = dst;
3506   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3507 
3508   if (Universe::narrow_klass_base() == NULL) {
3509     if (Universe::narrow_klass_shift() != 0) {
3510       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3511       lsl(dst, src, LogKlassAlignmentInBytes);
3512     } else {
3513       if (dst != src) mov(dst, src);
3514     }
3515     return;
3516   }
3517 
3518   if (use_XOR_for_compressed_class_base) {
3519     if (Universe::narrow_klass_shift() != 0) {
3520       lsl(dst, src, LogKlassAlignmentInBytes);
3521       eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
3522     } else {
3523       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3524     }
3525     return;
3526   }
3527 
3528   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3529       && Universe::narrow_klass_shift() == 0) {
3530     if (dst != src)
3531       movw(dst, src);
3532     movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
3533     return;
3534   }
3535 
3536   // Cannot assert, unverified entry point counts instructions (see .ad file)
3537   // vtableStubs also counts instructions in pd_code_size_limit.
3538   // Also do not verify_oop as this is called by verify_oop.
3539   if (dst == src) rbase = rheapbase;
3540   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3541   if (Universe::narrow_klass_shift() != 0) {
3542     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3543     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
3544   } else {
3545     add(dst, rbase, src);
3546   }
3547   if (dst == src) reinit_heapbase();
3548 }
3549 
3550 void  MacroAssembler::decode_klass_not_null(Register r) {
3551   decode_klass_not_null(r, r);
3552 }
3553 
3554 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3555   assert (UseCompressedOops, "should only be used for compressed oops");
3556   assert (Universe::heap() != NULL, "java heap should be initialized");
3557   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3558 
3559   int oop_index = oop_recorder()->find_index(obj);
3560   assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3561 
3562   InstructionMark im(this);
3563   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3564   code_section()->relocate(inst_mark(), rspec);
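       // Materialize a patchable 32-bit immediate as a movz/movk pair; the
       // 0xDEAD/0xBEEF halves are placeholders that the relocation above
       // rewrites with the real narrow oop bits.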
3565   movz(dst, 0xDEAD, 16);
3566   movk(dst, 0xBEEF);
3567 }
3568 
3569 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3570   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3571   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3572   int index = oop_recorder()->find_index(k);
3573   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
3574 
3575   InstructionMark im(this);
3576   RelocationHolder rspec = metadata_Relocation::spec(index);
3577   code_section()->relocate(inst_mark(), rspec);
3578   narrowKlass nk = Klass::encode_klass(k);
3579   movz(dst, (nk >> 16), 16);
3580   movk(dst, nk & 0xffff);
3581 }
3582 
3583 void MacroAssembler::load_heap_oop(Register dst, Address src)
3584 {
3585   if (UseCompressedOops) {
3586     ldrw(dst, src);
3587     decode_heap_oop(dst);
3588   } else {
3589     ldr(dst, src);
3590   }
3591 }
3592 
3593 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src)
3594 {
3595   if (UseCompressedOops) {
3596     ldrw(dst, src);
3597     decode_heap_oop_not_null(dst);
3598   } else {
3599     ldr(dst, src);
3600   }
3601 }
3602 
3603 void MacroAssembler::store_heap_oop(Address dst, Register src) {
3604   if (UseCompressedOops) {
3605     assert(!dst.uses(src), "not enough registers");
3606     encode_heap_oop(src);
3607     strw(src, dst);
3608   } else
3609     str(src, dst);
3610 }
3611 
3612 // Used for storing NULLs.
3613 void MacroAssembler::store_heap_oop_null(Address dst) {
3614   if (UseCompressedOops) {
3615     strw(zr, dst);
3616   } else
3617     str(zr, dst);
3618 }
3619 
3620 #if INCLUDE_ALL_GCS
3621 void MacroAssembler::g1_write_barrier_pre(Register obj,
3622                                           Register pre_val,
3623                                           Register thread,
3624                                           Register tmp,
3625                                           bool tosca_live,
3626                                           bool expand_call) {
3627   // If expand_call is true then we expand the call_VM_leaf macro
3628   // directly to skip generating the check by
3629   // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
3630 
3631 #ifdef _LP64
3632   assert(thread == rthread, "must be");
3633 #endif // _LP64
3634 
3635   Label done;
3636   Label runtime;
3637 
3638   assert(pre_val != noreg, "check this code");
3639 
3640   if (obj != noreg)
3641     assert_different_registers(obj, pre_val, tmp);
3642 
3643   Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
3644                                        PtrQueue::byte_offset_of_active()));
3645   Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
3646                                        PtrQueue::byte_offset_of_index()));
3647   Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
3648                                        PtrQueue::byte_offset_of_buf()));
3649 
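     // The pre-barrier in pseudocode (the standard G1 SATB queue protocol,
     // sketched for orientation; types elided):
     //
     //   if (!marking_active) return;
     //   if (obj != noreg) pre_val = *obj;          // previous value
     //   if (pre_val == NULL) return;
     //   if (index == 0) runtime(pre_val, thread);  // queue is full
     //   else { index -= wordSize; *(buffer + index) = pre_val; }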
3650 
3651   // Is marking active?
3652   if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
3653     ldrw(tmp, in_progress);
3654   } else {
3655     assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
3656     ldrb(tmp, in_progress);
3657   }
3658   cbzw(tmp, done);
3659 
3660   // Do we need to load the previous value?
3661   if (obj != noreg) {
3662     load_heap_oop(pre_val, Address(obj, 0));
3663   }
3664 
3665   // Is the previous value null?
3666   cbz(pre_val, done);
3667 
3668   // Can we store original value in the thread's buffer?
3669   // Is index == 0?
3670   // (The index field is typed as size_t.)
3671 
3672   ldr(tmp, index);                      // tmp := *index_adr
3673   cbz(tmp, runtime);                    // tmp == 0?
3674                                         // If yes, goto runtime
3675 
3676   sub(tmp, tmp, wordSize);              // tmp := tmp - wordSize
3677   str(tmp, index);                      // *index_adr := tmp
3678   ldr(rscratch1, buffer);
3679   add(tmp, tmp, rscratch1);             // tmp := tmp + *buffer_adr
3680 
3681   // Record the previous value
3682   str(pre_val, Address(tmp, 0));
3683   b(done);
3684 
3685   bind(runtime);
3686   // save the live input values
3687   push(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
3688 
3689   // Calling the runtime using the regular call_VM_leaf mechanism generates
3690   // code (generated by InterpreterMacroAssembler::call_VM_leaf_base)
3691   // that checks that the *(rfp+frame::interpreter_frame_last_sp) == NULL.
3692   //
3693   // If we are generating the pre-barrier without a frame (e.g. in the
3694   // intrinsified Reference.get() routine) then rfp might be pointing to
3695   // the caller frame and so this check will most likely fail at runtime.
3696   //
3697   // Expanding the call directly bypasses the generation of the check.
3698   // So when we do not have a full interpreter frame on the stack
3699   // expand_call should be passed true.
3700 
3701   if (expand_call) {
3702     LP64_ONLY( assert(pre_val != c_rarg1, "smashed arg"); )
3703     pass_arg1(this, thread);
3704     pass_arg0(this, pre_val);
3705     MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
3706   } else {
3707     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
3708   }
3709 
3710   pop(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
3711 
3712   bind(done);
3713 }
3714 
3715 void MacroAssembler::g1_write_barrier_post(Register store_addr,
3716                                            Register new_val,
3717                                            Register thread,
3718                                            Register tmp,
3719                                            Register tmp2) {
3720 #ifdef _LP64
3721   assert(thread == rthread, "must be");
3722 #endif // _LP64
3723 
3724   if (UseShenandoahGC) {
3725     // No need for this in Shenandoah.
3726     return;
3727   }
3728 
3729   assert(UseG1GC, "expect G1 GC");
3730 
3731   Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
3732                                        PtrQueue::byte_offset_of_index()));
3733   Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
3734                                        PtrQueue::byte_offset_of_buf()));
3735 
3736   BarrierSet* bs = Universe::heap()->barrier_set();
3737   CardTableModRefBS* ct = (CardTableModRefBS*)bs;
3738   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
3739 
3740   Label done;
3741   Label runtime;
3742 
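     // The post-barrier filter chain in pseudocode (sketched for orientation):
     //
     //   if (region_of(store_addr) == region_of(new_val)) return;
     //   if (new_val == NULL) return;
     //   card = byte_map_base + (store_addr >> card_shift);
     //   if (*card == g1_young_card_val()) return;
     //   StoreLoad;
     //   if (*card == dirty_card_val()) return;     // dirty_card_val() == 0
     //   *card = dirty_card_val();
     //   enqueue(card);                             // runtime call if queue full
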
3743   // Does store cross heap regions?
3744 
3745   eor(tmp, store_addr, new_val);
3746   lsr(tmp, tmp, HeapRegion::LogOfHRGrainBytes);
3747   cbz(tmp, done);
3748 
3749   // crosses regions, storing NULL?
3750 
3751   cbz(new_val, done);
3752 
3753   // storing region crossing non-NULL, is card already dirty?
3754 
3755   ExternalAddress cardtable((address) ct->byte_map_base);
3756   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
3757   const Register card_addr = tmp;
3758 
3759   lsr(card_addr, store_addr, CardTableModRefBS::card_shift);
3760 
3761   // get the address of the card
3762   load_byte_map_base(tmp2);
3763   add(card_addr, card_addr, tmp2);
3764   ldrb(tmp2, Address(card_addr));
3765   cmpw(tmp2, (int)G1SATBCardTableModRefBS::g1_young_card_val());
3766   br(Assembler::EQ, done);
3767 
3768   assert((int)CardTableModRefBS::dirty_card_val() == 0, "must be 0");
3769 
3770   membar(Assembler::StoreLoad);
3771 
3772   ldrb(tmp2, Address(card_addr));
3773   cbzw(tmp2, done);
3774 
3775   // storing a region crossing, non-NULL oop, card is clean.
3776   // dirty card and log.
3777 
3778   strb(zr, Address(card_addr));
3779 
3780   ldr(rscratch1, queue_index);
3781   cbz(rscratch1, runtime);
3782   sub(rscratch1, rscratch1, wordSize);
3783   str(rscratch1, queue_index);
3784 
3785   ldr(tmp2, buffer);
3786   str(card_addr, Address(tmp2, rscratch1));
3787   b(done);
3788 
3789   bind(runtime);
3790   // save the live input values
3791   push(store_addr->bit(true) | new_val->bit(true), sp);
3792   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
3793   pop(store_addr->bit(true) | new_val->bit(true), sp);
3794 
3795   bind(done);
3796 }
3797 
3798 void MacroAssembler::shenandoah_write_barrier(Register dst) {
3799   assert(UseShenandoahGC && ShenandoahWriteBarrier, "Should be enabled");
3800   assert(dst != rscratch1, "need rscratch1");
3801   assert(dst != rscratch2, "need rscratch2");
3802 
       block_comment("Shenandoah write barrier {");
3803   Label done;
3804 
3805   Address gc_state(rthread, in_bytes(JavaThread::gc_state_offset()));
3806   ldrb(rscratch1, gc_state);
3807 
3808   // Check for heap stability
3809   mov(rscratch2, ShenandoahHeap::HAS_FORWARDED | ShenandoahHeap::EVACUATION);
3810   tst(rscratch1, rscratch2);
3811   br(Assembler::EQ, done);
3812 
3813   // Heap is unstable, need to perform the read-barrier even if WB is inactive
3814   ldr(dst, Address(dst, ShenandoahBrooksPointer::byte_offset()));
3815 
3816   // Check for evacuation-in-progress and jump to WB slow-path if needed
3817   mov(rscratch2, ShenandoahHeap::EVACUATION);
3818   tst(rscratch1, rscratch2);
3819   br(Assembler::EQ, done);
3820 
3821   RegSet to_save = RegSet::of(r0);
3822   if (dst != r0) {
3823     push(to_save, sp);
3824     mov(r0, dst);
3825   }
3826 
3827   assert(StubRoutines::aarch64::shenandoah_wb() != NULL, "need write barrier stub");
3828   far_call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::aarch64::shenandoah_wb())));
3829 
3830   if (dst != r0) {
3831     mov(dst, r0);
3832     pop(to_save, sp);
3833   }
3834   block_comment("} Shenandoah write barrier");
3835 
3836   bind(done);
3837 }
3838 
3839 #endif // INCLUDE_ALL_GCS
3840 
3841 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
3842   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
3843   int index = oop_recorder()->allocate_metadata_index(obj);
3844   RelocationHolder rspec = metadata_Relocation::spec(index);
3845   return Address((address)obj, rspec);
3846 }
3847 
3848 // Move an oop into a register.  immediate is true if we want
3849 // immediate instructions, i.e. we are not going to patch this
3850 // instruction while the code is being executed by another thread.  In
3851 // that case we can use move immediates rather than the constant pool.
3852 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
3853   int oop_index;
3854   if (obj == NULL) {
3855     oop_index = oop_recorder()->allocate_oop_index(obj);
3856   } else {
3857     oop_index = oop_recorder()->find_index(obj);
3858     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3859   }
3860   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3861   if (! immediate) {
3862     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
3863     ldr_constant(dst, Address(dummy, rspec));
3864   } else
3865     mov(dst, Address((address)obj, rspec));
3866 }
3867 
3868 // Move a metadata address into a register.
3869 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
3870   int oop_index;
3871   if (obj == NULL) {
3872     oop_index = oop_recorder()->allocate_metadata_index(obj);
3873   } else {
3874     oop_index = oop_recorder()->find_index(obj);
3875   }
3876   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
3877   mov(dst, Address((address)obj, rspec));
3878 }
3879 
3880 Address MacroAssembler::constant_oop_address(jobject obj) {
3881   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
3882   assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
3883   int oop_index = oop_recorder()->find_index(obj);
3884   return Address((address)obj, oop_Relocation::spec(oop_index));
3885 }
3886 
3887 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3888 void MacroAssembler::tlab_allocate(Register obj,
3889                                    Register var_size_in_bytes,
3890                                    int con_size_in_bytes,
3891                                    Register t1,
3892                                    Register t2,
3893                                    Label& slow_case) {
3894   assert_different_registers(obj, t2);
3895   assert_different_registers(obj, var_size_in_bytes);
3896   Register end = t2;
3897 
3898   // verify_tlab();
3899 
3900   int oop_extra_words = Universe::heap()->oop_extra_words();
3901 
3902   ldr(obj, Address(rthread, JavaThread::tlab_top_offset()));
3903   if (var_size_in_bytes == noreg) {
3904     lea(end, Address(obj, con_size_in_bytes + oop_extra_words * HeapWordSize));
3905   } else {
3906     if (oop_extra_words > 0) {
3907       add(var_size_in_bytes, var_size_in_bytes, oop_extra_words * HeapWordSize);
3908     }
3909     lea(end, Address(obj, var_size_in_bytes));
3910   }
3911   ldr(rscratch1, Address(rthread, JavaThread::tlab_end_offset()));
3912   cmp(end, rscratch1);
3913   br(Assembler::HI, slow_case);
3914 
3915   // update the tlab top pointer
3916   str(end, Address(rthread, JavaThread::tlab_top_offset()));
3917 
3918   Universe::heap()->compile_prepare_oop(this, obj);
3919 
3920   // recover var_size_in_bytes if necessary
3921   if (var_size_in_bytes == end) {
3922     sub(var_size_in_bytes, var_size_in_bytes, obj);
3923   }
3924   // verify_tlab();
3925 }
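
     // The allocation above is a simple bump-the-pointer scheme; in pseudocode:
     //
     //   obj = thread->tlab_top;
     //   end = obj + size;                  // plus any GC-mandated extra words
     //   if (end > thread->tlab_end) goto slow_case;
     //   thread->tlab_top = end;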
3926 
3927 // Preserves r19 and r3.
3928 Register MacroAssembler::tlab_refill(Label& retry,
3929                                      Label& try_eden,
3930                                      Label& slow_case) {
3931   Register top = r0;
3932   Register t1  = r2;
3933   Register t2  = r4;
3934   assert_different_registers(top, rthread, t1, t2, /* preserve: */ r19, r3);
3935   Label do_refill, discard_tlab;
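
     // Refill policy, in pseudocode (a sketch of the code below):
     //
     //   free = tlab_end - tlab_top;
     //   if (free > refill_waste_limit) {    // too much left to throw away:
     //     refill_waste_limit += increment;  //  keep the TLAB and allocate
     //     goto try_eden;                    //  this one object in shared eden
     //   }
     //   // otherwise plug [top, end + alignment_reserve) with an int[] filler
     //   // so the heap stays parseable, then refill the TLAB from eden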
3936 
3937   if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
3938     // No allocation in the shared eden.
3939     b(slow_case);
3940   }
3941 
3942   ldr(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
3943   ldr(t1,  Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
3944 
3945   // calculate amount of free space
3946   sub(t1, t1, top);
3947   lsr(t1, t1, LogHeapWordSize);
3948 
3949   // Retain tlab and allocate object in shared space if
3950   // the amount free in the tlab is too large to discard.
3951 
3952   ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
3953   cmp(t1, rscratch1);
3954   br(Assembler::LE, discard_tlab);
3955 
3956   // Retain
3957   // ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
3958   mov(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
3959   add(rscratch1, rscratch1, t2);
3960   str(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
3961 
3962   if (TLABStats) {
3963     // increment number of slow_allocations
3964     addmw(Address(rthread, in_bytes(JavaThread::tlab_slow_allocations_offset())),
3965          1, rscratch1);
3966   }
3967   b(try_eden);
3968 
3969   bind(discard_tlab);
3970   if (TLABStats) {
3971     // increment number of refills
3972     addmw(Address(rthread, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1,
3973          rscratch1);
3974     // accumulate wastage -- t1 is amount free in tlab
3975     addmw(Address(rthread, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1,
3976          rscratch1);
3977   }
3978 
3979   // if tlab is currently allocated (top or end != null) then
3980   // fill [top, end + alignment_reserve) with array object
3981   cbz(top, do_refill);
3982 
3983   // set up the mark word
3984   mov(rscratch1, (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
3985   str(rscratch1, Address(top, oopDesc::mark_offset_in_bytes()));
3986   // set the length to the remaining space
3987   sub(t1, t1, typeArrayOopDesc::header_size(T_INT));
3988   add(t1, t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
3989   lsl(t1, t1, log2_intptr(HeapWordSize/sizeof(jint)));
3990   strw(t1, Address(top, arrayOopDesc::length_offset_in_bytes()));
3991   // set klass to intArrayKlass
3992   {
3993     unsigned long offset;
3994     // Dubious reloc: why not an oop reloc?
3995     adrp(rscratch1, ExternalAddress((address)Universe::intArrayKlassObj_addr()),
3996          offset);
3997     ldr(t1, Address(rscratch1, offset));
3998   }
3999   // Store klass last.  Concurrent GCs assume the klass length is valid
4000   // if the klass field is not null.
4001   store_klass(top, t1);
4002 
4003   mov(t1, top);
4004   ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4005   sub(t1, t1, rscratch1);
4006   incr_allocated_bytes(rthread, t1, 0, rscratch1);
4007 
4008   // refill the tlab with an eden allocation
4009   bind(do_refill);
4010   ldr(t1, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
4011   lsl(t1, t1, LogHeapWordSize);
4012   // allocate new tlab, address returned in top
4013   eden_allocate(top, t1, 0, t2, slow_case);
4014 
4015   // Check that t1 was preserved in eden_allocate.
4016 #ifdef ASSERT
4017   if (UseTLAB) {
4018     Label ok;
4019     Register tsize = r4;
4020     assert_different_registers(tsize, rthread, t1);
4021     str(tsize, Address(pre(sp, -16)));
4022     ldr(tsize, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
4023     lsl(tsize, tsize, LogHeapWordSize);
4024     cmp(t1, tsize);
4025     br(Assembler::EQ, ok);
4026     STOP("assert(t1 != tlab size)");
4027     should_not_reach_here();
4028 
4029     bind(ok);
4030     ldr(tsize, Address(post(sp, 16)));
4031   }
4032 #endif
4033   str(top, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4034   str(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4035   add(top, top, t1);
4036   sub(top, top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
4037   str(top, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4038   verify_tlab();
4039   b(retry);
4040 
4041   return rthread; // for use by caller
4042 }
4043 
4044 // Defines obj, preserves var_size_in_bytes
4045 void MacroAssembler::eden_allocate(Register obj,
4046                                    Register var_size_in_bytes,
4047                                    int con_size_in_bytes,
4048                                    Register t1,
4049                                    Label& slow_case) {
4050   assert_different_registers(obj, var_size_in_bytes, t1);
4051   if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
4052     b(slow_case);
4053   } else {
4054     Register end = t1;
4055     Register heap_end = rscratch2;
4056     Label retry;
4057     bind(retry);
4058     {
4059       unsigned long offset;
4060       adrp(rscratch1, ExternalAddress((address) Universe::heap()->end_addr()), offset);
4061       ldr(heap_end, Address(rscratch1, offset));
4062     }
4063 
4064     ExternalAddress heap_top((address) Universe::heap()->top_addr());
4065 
4066     // Get the current top of the heap
4067     {
4068       unsigned long offset;
4069       adrp(rscratch1, heap_top, offset);
4070       // Use add() here after ADRP, rather than lea().
4071       // lea() does not generate anything if its offset is zero.
4072       // However, relocs expect to find either an ADD or a load/store
4073       // insn after an ADRP.  add() always generates an ADD insn, even
4074       // for add(Rn, Rn, 0).
4075       add(rscratch1, rscratch1, offset);
4076       ldaxr(obj, rscratch1);
4077     }
4078 
4079     // Adjust it by the size of our new object.
4080     if (var_size_in_bytes == noreg) {
4081       lea(end, Address(obj, con_size_in_bytes));
4082     } else {
4083       lea(end, Address(obj, var_size_in_bytes));
4084     }
4085 
4086     // if end < obj then we wrapped around high memory
4087     cmp(end, obj);
4088     br(Assembler::LO, slow_case);
4089 
4090     cmp(end, heap_end);
4091     br(Assembler::HI, slow_case);
4092 
4093     // If heap_top hasn't been changed by some other thread, update it.
4094     stlxr(rscratch2, end, rscratch1);
4095     cbnzw(rscratch2, retry);
4096   }
4097 }
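
     // The retry loop above is a classic load-exclusive/store-exclusive CAS;
     // in pseudocode:
     //
     //   do {
     //     obj = load_exclusive(heap_top);                   // ldaxr
     //     end = obj + size;
     //     if (end < obj || end > heap_end) goto slow_case;  // wrap or heap full
     //   } while (store_exclusive(heap_top, end) != 0);      // stlxr: non-zero
     //                                                       // means we lost a race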
4098 
4099 void MacroAssembler::verify_tlab() {
4100 #ifdef ASSERT
4101   if (UseTLAB && VerifyOops) {
4102     Label next, ok;
4103 
4104     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4105 
4106     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4107     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4108     cmp(rscratch2, rscratch1);
4109     br(Assembler::HS, next);
4110     STOP("assert(top >= start)");
4111     should_not_reach_here();
4112 
4113     bind(next);
4114     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4115     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4116     cmp(rscratch2, rscratch1);
4117     br(Assembler::HS, ok);
4118     STOP("assert(top <= end)");
4119     should_not_reach_here();
4120 
4121     bind(ok);
4122     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4123   }
4124 #endif
4125 }
4126 
4127 // Writes to successive stack pages, until the given offset is reached, to
4128 // check for stack overflow + shadow pages.  This clobbers tmp.
4129 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4130   assert_different_registers(tmp, size, rscratch1);
4131   mov(tmp, sp);
4132   // Bang stack for total size given plus shadow page size.
4133   // Bang one page at a time because large size can bang beyond yellow and
4134   // red zones.
4135   Label loop;
4136   mov(rscratch1, os::vm_page_size());
4137   bind(loop);
4138   lea(tmp, Address(tmp, -os::vm_page_size()));
4139   subsw(size, size, rscratch1);
4140   str(size, Address(tmp));
4141   br(Assembler::GT, loop);
4142 
4143   // Bang down shadow pages too.
4144   // The -1 because we already subtracted 1 page.
4145   for (int i = 0; i< StackShadowPages-1; i++) {
4146     // This could be any sized move, but since it can serve as a
4147     // debugging crumb the bigger the better.
4148     lea(tmp, Address(tmp, -os::vm_page_size()));
4149     str(size, Address(tmp));
4150   }
4151 }
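
     // In pseudocode the bang touches one word per page, walking downwards:
     //
     //   p = sp;
     //   do { p -= page_size; size -= page_size; *p = size; } while (size > 0);
     //   for (i = 0; i < StackShadowPages - 1; i++) { p -= page_size; *p = size; }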
4152 
4153 
4154 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4155   unsigned long off;
4156   adrp(r, Address(page, rtype), off);
4157   InstructionMark im(this);
4158   code_section()->relocate(inst_mark(), rtype);
4159   ldrw(zr, Address(r, off));
4160   return inst_mark();
4161 }
4162 
4163 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4164   InstructionMark im(this);
4165   code_section()->relocate(inst_mark(), rtype);
4166   ldrw(zr, Address(r, 0));
4167   return inst_mark();
4168 }
4169 
4170 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4171   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4172   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4173   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4174   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4175   long offset_low = dest_page - low_page;
4176   long offset_high = dest_page - high_page;
4177 
4178   assert(is_valid_AArch64_address(dest.target()), "bad address");
4179   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4180 
4181   InstructionMark im(this);
4182   code_section()->relocate(inst_mark(), dest.rspec());
4183   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4184   // the code cache, so that if it is relocated we know it will still reach the dest
4185   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4186     _adrp(reg1, dest.target());
4187   } else {
4188     unsigned long target = (unsigned long)dest.target();
4189     unsigned long adrp_target
4190       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4191 
4192     _adrp(reg1, (address)adrp_target);
4193     movk(reg1, target >> 32, 32);
4194   }
4195   byte_offset = (unsigned long)dest.target() & 0xfff;
4196 }
4197 
4198 void MacroAssembler::load_byte_map_base(Register reg) {
4199   jbyte *byte_map_base =
4200     ((CardTableModRefBS*)(Universe::heap()->barrier_set()))->byte_map_base;
4201 
4202   if (is_valid_AArch64_address((address)byte_map_base)) {
4203     // Strictly speaking the byte_map_base isn't an address at all,
4204     // and it might even be negative.
4205     unsigned long offset;
4206     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4207     // We expect offset to be zero with most collectors.
4208     if (offset != 0) {
4209       add(reg, reg, offset);
4210     }
4211   } else {
4212     mov(reg, (uint64_t)byte_map_base);
4213   }
4214 }
4215 
4216 void MacroAssembler::build_frame(int framesize) {
4217   if (framesize == 0) {
4218     // Is this even possible?
4219     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4220   } else if (framesize < ((1 << 9) + 2 * wordSize)) {
4221     sub(sp, sp, framesize);
4222     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4223   } else {
4224     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4225     if (framesize < ((1 << 12) + 2 * wordSize))
4226       sub(sp, sp, framesize - 2 * wordSize);
4227     else {
4228       mov(rscratch1, framesize - 2 * wordSize);
4229       sub(sp, sp, rscratch1);
4230     }
4231   }
4232 }
4233 
4234 void MacroAssembler::remove_frame(int framesize) {
4235   if (framesize == 0) {
4236     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4237   } else if (framesize < ((1 << 9) + 2 * wordSize)) {
4238     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4239     add(sp, sp, framesize);
4240   } else {
4241     if (framesize < ((1 << 12) + 2 * wordSize))
4242       add(sp, sp, framesize - 2 * wordSize);
4243     else {
4244       mov(rscratch1, framesize - 2 * wordSize);
4245       add(sp, sp, rscratch1);
4246     }
4247     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4248   }
4249 }
4250 
4251 // Search for str1 in str2 and return index or -1
4252 void MacroAssembler::string_indexof(Register str2, Register str1,
4253                                     Register cnt2, Register cnt1,
4254                                     Register tmp1, Register tmp2,
4255                                     Register tmp3, Register tmp4,
4256                                     int icnt1, Register result) {
4257   Label BM, LINEARSEARCH, DONE, NOMATCH, MATCH;
4258 
4259   Register ch1 = rscratch1;
4260   Register ch2 = rscratch2;
4261   Register cnt1tmp = tmp1;
4262   Register cnt2tmp = tmp2;
4263   Register cnt1_neg = cnt1;
4264   Register cnt2_neg = cnt2;
4265   Register result_tmp = tmp4;
4266 
4267   // Note, inline_string_indexOf() generates checks:
4268   // if (substr.count > string.count) return -1;
4269   // if (substr.count == 0) return 0;
4270 
4271 // We have two strings, a source string in str2, cnt2 and a pattern string
4272 // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
4273 
4274 // For larger pattern and source we use a simplified Boyer Moore algorithm.
4275 // With a small pattern and source we use linear scan.
4276 
4277   if (icnt1 == -1) {
4278     cmp(cnt1, 256);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4279     ccmp(cnt1, 8, 0b0000, LO);  // Can't handle skip >= 256 because we use
4280     br(LO, LINEARSEARCH);       // a byte array.
4281     cmp(cnt1, cnt2, LSR, 2);    // Source must be 4 * pattern for BM
4282     br(HS, LINEARSEARCH);
4283   }
4284 
4285 // The Boyer-Moore algorithm is based on the description here:
4286 //
4287 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4288 //
4289 // This describes an algorithm with two shift rules: the 'Bad Character'
4290 // rule and the 'Good Suffix' rule.
4291 //
4292 // These rules are essentially heuristics for how far we can shift the
4293 // pattern along the search string.
4294 //
4295 // The implementation here uses the 'Bad Character' rule only because of the
4296 // complexity of initialisation for the 'Good Suffix' rule.
4297 //
4298 // This is also known as the Boyer-Moore-Horspool algorithm:
4299 //
4300 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4301 //
// #define ASIZE 128
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing: bc[c] = 1 + index of the last occurrence of c
//          in x[0..m-2], or 0 if c does not occur there. */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = 0;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          if (c < ASIZE) bc[c] = i;
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[j + m - 1];
//          if (x[m - 1] == c) {
//             for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//             if (i < 0) return j;
//          }
//          if (c < ASIZE)
//             j += m - bc[c];  // Bad Character rule
//          else
//             j += 1;          // Advance by 1 only if char >= ASIZE
//       }
//       return -1;
//    }
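//
// A worked illustration (not generated code): for the pattern x = "abcab"
// (m = 5) the preprocessing loop leaves bc['a'] = 4, bc['b'] = 2,
// bc['c'] = 3 and 0 elsewhere.  A window whose last source char is 'c'
// therefore shifts by 5 - 3 = 2, one ending in 'a' by 5 - 4 = 1, and one
// ending in a char that never occurs in x[0..m-2] by the full 5.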
4331 
4332   if (icnt1 == -1) {
4333     BIND(BM);
4334 
4335     Label ZLOOP, BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP;
4336     Label BMADV, BMMATCH, BMCHECKEND;
4337 
4338     Register cnt1end = tmp2;
4339     Register str2end = cnt2;
4340     Register skipch = tmp2;
4341 
    // Restrict ASIZE to 128 to reduce stack space/initialisation.
    // The presence of chars >= ASIZE in the pattern string does not affect
    // performance, but we must be careful not to use them to index the
    // stack-allocated table.
    // The presence of chars >= ASIZE in the source string may adversely
    // affect performance, since we can only advance by one char when we
    // encounter one.
4348 
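      // Allocate and zero the 128-byte 'bc' table on the stack
      // (eight 16-byte stp stores).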
4349       stp(zr, zr, pre(sp, -128));
4350       for (int i = 1; i < 8; i++)
4351           stp(zr, zr, Address(sp, i*16));
4352 
4353       mov(cnt1tmp, 0);
4354       sub(cnt1end, cnt1, 1);
4355     BIND(BCLOOP);
4356       ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
4357       cmp(ch1, 128);
4358       add(cnt1tmp, cnt1tmp, 1);
4359       br(HS, BCSKIP);
4360       strb(cnt1tmp, Address(sp, ch1));
4361     BIND(BCSKIP);
4362       cmp(cnt1tmp, cnt1end);
4363       br(LT, BCLOOP);
4364 
4365       mov(result_tmp, str2);
4366 
4367       sub(cnt2, cnt2, cnt1);
4368       add(str2end, str2, cnt2, LSL, 1);
4369     BIND(BMLOOPSTR2);
4370       sub(cnt1tmp, cnt1, 1);
4371       ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
4372       ldrh(skipch, Address(str2, cnt1tmp, Address::lsl(1)));
4373       cmp(ch1, skipch);
4374       br(NE, BMSKIP);
4375       subs(cnt1tmp, cnt1tmp, 1);
4376       br(LT, BMMATCH);
4377     BIND(BMLOOPSTR1);
4378       ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
4379       ldrh(ch2, Address(str2, cnt1tmp, Address::lsl(1)));
4380       cmp(ch1, ch2);
4381       br(NE, BMSKIP);
4382       subs(cnt1tmp, cnt1tmp, 1);
4383       br(GE, BMLOOPSTR1);
4384     BIND(BMMATCH);
4385       sub(result_tmp, str2, result_tmp);
4386       lsr(result, result_tmp, 1);
4387       add(sp, sp, 128);
4388       b(DONE);
4389     BIND(BMADV);
4390       add(str2, str2, 2);
4391       b(BMCHECKEND);
4392     BIND(BMSKIP);
4393       cmp(skipch, 128);
4394       br(HS, BMADV);
4395       ldrb(ch2, Address(sp, skipch));
4396       add(str2, str2, cnt1, LSL, 1);
4397       sub(str2, str2, ch2, LSL, 1);
4398     BIND(BMCHECKEND);
4399       cmp(str2, str2end);
4400       br(LE, BMLOOPSTR2);
4401       add(sp, sp, 128);
4402       b(NOMATCH);
4403   }
4404 
4405   BIND(LINEARSEARCH);
4406   {
4407     Label DO1, DO2, DO3;
4408 
4409     Register str2tmp = tmp2;
4410     Register first = tmp3;
4411 
    if (icnt1 == -1) {
4414         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT, LAST_WORD;
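        // Generic linear scan for cnt1 >= 4: keep the first 4 pattern
        // chars in 'first' and compare them against each window of str2
        // with a single 64-bit load; on a hit, check the rest of the
        // pattern a longword at a time.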
4415 
4416         cmp(cnt1, 4);
4417         br(LT, DOSHORT);
4418 
4419         sub(cnt2, cnt2, cnt1);
4420         sub(cnt1, cnt1, 4);
4421         mov(result_tmp, cnt2);
4422 
4423         lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4424         lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4425         sub(cnt1_neg, zr, cnt1, LSL, 1);
4426         sub(cnt2_neg, zr, cnt2, LSL, 1);
4427         ldr(first, Address(str1, cnt1_neg));
4428 
4429       BIND(FIRST_LOOP);
4430         ldr(ch2, Address(str2, cnt2_neg));
4431         cmp(first, ch2);
4432         br(EQ, STR1_LOOP);
4433       BIND(STR2_NEXT);
4434         adds(cnt2_neg, cnt2_neg, 2);
4435         br(LE, FIRST_LOOP);
4436         b(NOMATCH);
4437 
4438       BIND(STR1_LOOP);
4439         adds(cnt1tmp, cnt1_neg, 8);
4440         add(cnt2tmp, cnt2_neg, 8);
4441         br(GE, LAST_WORD);
4442 
4443       BIND(STR1_NEXT);
4444         ldr(ch1, Address(str1, cnt1tmp));
4445         ldr(ch2, Address(str2, cnt2tmp));
4446         cmp(ch1, ch2);
4447         br(NE, STR2_NEXT);
4448         adds(cnt1tmp, cnt1tmp, 8);
4449         add(cnt2tmp, cnt2tmp, 8);
4450         br(LT, STR1_NEXT);
4451 
4452       BIND(LAST_WORD);
4453         ldr(ch1, Address(str1));
4454         sub(str2tmp, str2, cnt1_neg);         // adjust to corresponding
4455         ldr(ch2, Address(str2tmp, cnt2_neg)); // word in str2
4456         cmp(ch1, ch2);
4457         br(NE, STR2_NEXT);
4458         b(MATCH);
4459 
4460       BIND(DOSHORT);
4461         cmp(cnt1, 2);
4462         br(LT, DO1);
4463         br(GT, DO3);
4464     }
4465 
4466     if (icnt1 == 4) {
4467       Label CH1_LOOP;
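      // Constant 4-char pattern: load all 4 chars (8 bytes) into ch1
      // once, then compare a doubleword of str2 per position.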
4468 
4469         ldr(ch1, str1);
4470         sub(cnt2, cnt2, 4);
4471         mov(result_tmp, cnt2);
4472         lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4473         sub(cnt2_neg, zr, cnt2, LSL, 1);
4474 
4475       BIND(CH1_LOOP);
4476         ldr(ch2, Address(str2, cnt2_neg));
4477         cmp(ch1, ch2);
4478         br(EQ, MATCH);
4479         adds(cnt2_neg, cnt2_neg, 2);
4480         br(LE, CH1_LOOP);
4481         b(NOMATCH);
4482     }
4483 
4484     if (icnt1 == -1 || icnt1 == 2) {
4485       Label CH1_LOOP;
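      // 2-char pattern (also the DOSHORT target for generic cnt1 == 2):
      // a single 32-bit compare per position.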
4486 
4487       BIND(DO2);
4488         ldrw(ch1, str1);
4489         sub(cnt2, cnt2, 2);
4490         mov(result_tmp, cnt2);
4491         lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4492         sub(cnt2_neg, zr, cnt2, LSL, 1);
4493 
4494       BIND(CH1_LOOP);
4495         ldrw(ch2, Address(str2, cnt2_neg));
4496         cmp(ch1, ch2);
4497         br(EQ, MATCH);
4498         adds(cnt2_neg, cnt2_neg, 2);
4499         br(LE, CH1_LOOP);
4500         b(NOMATCH);
4501     }
4502 
4503     if (icnt1 == -1 || icnt1 == 3) {
4504       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
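      // 3-char pattern: match the first 2 chars with a 32-bit compare,
      // then check the 3rd char separately.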
4505 
4506       BIND(DO3);
4507         ldrw(first, str1);
4508         ldrh(ch1, Address(str1, 4));
4509 
4510         sub(cnt2, cnt2, 3);
4511         mov(result_tmp, cnt2);
4512         lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4513         sub(cnt2_neg, zr, cnt2, LSL, 1);
4514 
4515       BIND(FIRST_LOOP);
4516         ldrw(ch2, Address(str2, cnt2_neg));
4517         cmpw(first, ch2);
4518         br(EQ, STR1_LOOP);
4519       BIND(STR2_NEXT);
4520         adds(cnt2_neg, cnt2_neg, 2);
4521         br(LE, FIRST_LOOP);
4522         b(NOMATCH);
4523 
4524       BIND(STR1_LOOP);
4525         add(cnt2tmp, cnt2_neg, 4);
4526         ldrh(ch2, Address(str2, cnt2tmp));
4527         cmp(ch1, ch2);
4528         br(NE, STR2_NEXT);
4529         b(MATCH);
4530     }
4531 
4532     if (icnt1 == -1 || icnt1 == 1) {
4533       Label CH1_LOOP, HAS_ZERO;
4534       Label DO1_SHORT, DO1_LOOP;
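      // 1-char pattern: for cnt2 >= 4, broadcast the char into all four
      // 16-bit lanes of ch1 and scan 8 bytes at a time using the SWAR
      // zero-lane test sketched below; otherwise use a simple halfword
      // loop.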
4535 
4536       BIND(DO1);
4537         ldrh(ch1, str1);
4538         cmp(cnt2, 4);
4539         br(LT, DO1_SHORT);
4540 
4541         orr(ch1, ch1, ch1, LSL, 16);
4542         orr(ch1, ch1, ch1, LSL, 32);
4543 
4544         sub(cnt2, cnt2, 4);
4545         mov(result_tmp, cnt2);
4546         lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4547         sub(cnt2_neg, zr, cnt2, LSL, 1);
4548 
4549         mov(tmp3, 0x0001000100010001);
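      // SWAR zero-lane test, as C (a sketch of the next four instructions):
      //   v = ch1 ^ ch2;                          // lane == 0  <=>  match
      //   t = (v - 0x0001000100010001UL) & ~v & 0x8000800080008000UL;
      //   t != 0  <=>  some 16-bit lane of v is zero
      // ~v & 0x8000... is formed here as ~(v | 0x7fff7fff7fff7fff).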
4550       BIND(CH1_LOOP);
4551         ldr(ch2, Address(str2, cnt2_neg));
4552         eor(ch2, ch1, ch2);
4553         sub(tmp1, ch2, tmp3);
4554         orr(tmp2, ch2, 0x7fff7fff7fff7fff);
4555         bics(tmp1, tmp1, tmp2);
4556         br(NE, HAS_ZERO);
4557         adds(cnt2_neg, cnt2_neg, 8);
4558         br(LT, CH1_LOOP);
4559 
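        // The loop exits before the word at offset 0 (the last 4 chars)
        // has necessarily been scanned, so scan it once with cnt2_neg = 0;
        // the second time around cnt2_neg is 8 and we fall through to fail.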
4560         cmp(cnt2_neg, 8);
4561         mov(cnt2_neg, 0);
4562         br(LT, CH1_LOOP);
4563         b(NOMATCH);
4564 
4565       BIND(HAS_ZERO);
4566         rev(tmp1, tmp1);
4567         clz(tmp1, tmp1);
4568         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4569         b(MATCH);
4570 
4571       BIND(DO1_SHORT);
4572         mov(result_tmp, cnt2);
4573         lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4574         sub(cnt2_neg, zr, cnt2, LSL, 1);
4575       BIND(DO1_LOOP);
4576         ldrh(ch2, Address(str2, cnt2_neg));
4577         cmpw(ch1, ch2);
4578         br(EQ, MATCH);
4579         adds(cnt2_neg, cnt2_neg, 2);
4580         br(LT, DO1_LOOP);
4581     }
4582   }
4583   BIND(NOMATCH);
4584     mov(result, -1);
4585     b(DONE);
4586   BIND(MATCH);
4587     add(result, result_tmp, cnt2_neg, ASR, 1);
4588   BIND(DONE);
4589 }
4590 
4591 // Compare strings.
4592 void MacroAssembler::string_compare(Register str1, Register str2,
4593                                     Register cnt1, Register cnt2, Register result,
4594                                     Register tmp1) {
4595   Label LENGTH_DIFF, DONE, SHORT_LOOP, SHORT_STRING,
4596     NEXT_WORD, DIFFERENCE;
4597 
4598   BLOCK_COMMENT("string_compare {");
4599 
4600   // Compute the minimum of the string lengths and save the difference.
4601   subsw(tmp1, cnt1, cnt2);
4602   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4603 
4604   // A very short string
4605   cmpw(cnt2, 4);
4606   br(Assembler::LT, SHORT_STRING);
4607 
4608   // Check if the strings start at the same location.
4609   cmp(str1, str2);
4610   br(Assembler::EQ, LENGTH_DIFF);
4611 
4612   // Compare longwords
4613   {
4614     subw(cnt2, cnt2, 4); // The last longword is a special case
4615 
4616     // Move both string pointers to the last longword of their
4617     // strings, negate the remaining count, and convert it to bytes.
4618     lea(str1, Address(str1, cnt2, Address::uxtw(1)));
4619     lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4620     sub(cnt2, zr, cnt2, LSL, 1);
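    // In effect (a sketch): str1/str2 now point at their last longwords
    // and cnt2 == -2 * (min_length - 4) bytes, so the loop below performs
    //   for (off = cnt2; off < 0; off += 8)
    //     compare *(uint64_t *)(str1 + off) with *(uint64_t *)(str2 + off);
    // the final longword at offset 0 is handled separately afterwards.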
4621 
    // Loop, loading longwords and XORing them into rscratch2;
    // a non-zero result means the strings differ.
4623     bind(NEXT_WORD);
4624     ldr(result, Address(str1, cnt2));
4625     ldr(cnt1, Address(str2, cnt2));
4626     adds(cnt2, cnt2, wordSize);
4627     eor(rscratch2, result, cnt1);
4628     cbnz(rscratch2, DIFFERENCE);
4629     br(Assembler::LT, NEXT_WORD);
4630 
4631     // Last longword.  In the case where length == 4 we compare the
4632     // same longword twice, but that's still faster than another
4633     // conditional branch.
4634 
4635     ldr(result, Address(str1));
4636     ldr(cnt1, Address(str2));
4637     eor(rscratch2, result, cnt1);
4638     cbz(rscratch2, LENGTH_DIFF);
4639 
4640     // Find the first different characters in the longwords and
4641     // compute their difference.
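    // As C (a sketch, relying on the little-endian loads used here):
    //   uint64_t diff = w1 ^ w2;                // known to be non-zero
    //   int bit = __builtin_ctzll(diff) & ~15;  // rev+clz acts as a
    //                                           // count-trailing-zeros
    //   return (uint16_t)(w1 >> bit) - (uint16_t)(w2 >> bit);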
4642     bind(DIFFERENCE);
4643     rev(rscratch2, rscratch2);
4644     clz(rscratch2, rscratch2);
4645     andr(rscratch2, rscratch2, -16);
4646     lsrv(result, result, rscratch2);
4647     uxthw(result, result);
4648     lsrv(cnt1, cnt1, rscratch2);
4649     uxthw(cnt1, cnt1);
4650     subw(result, result, cnt1);
4651     b(DONE);
4652   }
4653 
4654   bind(SHORT_STRING);
4655   // Is the minimum length zero?
4656   cbz(cnt2, LENGTH_DIFF);
4657 
4658   bind(SHORT_LOOP);
4659   load_unsigned_short(result, Address(post(str1, 2)));
4660   load_unsigned_short(cnt1, Address(post(str2, 2)));
4661   subw(result, result, cnt1);
4662   cbnz(result, DONE);
4663   sub(cnt2, cnt2, 1);
4664   cbnz(cnt2, SHORT_LOOP);
4665 
4666   // Strings are equal up to min length.  Return the length difference.
4667   bind(LENGTH_DIFF);
4668   mov(result, tmp1);
4669 
4670   // That's it
4671   bind(DONE);
4672 
4673   BLOCK_COMMENT("} string_compare");
4674 }
4675 
4676 
// base:     Address of a buffer to be zeroed, 8 bytes aligned.
// cnt:      Count in HeapWords.
4680 void MacroAssembler::zero_words(Register base, Register cnt)
4681 {
4682   if (UseBlockZeroing) {
4683     block_zero(base, cnt);
4684   } else {
4685     fill_words(base, cnt, zr);
4686   }
4687 }
4688 
4689 // r10 = base:   Address of a buffer to be zeroed, 8 bytes aligned.
4690 // cnt:          Immediate count in HeapWords.
4691 // r11 = tmp:    For use as cnt if we need to call out
4692 #define ShortArraySize (18 * BytesPerLong)
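// In outline (a C-ish sketch of the strategy below):
//   if (cnt & 1) *base = 0;                        // odd word first
//   if (cnt <= 18)          { /* fully unrolled stp(zr, zr) pairs */ }
//   else if (UseBlockZeroing && cnt >= BlockZeroingLowLimit / 8)
//                           { /* block_zero via DC ZVA */ }
//   else                    { /* peel cnt % 8 words, then a 4x-unrolled
//                                stp loop over the remainder */ }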
4693 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
4694 {
4695   Register tmp = r11;
4696   int i = cnt & 1;  // store any odd word to start
4697   if (i) str(zr, Address(base));
4698 
4699   if (cnt <= ShortArraySize / BytesPerLong) {
4700     for (; i < (int)cnt; i += 2)
4701       stp(zr, zr, Address(base, i * wordSize));
4702   } else if (UseBlockZeroing && cnt >= (u_int64_t)(BlockZeroingLowLimit >> LogBytesPerWord)) {
4703     mov(tmp, cnt);
4704     block_zero(base, tmp, true);
4705   } else {
4706     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
4707     int remainder = cnt % (2 * unroll);
4708     for (; i < remainder; i += 2)
4709       stp(zr, zr, Address(base, i * wordSize));
4710 
4711     Label loop;
4712     Register cnt_reg = rscratch1;
4713     Register loop_base = rscratch2;
4714     cnt = cnt - remainder;
4715     mov(cnt_reg, cnt);
4716     // adjust base and prebias by -2 * wordSize so we can pre-increment
4717     add(loop_base, base, (remainder - 2) * wordSize);
4718     bind(loop);
4719     sub(cnt_reg, cnt_reg, 2 * unroll);
4720     for (i = 1; i < unroll; i++)
4721       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
4722     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
4723     cbnz(cnt_reg, loop);
4724   }
4725 }
4726 
4727 // base:   Address of a buffer to be filled, 8 bytes aligned.
// cnt:    Count in 8-byte units.
// value:  Value to fill the buffer with.
4730 // base will point to the end of the buffer after filling.
4731 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
4732 {
4733 //  Algorithm:
4734 //
4735 //    scratch1 = cnt & 7;
4736 //    cnt -= scratch1;
4737 //    p += scratch1;
4738 //    switch (scratch1) {
4739 //      do {
4740 //        cnt -= 8;
4741 //          p[-8] = v;
4742 //        case 7:
4743 //          p[-7] = v;
4744 //        case 6:
4745 //          p[-6] = v;
4746 //          // ...
4747 //        case 1:
4748 //          p[-1] = v;
4749 //        case 0:
4750 //          p += 8;
4751 //      } while (cnt);
4752 //    }
4753 
4754   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
4755 
4756   Label fini, skip, entry, loop;
4757   const int unroll = 8; // Number of stp instructions we'll unroll
4758 
4759   cbz(cnt, fini);
4760   tbz(base, 3, skip);
4761   str(value, Address(post(base, 8)));
4762   sub(cnt, cnt, 1);
4763   bind(skip);
4764 
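  // Computed entry into the unrolled loop (Duff's device): each stp below
  // is one 4-byte instruction that stores 2 words, i.e. 2 bytes of code
  // per word, which is what the LSL #1 accounts for.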
4765   andr(rscratch1, cnt, (unroll-1) * 2);
4766   sub(cnt, cnt, rscratch1);
4767   add(base, base, rscratch1, Assembler::LSL, 3);
4768   adr(rscratch2, entry);
4769   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
4770   br(rscratch2);
4771 
4772   bind(loop);
4773   add(base, base, unroll * 16);
4774   for (int i = -unroll; i < 0; i++)
4775     stp(value, value, Address(base, i * 16));
4776   bind(entry);
4777   subs(cnt, cnt, unroll * 2);
4778   br(Assembler::GE, loop);
4779 
4780   tbz(cnt, 0, fini);
4781   str(value, Address(post(base, 8)));
4782   bind(fini);
4783 }
4784 
4785 // Use DC ZVA to do fast zeroing.
4786 // base:   Address of a buffer to be zeroed, 8 bytes aligned.
4787 // cnt:    Count in HeapWords.
4788 // is_large: True when 'cnt' is known to be >= BlockZeroingLowLimit.
4789 void MacroAssembler::block_zero(Register base, Register cnt, bool is_large)
4790 {
4791   Label small;
4792   Label store_pair, loop_store_pair, done;
4793   Label base_aligned;
4794 
4795   assert_different_registers(base, cnt, rscratch1);
4796   guarantee(base == r10 && cnt == r11, "fix register usage");
4797 
4798   Register tmp = rscratch1;
4799   Register tmp2 = rscratch2;
4800   int zva_length = VM_Version::zva_length();
4801 
  // The ZVA length must be a multiple of 16; the subsequent
  // operations rely on this.
4804   assert (zva_length % 16 == 0, "Unexpected ZVA Length");
4805 
4806   if (!is_large) cbz(cnt, done);
4807   tbz(base, 3, base_aligned);
4808   str(zr, Address(post(base, 8)));
4809   sub(cnt, cnt, 1);
4810   bind(base_aligned);
4811 
  // Ensure count >= zva_length * 2 so that the buffer still merits a
  // DC ZVA once the base address has been aligned.
4814   if (!is_large || !(BlockZeroingLowLimit >= zva_length * 2)) {
4815     int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
4816     subs(tmp, cnt, low_limit >> 3);
4817     br(Assembler::LT, small);
4818   }
4819 
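  // What the stub is assumed to do (a sketch; the real routine is
  // generated elsewhere): align 'base' up to a ZVA block with ordinary
  // stores, then
  //   while (cnt >= zva_length / 8) {    // cnt counts 8-byte words
  //     DC ZVA, base;                    // zero one zva_length-byte block
  //     base += zva_length; cnt -= zva_length / 8;
  //   }
  // leaving the remainder in cnt for the small loop below.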
4820   far_call(StubRoutines::aarch64::get_zero_longs());
4821 
4822   bind(small);
4823 
4824   const int unroll = 8; // Number of stp instructions we'll unroll
4825   Label small_loop, small_table_end;
4826 
4827   andr(tmp, cnt, (unroll-1) * 2);
4828   sub(cnt, cnt, tmp);
4829   add(base, base, tmp, Assembler::LSL, 3);
4830   adr(tmp2, small_table_end);
4831   sub(tmp2, tmp2, tmp, Assembler::LSL, 1);
4832   br(tmp2);
4833 
4834   bind(small_loop);
4835   add(base, base, unroll * 16);
4836   for (int i = -unroll; i < 0; i++)
4837     stp(zr, zr, Address(base, i * 16));
4838   bind(small_table_end);
4839   subs(cnt, cnt, unroll * 2);
4840   br(Assembler::GE, small_loop);
4841 
4842   tbz(cnt, 0, done);
4843   str(zr, Address(post(base, 8)));
4844 
4845   bind(done);
4846 }
4847 
4848 void MacroAssembler::string_equals(Register str1, Register str2,
4849                                    Register cnt, Register result,
4850                                    Register tmp1) {
4851   Label SAME_CHARS, DONE, SHORT_LOOP, SHORT_STRING,
4852     NEXT_WORD;
4853 
4854   const Register tmp2 = rscratch1;
4855   assert_different_registers(str1, str2, cnt, result, tmp1, tmp2, rscratch2);
4856 
4857   BLOCK_COMMENT("string_equals {");
4858 
4859   // Start by assuming that the strings are not equal.
4860   mov(result, zr);
4861 
4862   // A very short string
4863   cmpw(cnt, 4);
4864   br(Assembler::LT, SHORT_STRING);
4865 
4866   // Check if the strings start at the same location.
4867   cmp(str1, str2);
4868   br(Assembler::EQ, SAME_CHARS);
4869 
4870   // Compare longwords
4871   {
4872     subw(cnt, cnt, 4); // The last longword is a special case
4873 
4874     // Move both string pointers to the last longword of their
4875     // strings, negate the remaining count, and convert it to bytes.
4876     lea(str1, Address(str1, cnt, Address::uxtw(1)));
4877     lea(str2, Address(str2, cnt, Address::uxtw(1)));
4878     sub(cnt, zr, cnt, LSL, 1);
4879 
    // Loop, loading longwords and XORing them into rscratch2;
    // a non-zero result means the strings differ.
4881     bind(NEXT_WORD);
4882     ldr(tmp1, Address(str1, cnt));
4883     ldr(tmp2, Address(str2, cnt));
4884     adds(cnt, cnt, wordSize);
4885     eor(rscratch2, tmp1, tmp2);
4886     cbnz(rscratch2, DONE);
4887     br(Assembler::LT, NEXT_WORD);
4888 
4889     // Last longword.  In the case where length == 4 we compare the
4890     // same longword twice, but that's still faster than another
4891     // conditional branch.
4892 
4893     ldr(tmp1, Address(str1));
4894     ldr(tmp2, Address(str2));
4895     eor(rscratch2, tmp1, tmp2);
4896     cbz(rscratch2, SAME_CHARS);
4897     b(DONE);
4898   }
4899 
4900   bind(SHORT_STRING);
4901   // Is the length zero?
4902   cbz(cnt, SAME_CHARS);
4903 
4904   bind(SHORT_LOOP);
4905   load_unsigned_short(tmp1, Address(post(str1, 2)));
4906   load_unsigned_short(tmp2, Address(post(str2, 2)));
4907   subw(tmp1, tmp1, tmp2);
4908   cbnz(tmp1, DONE);
4909   sub(cnt, cnt, 1);
4910   cbnz(cnt, SHORT_LOOP);
4911 
4912   // Strings are equal.
4913   bind(SAME_CHARS);
4914   mov(result, true);
4915 
4916   // That's it
4917   bind(DONE);
4918 
4919   BLOCK_COMMENT("} string_equals");
4920 }
4921 
4922 // Compare char[] arrays aligned to 4 bytes
4923 void MacroAssembler::char_arrays_equals(Register ary1, Register ary2,
4924                                         Register result, Register tmp1)
4925 {
4926   Register cnt1 = rscratch1;
4927   Register cnt2 = rscratch2;
4928   Register tmp2 = rscratch2;
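  // Note that cnt2 and tmp2 deliberately alias rscratch2: cnt2 is dead
  // once the lengths have been compared.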
4929 
4930   Label SAME, DIFFER, NEXT, TAIL03, TAIL01;
4931 
4932   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4933   int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);
4934 
4935   BLOCK_COMMENT("char_arrays_equals  {");
4936 
4937     // different until proven equal
4938     mov(result, false);
4939 
4940     // same array?
4941     cmpoops(ary1, ary2);
4942     br(Assembler::EQ, SAME);
4943 
4944     // ne if either null
4945     cbz(ary1, DIFFER);
4946     cbz(ary2, DIFFER);
4947 
4948     // lengths ne?
4949     ldrw(cnt1, Address(ary1, length_offset));
4950     ldrw(cnt2, Address(ary2, length_offset));
4951     cmp(cnt1, cnt2);
4952     br(Assembler::NE, DIFFER);
4953 
4954     lea(ary1, Address(ary1, base_offset));
4955     lea(ary2, Address(ary2, base_offset));
4956 
4957     subs(cnt1, cnt1, 4);
4958     br(LT, TAIL03);
4959 
4960   BIND(NEXT);
4961     ldr(tmp1, Address(post(ary1, 8)));
4962     ldr(tmp2, Address(post(ary2, 8)));
4963     subs(cnt1, cnt1, 4);
4964     eor(tmp1, tmp1, tmp2);
4965     cbnz(tmp1, DIFFER);
4966     br(GE, NEXT);
4967 
4968   BIND(TAIL03);  // 0-3 chars left, cnt1 = #chars left - 4
4969     tst(cnt1, 0b10);
4970     br(EQ, TAIL01);
4971     ldrw(tmp1, Address(post(ary1, 4)));
4972     ldrw(tmp2, Address(post(ary2, 4)));
4973     cmp(tmp1, tmp2);
4974     br(NE, DIFFER);
4975   BIND(TAIL01);  // 0-1 chars left
4976     tst(cnt1, 0b01);
4977     br(EQ, SAME);
4978     ldrh(tmp1, ary1);
4979     ldrh(tmp2, ary2);
4980     cmp(tmp1, tmp2);
4981     br(NE, DIFFER);
4982 
4983   BIND(SAME);
4984     mov(result, true);
4985   BIND(DIFFER); // result already set
4986   
4987   BLOCK_COMMENT("} char_arrays_equals");
4988 }
4989 
4990 // encode char[] to byte[] in ISO_8859_1
4991 void MacroAssembler::encode_iso_array(Register src, Register dst,
4992                       Register len, Register result,
4993                       FloatRegister Vtmp1, FloatRegister Vtmp2,
4994                       FloatRegister Vtmp3, FloatRegister Vtmp4)
4995 {
4996     Label DONE, NEXT_32, LOOP_8, NEXT_8, LOOP_1, NEXT_1;
4997     Register tmp1 = rscratch1;
4998 
4999       mov(result, len); // Save initial len
5000 
5001 #ifndef BUILTIN_SIM
5002       subs(len, len, 32);
5003       br(LT, LOOP_8);
5004 
5005 // The following code uses the SIMD 'uqxtn' and 'uqxtn2' instructions
5006 // to convert chars to bytes. These set the 'QC' bit in the FPSR if
// any char could not fit in a byte, so clear the FPSR first and then
// test it after each block of conversions.
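// A scalar model of one 'uqxtn' lane (a sketch): for each 16-bit char c,
//   dst = (c <= 0xff) ? (uint8_t)c : 0xff;  QC |= (c > 0xff);
// so a non-zero QC means some char was outside ISO-8859-1.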
5008       clear_fpsr();
5009 
5010     BIND(NEXT_32);
5011       ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5012       uqxtn(Vtmp1, T8B, Vtmp1, T8H);  // uqxtn  - write bottom half
5013       uqxtn(Vtmp1, T16B, Vtmp2, T8H); // uqxtn2 - write top half
5014       uqxtn(Vtmp2, T8B, Vtmp3, T8H);
5015       uqxtn(Vtmp2, T16B, Vtmp4, T8H); // uqxtn2
5016       get_fpsr(tmp1);
5017       cbnzw(tmp1, LOOP_8);
5018       st1(Vtmp1, Vtmp2, T16B, post(dst, 32));
5019       subs(len, len, 32);
5020       add(src, src, 64);
5021       br(GE, NEXT_32);
5022 
5023     BIND(LOOP_8);
5024       adds(len, len, 32-8);
5025       br(LT, LOOP_1);
5026       clear_fpsr(); // QC may be set from loop above, clear again
5027     BIND(NEXT_8);
5028       ld1(Vtmp1, T8H, src);
5029       uqxtn(Vtmp1, T8B, Vtmp1, T8H);
5030       get_fpsr(tmp1);
5031       cbnzw(tmp1, LOOP_1);
5032       st1(Vtmp1, T8B, post(dst, 8));
5033       subs(len, len, 8);
5034       add(src, src, 16);
5035       br(GE, NEXT_8);
5036 
5037     BIND(LOOP_1);
5038       adds(len, len, 8);
5039       br(LE, DONE);
5040 #else
5041       cbz(len, DONE);
5042 #endif
5043     BIND(NEXT_1);
5044       ldrh(tmp1, Address(post(src, 2)));
5045       tst(tmp1, 0xff00);
5046       br(NE, DONE);
5047       strb(tmp1, Address(post(dst, 1)));
5048       subs(len, len, 1);
5049       br(GT, NEXT_1);
5050 
5051     BIND(DONE);
5052       sub(result, result, len); // Return index where we stopped
5053 }