/*
 * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "interpreter/interpreter.hpp"

#include "compiler/disassembler.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "gc/shenandoah/brooksPointer.hpp"
#include "gc/shenandoah/shenandoahHeap.hpp"
#include "gc/shenandoah/shenandoahHeap.inline.hpp"
#include "gc/shenandoah/shenandoahHeapRegion.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/klass.inline.hpp"
#include "oops/oop.inline.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"

#if INCLUDE_ALL_GCS
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1SATBCardTableModRefBS.hpp"
#include "gc/g1/heapRegion.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
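// For example, patching an unconditional branch rewrites its 26-bit signed
// word offset in place, while patching an adrp-based sequence rewrites the
// 21-bit page offset and, where present, the low 12 bits in the following
// ldr/str or add.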
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target - branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                   Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
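    // adr/adrp encode the immediate with the low two bits in immlo
    // (instruction bits 30:29) and the rest in immhi (bits 23:5).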
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
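  // The expected sequences are, roughly:
  //   narrow: movz Rd, #(n >> 16), lsl #16 ; movk Rd, #(n & 0xffff)
  //   wide:   movz Rd, #imm16 ; movk Rd, #imm16, lsl #16 ; movk Rd, #imm16, lsl #32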
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = oopDesc::encode_heap_oop((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
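    // Reassemble the 21-bit adr/adrp immediate from immlo (bits 30:29)
    // and immhi (bits 23:5).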
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110  &&
               Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}
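// On AArch64 this is implemented as a full data synchronization barrier
// (DSB SY); the thread and tmp arguments are unused.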
void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  dsb(Assembler::SY);
}


void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp and sp of the last Java frame have to be
// recorded in the (thread-local) JavaThread object. When leaving C land,
// the last Java fp has to be reset to 0. This is required to allow proper
// stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  if (last_java_pc != NULL) {
    adr(scratch, last_java_pc);
  } else {
    // FIXME: This is almost never correct.  We should delete all
    // cases of set_last_Java_frame with last_java_pc=NULL and use the
    // correct return address instead.
    adr(scratch, pc());
  }

  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  shenandoah_store_addr_check(obj_reg);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
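  // tmp_reg now holds the bits that differ between the mark word and
  // (prototype header | thread); zero outside the age bits means the
  // object is already biased toward this thread in the current epoch.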
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  shenandoah_store_addr_check(obj_reg); // Access mark word
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.
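// (A bl instruction reaches +/-128 MB; trampolines are only emitted when
// far_branches() indicates the code cache may exceed that range.)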

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  unsigned int start_offset = offset();
  if (far_branches() && !Compile::current()->in_scratch_emit_size()) {
    address stub = emit_trampoline_stub(start_offset, entry.target());
    if (stub == NULL) {
      return NULL; // CodeCache is full
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  address stub = start_a_stub(Compile::MAX_stubs_size/2);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
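  // The pc-relative ldr below loads the 8-byte destination that is emitted
  // right after the br, i.e. at NativeCallTrampolineStub::data_offset from
  // the stub start (see the assert below).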
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub;
}

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


void MacroAssembler::notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler::notify(type);
    // reset_last_Java_frame(true);
  }
  else
    Assembler::notify(type);
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface) {
  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
  // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
  lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
  if (itentry_off)
    add(recv_klass, recv_klass, itentry_off);

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

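  // The loop body below is emitted twice (peel == 1, then peel == 0); the
  // peeled first copy branches straight to found_method on a hit, so the
  // common case of matching the first itable entry takes no backward branch.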
  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  ldr(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
  ldr(method_result, Address(recv_klass, scan_temp));
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result, Address(recv_klass, vtable_offset_in_bytes));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    cmp(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful
// scans count pointer-sized words at [addr] for occurrence of value,
// generic
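// (Unlike x86's repne scas, this is an open-coded load/compare loop.)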
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// scans count 4-byte words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                 Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan loop below uses fixed registers, which we must spill.
1186   // Don't worry too much about pre-existing connections with the input regs.
1187 
1188   assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
1189   assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)
1190 
1191   // Get super_klass value into r0 (even if it was in r5 or r2).
1192   RegSet pushed_registers;
1193   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1194   if (!IS_A_TEMP(r5))    pushed_registers += r5;
1195 
1196   if (super_klass != r0 || UseCompressedOops) {
1197     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1198   }
1199 
1200   push(pushed_registers, sp);
1201 
1202 #ifndef PRODUCT
1203   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1204   Address pst_counter_addr(rscratch2);
1205   ldr(rscratch1, pst_counter_addr);
1206   add(rscratch1, rscratch1, 1);
1207   str(rscratch1, pst_counter_addr);
1208 #endif //PRODUCT
1209 
1210   // We will consult the secondary-super array.
1211   ldr(r5, secondary_supers_addr);
1212   // Load the array length.
1213   ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1214   // Skip to start of data.
1215   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1216 
1217   cmp(sp, zr); // Clear Z flag; SP is never zero
1218   // Scan R2 words at [R5] for an occurrence of R0.
1219   // Set NZ/Z based on last compare.
1220   repne_scan(r5, r0, r2, rscratch1);
1221 
1222   // Unspill the temp. registers:
1223   pop(pushed_registers, sp);
1224 
1225   br(Assembler::NE, *L_failure);
1226 
1227   // Success.  Cache the super we found and proceed in triumph.
1228   str(super_klass, super_cache_addr);
1229 
1230   if (L_success != &L_fallthrough) {
1231     b(*L_success);
1232   }
1233 
1234 #undef IS_A_TEMP
1235 
1236   bind(L_fallthrough);
1237 }
1238 
1239 
1240 void MacroAssembler::verify_oop(Register reg, const char* s) {
1241   if (!VerifyOops) return;
1242 
1243   // Pass register number to verify_oop_subroutine
1244   const char* b = NULL;
1245   {
1246     ResourceMark rm;
1247     stringStream ss;
1248     ss.print("verify_oop: %s: %s", reg->name(), s);
1249     b = code_string(ss.as_string());
1250   }
1251   BLOCK_COMMENT("verify_oop {");
1252 
1253   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1254   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1255 
1256   mov(r0, reg);
1257   mov(rscratch1, (address)b);
1258 
1259   // call indirectly to solve generation ordering problem
1260   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1261   ldr(rscratch2, Address(rscratch2));
1262   blr(rscratch2);
1263 
1264   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1265   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1266 
1267   BLOCK_COMMENT("} verify_oop");
1268 }
1269 
1270 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1271   if (!VerifyOops) return;
1272 
1273   const char* b = NULL;
1274   {
1275     ResourceMark rm;
1276     stringStream ss;
1277     ss.print("verify_oop_addr: %s", s);
1278     b = code_string(ss.as_string());
1279   }
1280   BLOCK_COMMENT("verify_oop_addr {");
1281 
1282   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1283   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1284 
1285   // addr may contain sp so we will have to adjust it based on the
1286   // pushes that we just did.
1287   if (addr.uses(sp)) {
1288     lea(r0, addr);
1289     ldr(r0, Address(r0, 4 * wordSize));
1290   } else {
1291     ldr(r0, addr);
1292   }
1293   mov(rscratch1, (address)b);
1294 
1295   // call indirectly to solve generation ordering problem
1296   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1297   ldr(rscratch2, Address(rscratch2));
1298   blr(rscratch2);
1299 
1300   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1301   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1302 
1303   BLOCK_COMMENT("} verify_oop_addr");
1304 }
1305 
1306 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1307                                          int extra_slot_offset) {
1308   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1309   int stackElementSize = Interpreter::stackElementSize;
1310   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1311 #ifdef ASSERT
1312   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1313   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1314 #endif
1315   if (arg_slot.is_constant()) {
1316     return Address(esp, arg_slot.as_constant() * stackElementSize
1317                    + offset);
1318   } else {
1319     add(rscratch1, esp, arg_slot.as_register(),
1320         ext::uxtx, exact_log2(stackElementSize));
1321     return Address(rscratch1, offset);
1322   }
1323 }
1324 
1325 void MacroAssembler::call_VM_leaf_base(address entry_point,
1326                                        int number_of_arguments,
1327                                        Label *retaddr) {
1328   call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
1329 }
1330 
1331 void MacroAssembler::call_VM_leaf_base1(address entry_point,
1332                                         int number_of_gp_arguments,
1333                                         int number_of_fp_arguments,
1334                                         ret_type type,
1335                                         Label *retaddr) {
1336   Label E, L;
1337 
1338   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1339 
  // We add 1 to number_of_gp_arguments because the thread in arg0 is
  // not counted.
1342   mov(rscratch1, entry_point);
1343   blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1344   if (retaddr)
1345     bind(*retaddr);
1346 
1347   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1348   maybe_isb();
1349 }
1350 
1351 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1352   call_VM_leaf_base(entry_point, number_of_arguments);
1353 }
1354 
1355 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1356   pass_arg0(this, arg_0);
1357   call_VM_leaf_base(entry_point, 1);
1358 }
1359 
1360 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1361   pass_arg0(this, arg_0);
1362   pass_arg1(this, arg_1);
1363   call_VM_leaf_base(entry_point, 2);
1364 }
1365 
1366 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1367                                   Register arg_1, Register arg_2) {
1368   pass_arg0(this, arg_0);
1369   pass_arg1(this, arg_1);
1370   pass_arg2(this, arg_2);
1371   call_VM_leaf_base(entry_point, 3);
1372 }
1373 
1374 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1375   pass_arg0(this, arg_0);
1376   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1377 }
1378 
1379 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1380 
1381   assert(arg_0 != c_rarg1, "smashed arg");
1382   pass_arg1(this, arg_1);
1383   pass_arg0(this, arg_0);
1384   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1385 }
1386 
1387 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1388   assert(arg_0 != c_rarg2, "smashed arg");
1389   assert(arg_1 != c_rarg2, "smashed arg");
1390   pass_arg2(this, arg_2);
1391   assert(arg_0 != c_rarg1, "smashed arg");
1392   pass_arg1(this, arg_1);
1393   pass_arg0(this, arg_0);
1394   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1395 }
1396 
1397 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1398   assert(arg_0 != c_rarg3, "smashed arg");
1399   assert(arg_1 != c_rarg3, "smashed arg");
1400   assert(arg_2 != c_rarg3, "smashed arg");
1401   pass_arg3(this, arg_3);
1402   assert(arg_0 != c_rarg2, "smashed arg");
1403   assert(arg_1 != c_rarg2, "smashed arg");
1404   pass_arg2(this, arg_2);
1405   assert(arg_0 != c_rarg1, "smashed arg");
1406   pass_arg1(this, arg_1);
1407   pass_arg0(this, arg_0);
1408   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1409 }
1410 
1411 void MacroAssembler::null_check(Register reg, int offset) {
1412   if (needs_explicit_null_check(offset)) {
    // provoke an OS NULL exception if reg == NULL by
    // accessing M[reg] without changing any registers
    // NOTE: this is plenty to provoke a SEGV
1416     ldr(zr, Address(reg));
1417   } else {
    // nothing to do; a (later) access of M[reg + offset]
    // will provoke an OS NULL exception if reg == NULL
1420   }
1421 }
1422 
1423 // MacroAssembler protected routines needed to implement
1424 // public methods
1425 
1426 void MacroAssembler::mov(Register r, Address dest) {
1427   code_section()->relocate(pc(), dest.rspec());
1428   u_int64_t imm64 = (u_int64_t)dest.target();
1429   movptr(r, imm64);
1430 }
1431 
1432 // Move a constant pointer into r.  In AArch64 mode the virtual
1433 // address space is 48 bits in size, so we only need three
1434 // instructions to create a patchable instruction sequence that can
1435 // reach anywhere.
1436 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1437 #ifndef PRODUCT
1438   {
1439     char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1441     block_comment(buffer);
1442   }
1443 #endif
1444   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1445   movz(r, imm64 & 0xffff);
1446   imm64 >>= 16;
1447   movk(r, imm64 & 0xffff, 16);
1448   imm64 >>= 16;
1449   movk(r, imm64 & 0xffff, 32);
1450 }
1451 
// Move a replicated immediate into a vector register.
1453 //  Vd will get the following values for different arrangements in T
1454 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1455 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1456 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1457 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1458 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1459 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1460 //   T1D/T2D: invalid
1461 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1462   assert(T != T1D && T != T2D, "invalid arrangement");
1463   if (T == T8B || T == T16B) {
1464     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1465     movi(Vd, T, imm32 & 0xff, 0);
1466     return;
1467   }
1468   u_int32_t nimm32 = ~imm32;
1469   if (T == T4H || T == T8H) {
1470     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1471     imm32 &= 0xffff;
1472     nimm32 &= 0xffff;
1473   }
1474   u_int32_t x = imm32;
1475   int movi_cnt = 0;
1476   int movn_cnt = 0;
1477   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1478   x = nimm32;
1479   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1480   if (movn_cnt < movi_cnt) imm32 = nimm32;
1481   unsigned lsl = 0;
1482   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1483   if (movn_cnt < movi_cnt)
1484     mvni(Vd, T, imm32 & 0xff, lsl);
1485   else
1486     movi(Vd, T, imm32 & 0xff, lsl);
1487   imm32 >>= 8; lsl += 8;
1488   while (imm32) {
1489     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1490     if (movn_cnt < movi_cnt)
1491       bici(Vd, T, imm32 & 0xff, lsl);
1492     else
1493       orri(Vd, T, imm32 & 0xff, lsl);
1494     lsl += 8; imm32 >>= 8;
1495   }
1496 }
1497 
1498 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1499 {
1500 #ifndef PRODUCT
1501   {
1502     char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1504     block_comment(buffer);
1505   }
1506 #endif
1507   if (operand_valid_for_logical_immediate(false, imm64)) {
1508     orr(dst, zr, imm64);
1509   } else {
1510     // we can use a combination of MOVZ or MOVN with
1511     // MOVK to build up the constant
1512     u_int64_t imm_h[4];
1513     int zero_count = 0;
1514     int neg_count = 0;
1515     int i;
1516     for (i = 0; i < 4; i++) {
1517       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1518       if (imm_h[i] == 0) {
1519         zero_count++;
1520       } else if (imm_h[i] == 0xffffL) {
1521         neg_count++;
1522       }
1523     }
1524     if (zero_count == 4) {
1525       // one MOVZ will do
1526       movz(dst, 0);
1527     } else if (neg_count == 4) {
1528       // one MOVN will do
1529       movn(dst, 0);
1530     } else if (zero_count == 3) {
1531       for (i = 0; i < 4; i++) {
1532         if (imm_h[i] != 0L) {
1533           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1534           break;
1535         }
1536       }
1537     } else if (neg_count == 3) {
1538       // one MOVN will do
1539       for (int i = 0; i < 4; i++) {
1540         if (imm_h[i] != 0xffffL) {
1541           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1542           break;
1543         }
1544       }
1545     } else if (zero_count == 2) {
1546       // one MOVZ and one MOVK will do
1547       for (i = 0; i < 3; i++) {
1548         if (imm_h[i] != 0L) {
1549           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1550           i++;
1551           break;
1552         }
1553       }
1554       for (;i < 4; i++) {
1555         if (imm_h[i] != 0L) {
1556           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1557         }
1558       }
1559     } else if (neg_count == 2) {
1560       // one MOVN and one MOVK will do
1561       for (i = 0; i < 4; i++) {
1562         if (imm_h[i] != 0xffffL) {
1563           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1564           i++;
1565           break;
1566         }
1567       }
1568       for (;i < 4; i++) {
1569         if (imm_h[i] != 0xffffL) {
1570           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1571         }
1572       }
1573     } else if (zero_count == 1) {
1574       // one MOVZ and two MOVKs will do
1575       for (i = 0; i < 4; i++) {
1576         if (imm_h[i] != 0L) {
1577           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1578           i++;
1579           break;
1580         }
1581       }
1582       for (;i < 4; i++) {
1583         if (imm_h[i] != 0x0L) {
1584           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1585         }
1586       }
1587     } else if (neg_count == 1) {
1588       // one MOVN and two MOVKs will do
1589       for (i = 0; i < 4; i++) {
1590         if (imm_h[i] != 0xffffL) {
1591           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1592           i++;
1593           break;
1594         }
1595       }
1596       for (;i < 4; i++) {
1597         if (imm_h[i] != 0xffffL) {
1598           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1599         }
1600       }
1601     } else {
1602       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1603       movz(dst, (u_int32_t)imm_h[0], 0);
1604       for (i = 1; i < 4; i++) {
1605         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1606       }
1607     }
1608   }
1609 }
1610 
1611 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1612 {
1613 #ifndef PRODUCT
1614     {
1615       char buffer[64];
1616       snprintf(buffer, sizeof(buffer), "0x%"PRIX32, imm32);
1617       block_comment(buffer);
1618     }
1619 #endif
1620   if (operand_valid_for_logical_immediate(true, imm32)) {
1621     orrw(dst, zr, imm32);
1622   } else {
1623     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1624     // constant
1625     u_int32_t imm_h[2];
1626     imm_h[0] = imm32 & 0xffff;
1627     imm_h[1] = ((imm32 >> 16) & 0xffff);
1628     if (imm_h[0] == 0) {
1629       movzw(dst, imm_h[1], 16);
1630     } else if (imm_h[0] == 0xffff) {
1631       movnw(dst, imm_h[1] ^ 0xffff, 16);
1632     } else if (imm_h[1] == 0) {
1633       movzw(dst, imm_h[0], 0);
1634     } else if (imm_h[1] == 0xffff) {
1635       movnw(dst, imm_h[0] ^ 0xffff, 0);
1636     } else {
1637       // use a MOVZ and MOVK (makes it easier to debug)
1638       movzw(dst, imm_h[0], 0);
1639       movkw(dst, imm_h[1], 16);
1640     }
1641   }
1642 }
1643 
1644 // Form an address from base + offset in Rd.  Rd may or may
1645 // not actually be used: you must use the Address that is returned.
1646 // It is up to you to ensure that the shift provided matches the size
1647 // of your data.
1648 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1649   if (Address::offset_ok_for_immed(byte_offset, shift))
1650     // It fits; no need for any heroics
1651     return Address(base, byte_offset);
1652 
1653   // Don't do anything clever with negative or misaligned offsets
1654   unsigned mask = (1 << shift) - 1;
1655   if (byte_offset < 0 || byte_offset & mask) {
1656     mov(Rd, byte_offset);
1657     add(Rd, base, Rd);
1658     return Address(Rd);
1659   }
1660 
1661   // See if we can do this with two 12-bit offsets
1662   {
1663     unsigned long word_offset = byte_offset >> shift;
1664     unsigned long masked_offset = word_offset & 0xfff000;
1665     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1666         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1667       add(Rd, base, masked_offset << shift);
1668       word_offset -= masked_offset;
1669       return Address(Rd, word_offset << shift);
1670     }
1671   }
1672 
1673   // Do it the hard way
1674   mov(Rd, byte_offset);
1675   add(Rd, base, Rd);
1676   return Address(Rd);
1677 }
1678 
1679 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1680   if (UseLSE) {
1681     mov(tmp, 1);
1682     ldadd(Assembler::word, tmp, zr, counter_addr);
1683     return;
1684   }
1685   Label retry_load;
1686   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1687     prfm(Address(counter_addr), PSTL1STRM);
1688   bind(retry_load);
1689   // flush and load exclusive from the memory location
1690   ldxrw(tmp, counter_addr);
1691   addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp will be zero
1693   stxrw(tmp2, tmp, counter_addr);
1694   cbnzw(tmp2, retry_load);
1695 }
1696 
1697 
1698 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1699                                     bool want_remainder, Register scratch)
1700 {
1701   // Full implementation of Java idiv and irem.  The function
1702   // returns the (pc) offset of the div instruction - may be needed
1703   // for implicit exceptions.
1704   //
1705   // constraint : ra/rb =/= scratch
1706   //         normal case
1707   //
1708   // input : ra: dividend
1709   //         rb: divisor
1710   //
1711   // result: either
1712   //         quotient  (= ra idiv rb)
1713   //         remainder (= ra irem rb)
1714 
1715   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1716 
1717   int idivl_offset = offset();
1718   if (! want_remainder) {
1719     sdivw(result, ra, rb);
1720   } else {
1721     sdivw(scratch, ra, rb);
1722     Assembler::msubw(result, scratch, rb, ra);
1723   }
1724 
1725   return idivl_offset;
1726 }
1727 
1728 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1729                                     bool want_remainder, Register scratch)
1730 {
1731   // Full implementation of Java ldiv and lrem.  The function
1732   // returns the (pc) offset of the div instruction - may be needed
1733   // for implicit exceptions.
1734   //
1735   // constraint : ra/rb =/= scratch
1736   //         normal case
1737   //
1738   // input : ra: dividend
1739   //         rb: divisor
1740   //
1741   // result: either
1742   //         quotient  (= ra idiv rb)
1743   //         remainder (= ra irem rb)
1744 
1745   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1746 
1747   int idivq_offset = offset();
1748   if (! want_remainder) {
1749     sdiv(result, ra, rb);
1750   } else {
1751     sdiv(scratch, ra, rb);
1752     Assembler::msub(result, scratch, rb, ra);
1753   }
1754 
1755   return idivq_offset;
1756 }
1757 
1758 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1759   address prev = pc() - NativeMembar::instruction_size;
1760   if (prev == code()->last_membar()) {
1761     NativeMembar *bar = NativeMembar_at(prev);
1762     // We are merging two memory barrier instructions.  On AArch64 we
1763     // can do this simply by ORing them together.
1764     bar->set_kind(bar->get_kind() | order_constraint);
1765     BLOCK_COMMENT("merged membar");
1766   } else {
1767     code()->set_last_membar(pc());
1768     dmb(Assembler::barrier(order_constraint));
1769   }
1770 }
1771 
1772 // MacroAssembler routines found actually to be needed
1773 
1774 void MacroAssembler::push(Register src)
1775 {
1776   str(src, Address(pre(esp, -1 * wordSize)));
1777 }
1778 
1779 void MacroAssembler::pop(Register dst)
1780 {
1781   ldr(dst, Address(post(esp, 1 * wordSize)));
1782 }
1783 
1784 // Note: load_unsigned_short used to be called load_unsigned_word.
1785 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1786   int off = offset();
1787   ldrh(dst, src);
1788   return off;
1789 }
1790 
1791 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1792   int off = offset();
1793   ldrb(dst, src);
1794   return off;
1795 }
1796 
1797 int MacroAssembler::load_signed_short(Register dst, Address src) {
1798   int off = offset();
1799   ldrsh(dst, src);
1800   return off;
1801 }
1802 
1803 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1804   int off = offset();
1805   ldrsb(dst, src);
1806   return off;
1807 }
1808 
1809 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1810   int off = offset();
1811   ldrshw(dst, src);
1812   return off;
1813 }
1814 
1815 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1816   int off = offset();
1817   ldrsbw(dst, src);
1818   return off;
1819 }
1820 
1821 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1822   switch (size_in_bytes) {
1823   case  8:  ldr(dst, src); break;
1824   case  4:  ldrw(dst, src); break;
1825   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1826   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1827   default:  ShouldNotReachHere();
1828   }
1829 }
1830 
1831 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1832   switch (size_in_bytes) {
1833   case  8:  str(src, dst); break;
1834   case  4:  strw(src, dst); break;
1835   case  2:  strh(src, dst); break;
1836   case  1:  strb(src, dst); break;
1837   default:  ShouldNotReachHere();
1838   }
1839 }
1840 
1841 void MacroAssembler::decrementw(Register reg, int value)
1842 {
1843   if (value < 0)  { incrementw(reg, -value);      return; }
1844   if (value == 0) {                               return; }
1845   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1846   /* else */ {
1847     guarantee(reg != rscratch2, "invalid dst for register decrement");
1848     movw(rscratch2, (unsigned)value);
1849     subw(reg, reg, rscratch2);
1850   }
1851 }
1852 
1853 void MacroAssembler::decrement(Register reg, int value)
1854 {
1855   if (value < 0)  { increment(reg, -value);      return; }
1856   if (value == 0) {                              return; }
1857   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1858   /* else */ {
1859     assert(reg != rscratch2, "invalid dst for register decrement");
1860     mov(rscratch2, (unsigned long)value);
1861     sub(reg, reg, rscratch2);
1862   }
1863 }
1864 
1865 void MacroAssembler::decrementw(Address dst, int value)
1866 {
1867   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1868   ldrw(rscratch1, dst);
1869   decrementw(rscratch1, value);
1870   strw(rscratch1, dst);
1871 }
1872 
1873 void MacroAssembler::decrement(Address dst, int value)
1874 {
1875   assert(!dst.uses(rscratch1), "invalid address for decrement");
1876   ldr(rscratch1, dst);
1877   decrement(rscratch1, value);
1878   str(rscratch1, dst);
1879 }
1880 
1881 void MacroAssembler::incrementw(Register reg, int value)
1882 {
1883   if (value < 0)  { decrementw(reg, -value);      return; }
1884   if (value == 0) {                               return; }
1885   if (value < (1 << 12)) { addw(reg, reg, value); return; }
1886   /* else */ {
1887     assert(reg != rscratch2, "invalid dst for register increment");
1888     movw(rscratch2, (unsigned)value);
1889     addw(reg, reg, rscratch2);
1890   }
1891 }
1892 
1893 void MacroAssembler::increment(Register reg, int value)
1894 {
1895   if (value < 0)  { decrement(reg, -value);      return; }
1896   if (value == 0) {                              return; }
1897   if (value < (1 << 12)) { add(reg, reg, value); return; }
1898   /* else */ {
1899     assert(reg != rscratch2, "invalid dst for register increment");
1900     movw(rscratch2, (unsigned)value);
1901     add(reg, reg, rscratch2);
1902   }
1903 }
1904 
1905 void MacroAssembler::incrementw(Address dst, int value)
1906 {
1907   assert(!dst.uses(rscratch1), "invalid dst for address increment");
1908   ldrw(rscratch1, dst);
1909   incrementw(rscratch1, value);
1910   strw(rscratch1, dst);
1911 }
1912 
1913 void MacroAssembler::increment(Address dst, int value)
1914 {
1915   assert(!dst.uses(rscratch1), "invalid dst for address increment");
1916   ldr(rscratch1, dst);
1917   increment(rscratch1, value);
1918   str(rscratch1, dst);
1919 }
1920 
1921 
1922 void MacroAssembler::pusha() {
1923   push(0x7fffffff, sp);
1924 }
1925 
1926 void MacroAssembler::popa() {
1927   pop(0x7fffffff, sp);
1928 }
1929 
1930 // Push lots of registers in the bit set supplied.  Don't push sp.
1931 // Return the number of words pushed
1932 int MacroAssembler::push(unsigned int bitset, Register stack) {
1933   int words_pushed = 0;
1934 
1935   // Scan bitset to accumulate register pairs
1936   unsigned char regs[32];
1937   int count = 0;
1938   for (int reg = 0; reg <= 30; reg++) {
1939     if (1 & bitset)
1940       regs[count++] = reg;
1941     bitset >>= 1;
1942   }
1943   regs[count++] = zr->encoding_nocheck();
  count &= ~1;  // Only push an even number of regs
1945 
1946   if (count) {
1947     stp(as_Register(regs[0]), as_Register(regs[1]),
1948        Address(pre(stack, -count * wordSize)));
1949     words_pushed += 2;
1950   }
1951   for (int i = 2; i < count; i += 2) {
1952     stp(as_Register(regs[i]), as_Register(regs[i+1]),
1953        Address(stack, i * wordSize));
1954     words_pushed += 2;
1955   }
1956 
1957   assert(words_pushed == count, "oops, pushed != count");
1958 
1959   return count;
1960 }
1961 
1962 int MacroAssembler::pop(unsigned int bitset, Register stack) {
1963   int words_pushed = 0;
1964 
1965   // Scan bitset to accumulate register pairs
1966   unsigned char regs[32];
1967   int count = 0;
1968   for (int reg = 0; reg <= 30; reg++) {
1969     if (1 & bitset)
1970       regs[count++] = reg;
1971     bitset >>= 1;
1972   }
1973   regs[count++] = zr->encoding_nocheck();
1974   count &= ~1;
1975 
1976   for (int i = 2; i < count; i += 2) {
1977     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
1978        Address(stack, i * wordSize));
1979     words_pushed += 2;
1980   }
1981   if (count) {
1982     ldp(as_Register(regs[0]), as_Register(regs[1]),
1983        Address(post(stack, count * wordSize)));
1984     words_pushed += 2;
1985   }
1986 
1987   assert(words_pushed == count, "oops, pushed != count");
1988 
1989   return count;
1990 }
1991 #ifdef ASSERT
1992 void MacroAssembler::verify_heapbase(const char* msg) {
1993 #if 0
1994   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
1995   assert (Universe::heap() != NULL, "java heap should be initialized");
1996   if (CheckCompressedOops) {
1997     Label ok;
1998     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
1999     cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2000     br(Assembler::EQ, ok);
2001     stop(msg);
2002     bind(ok);
2003     pop(1 << rscratch1->encoding(), sp);
2004   }
2005 #endif
2006 }
2007 #endif
2008 
2009 void MacroAssembler::stop(const char* msg) {
2010   address ip = pc();
2011   pusha();
2012   mov(c_rarg0, (address)msg);
2013   mov(c_rarg1, (address)ip);
2014   mov(c_rarg2, sp);
2015   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2016   // call(c_rarg3);
2017   blrt(c_rarg3, 3, 0, 1);
2018   hlt(0);
2019 }
2020 
2021 // If a constant does not fit in an immediate field, generate some
2022 // number of MOV instructions and then perform the operation.
2023 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2024                                            add_sub_imm_insn insn1,
2025                                            add_sub_reg_insn insn2) {
2026   assert(Rd != zr, "Rd = zr and not setting flags?");
2027   if (operand_valid_for_add_sub_immediate((int)imm)) {
2028     (this->*insn1)(Rd, Rn, imm);
  } else {
    if (uabs(imm) < (1 << 24)) {
      (this->*insn1)(Rd, Rn, imm & -(1 << 12));
      (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
    } else {
      assert_different_registers(Rd, Rn);
      mov(Rd, (uint64_t)imm);
      (this->*insn2)(Rd, Rn, Rd, LSL, 0);
    }
  }
2039 }
2040 
// Separate version which sets the flags. Optimisations are more restricted
// because we must set the flags correctly.
2043 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2044                                            add_sub_imm_insn insn1,
2045                                            add_sub_reg_insn insn2) {
2046   if (operand_valid_for_add_sub_immediate((int)imm)) {
2047     (this->*insn1)(Rd, Rn, imm);
2048   } else {
2049     assert_different_registers(Rd, Rn);
2050     assert(Rd != zr, "overflow in immediate operand");
2051     mov(Rd, (uint64_t)imm);
2052     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2053   }
2054 }
2055 
2056 
2057 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2058   if (increment.is_register()) {
2059     add(Rd, Rn, increment.as_register());
2060   } else {
2061     add(Rd, Rn, increment.as_constant());
2062   }
2063 }
2064 
2065 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2066   if (increment.is_register()) {
2067     addw(Rd, Rn, increment.as_register());
2068   } else {
2069     addw(Rd, Rn, increment.as_constant());
2070   }
2071 }
2072 
2073 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2074   if (decrement.is_register()) {
2075     sub(Rd, Rn, decrement.as_register());
2076   } else {
2077     sub(Rd, Rn, decrement.as_constant());
2078   }
2079 }
2080 
2081 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2082   if (decrement.is_register()) {
2083     subw(Rd, Rn, decrement.as_register());
2084   } else {
2085     subw(Rd, Rn, decrement.as_constant());
2086   }
2087 }
2088 
2089 void MacroAssembler::reinit_heapbase()
2090 {
2091   if (UseCompressedOops) {
2092     if (Universe::is_fully_initialized()) {
2093       mov(rheapbase, Universe::narrow_ptrs_base());
2094     } else {
2095       lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2096       ldr(rheapbase, Address(rheapbase));
2097     }
2098   }
2099 }
2100 
2101 // this simulates the behaviour of the x86 cmpxchg instruction using a
2102 // load linked/store conditional pair. we use the acquire/release
2103 // versions of these instructions so that we flush pending writes as
2104 // per Java semantics.
2105 
// n.b. the x86 version assumes the old value to be compared against is
2107 // in rax and updates rax with the value located in memory if the
2108 // cmpxchg fails. we supply a register for the old value explicitly
2109 
2110 // the aarch64 load linked/store conditional instructions do not
2111 // accept an offset. so, unlike x86, we must provide a plain register
2112 // to identify the memory word to be compared/exchanged rather than a
2113 // register+offset Address.
2114 
2115 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2116                                 Label &succeed, Label *fail) {
2117   // oldv holds comparison value
2118   // newv holds value to write in exchange
2119   // addr identifies memory word to compare against/update
2120   if (UseLSE) {
2121     mov(tmp, oldv);
2122     casal(Assembler::xword, oldv, newv, addr);
2123     cmp(tmp, oldv);
2124     br(Assembler::EQ, succeed);
2125     membar(AnyAny);
2126   } else {
2127     Label retry_load, nope;
2128     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2129       prfm(Address(addr), PSTL1STRM);
2130     bind(retry_load);
2131     // flush and load exclusive from the memory location
2132     // and fail if it is not what we expect
2133     ldaxr(tmp, addr);
2134     cmp(tmp, oldv);
2135     br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
2137     stlxr(tmp, newv, addr);
2138     cbzw(tmp, succeed);
2139     // retry so we only ever return after a load fails to compare
2140     // ensures we don't return a stale value after a failed write.
2141     b(retry_load);
2142     // if the memory word differs we return it in oldv and signal a fail
2143     bind(nope);
2144     membar(AnyAny);
2145     mov(oldv, tmp);
2146   }
2147   if (fail)
2148     b(*fail);
2149 }
2150 
2151 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2152                                 Label &succeed, Label *fail) {
2153   // oldv holds comparison value
2154   // newv holds value to write in exchange
2155   // addr identifies memory word to compare against/update
2156   // tmp returns 0/1 for success/failure
2157   if (UseLSE) {
2158     mov(tmp, oldv);
2159     casal(Assembler::word, oldv, newv, addr);
2160     cmp(tmp, oldv);
2161     br(Assembler::EQ, succeed);
2162     membar(AnyAny);
2163   } else {
2164     Label retry_load, nope;
2165     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2166       prfm(Address(addr), PSTL1STRM);
2167     bind(retry_load);
2168     // flush and load exclusive from the memory location
2169     // and fail if it is not what we expect
2170     ldaxrw(tmp, addr);
2171     cmp(tmp, oldv);
2172     br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
2174     stlxrw(tmp, newv, addr);
2175     cbzw(tmp, succeed);
2176     // retry so we only ever return after a load fails to compare
2177     // ensures we don't return a stale value after a failed write.
2178     b(retry_load);
2179     // if the memory word differs we return it in oldv and signal a fail
2180     bind(nope);
2181     membar(AnyAny);
2182     mov(oldv, tmp);
2183   }
2184   if (fail)
2185     b(*fail);
2186 }
2187 
// A generic CAS; success or failure is in the EQ flag.  A weak CAS
// doesn't retry and may fail spuriously.  If the old value is wanted,
// pass a register for the result; otherwise pass noreg.
2191 
2192 // Clobbers rscratch1
2193 void MacroAssembler::cmpxchg(Register addr, Register expected,
2194                              Register new_val,
2195                              enum operand_size size,
2196                              bool acquire, bool release,
2197                              bool weak,
2198                              Register result) {
2199   if (result == noreg)  result = rscratch1;
2200   if (UseLSE) {
2201     mov(result, expected);
2202     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2203     cmp(result, expected);
2204   } else {
2205     BLOCK_COMMENT("cmpxchg {");
2206     Label retry_load, done;
2207     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2208       prfm(Address(addr), PSTL1STRM);
2209     bind(retry_load);
2210     load_exclusive(result, addr, size, acquire);
2211     if (size == xword)
2212       cmp(result, expected);
2213     else
2214       cmpw(result, expected);
2215     br(Assembler::NE, done);
2216     store_exclusive(rscratch1, new_val, addr, size, release);
2217     if (weak) {
2218       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2219     } else {
2220       cbnzw(rscratch1, retry_load);
2221     }
2222     bind(done);
2223     BLOCK_COMMENT("} cmpxchg");
2224   }
2225 }
2226 
2227 void MacroAssembler::cmpxchg_oop_shenandoah(Register addr, Register expected,
2228                                             Register new_val,
2229                                             enum operand_size size,
2230                                             bool acquire, bool release,
2231                                             bool weak,
2232                                             Register result, Register tmp2) {
2233   assert(UseShenandoahGC, "only for shenandoah");
2234   bool is_cae = (result != noreg);
2235   bool is_narrow = (size == word);
2236 
2237   if (! is_cae) result = rscratch1;
2238 
2239   assert_different_registers(addr, expected, new_val, result, tmp2);
2240 
2241   if (ShenandoahStoreCheck) {
2242     if (is_narrow) {
2243       decode_heap_oop(tmp2, new_val);
2244       shenandoah_store_check(addr, tmp2);
2245     } else {
2246       shenandoah_store_check(addr, new_val);
2247     }
2248   }
2249   Label retry, done, fail;
2250 
2251   // CAS, using LL/SC pair.
2252   bind(retry);
2253   load_exclusive(result, addr, size, acquire);
2254   if (is_narrow) {
2255     cmpw(result, expected);
2256   } else {
2257     cmp(result, expected);
2258   }
2259   br(Assembler::NE, fail);
2260   store_exclusive(tmp2, new_val, addr, size, release);
2261   if (weak) {
2262     cmpw(tmp2, 0u); // If the store fails, return NE to our caller
2263   } else {
2264     cbnzw(tmp2, retry);
2265   }
2266   b(done);
2267 
2268   bind(fail);
2269   // Check if rb(expected)==rb(result)
2270   // Shuffle registers so that we have memory value ready for next expected.
2271   mov(tmp2, expected);
2272   mov(expected, result);
2273   if (is_narrow) {
2274     decode_heap_oop(result, result);
2275     decode_heap_oop(tmp2, tmp2);
2276   }
2277   oopDesc::bs()->interpreter_read_barrier(this, result);
2278   oopDesc::bs()->interpreter_read_barrier(this, tmp2);
2279   cmp(result, tmp2);
2280   // Retry with expected now being the value we just loaded from addr.
2281   br(Assembler::EQ, retry);
2282   if (is_narrow && is_cae) {
2283     // For cmp-and-exchange and narrow oops, we need to restore
2284     // the compressed old-value. We moved it to 'expected' a few lines up.
2285     mov(result, expected);
2286   }
2287   bind(done);
2288 }
2289 
2290 static bool different(Register a, RegisterOrConstant b, Register c) {
2291   if (b.is_constant())
2292     return a != c;
2293   else
2294     return a != b.as_register() && a != c && b.as_register() != c;
2295 }
2296 
2297 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2298 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2299   if (UseLSE) {                                                         \
2300     prev = prev->is_valid() ? prev : zr;                                \
2301     if (incr.is_register()) {                                           \
2302       AOP(sz, incr.as_register(), prev, addr);                          \
2303     } else {                                                            \
2304       mov(rscratch2, incr.as_constant());                               \
2305       AOP(sz, rscratch2, prev, addr);                                   \
2306     }                                                                   \
2307     return;                                                             \
2308   }                                                                     \
2309   Register result = rscratch2;                                          \
2310   if (prev->is_valid())                                                 \
2311     result = different(prev, incr, addr) ? prev : rscratch2;            \
2312                                                                         \
2313   Label retry_load;                                                     \
2314   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2315     prfm(Address(addr), PSTL1STRM);                                     \
2316   bind(retry_load);                                                     \
2317   LDXR(result, addr);                                                   \
2318   OP(rscratch1, result, incr);                                          \
2319   STXR(rscratch2, rscratch1, addr);                                     \
2320   cbnzw(rscratch2, retry_load);                                         \
2321   if (prev->is_valid() && prev != result) {                             \
2322     IOP(prev, rscratch1, incr);                                         \
2323   }                                                                     \
2324 }
2325 
2326 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2327 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2328 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2329 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2330 
2331 #undef ATOMIC_OP
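
// Each expansion is a fetch-and-add: e.g. atomic_addal(prev, incr, addr) is
// roughly prev = __atomic_fetch_add(addr, incr, __ATOMIC_ACQ_REL), using
// LDADDAL directly when LSE is available and an LDAXR/STLXR loop otherwise.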
2332 
2333 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2334 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2335   if (UseLSE) {                                                         \
2336     prev = prev->is_valid() ? prev : zr;                                \
2337     AOP(sz, newv, prev, addr);                                          \
2338     return;                                                             \
2339   }                                                                     \
2340   Register result = rscratch2;                                          \
2341   if (prev->is_valid())                                                 \
2342     result = different(prev, newv, addr) ? prev : rscratch2;            \
2343                                                                         \
2344   Label retry_load;                                                     \
2345   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2346     prfm(Address(addr), PSTL1STRM);                                     \
2347   bind(retry_load);                                                     \
2348   LDXR(result, addr);                                                   \
2349   STXR(rscratch1, newv, addr);                                          \
2350   cbnzw(rscratch1, retry_load);                                         \
2351   if (prev->is_valid() && prev != result)                               \
2352     mov(prev, result);                                                  \
2353 }
2354 
2355 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2356 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2357 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2358 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2359 
2360 #undef ATOMIC_XCHG
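
// Likewise each xchg variant is an atomic swap, roughly
// prev = __atomic_exchange_n(addr, newv, order): SWP/SWPAL under LSE, an
// exclusive load/store loop otherwise.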
2361 
2362 void MacroAssembler::incr_allocated_bytes(Register thread,
2363                                           Register var_size_in_bytes,
2364                                           int con_size_in_bytes,
2365                                           Register t1) {
2366   if (!thread->is_valid()) {
2367     thread = rthread;
2368   }
2369   assert(t1->is_valid(), "need temp reg");
2370 
2371   ldr(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
2372   if (var_size_in_bytes->is_valid()) {
2373     add(t1, t1, var_size_in_bytes);
2374   } else {
2375     add(t1, t1, con_size_in_bytes);
2376   }
2377   str(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
2378 }
2379 
2380 #ifndef PRODUCT
2381 extern "C" void findpc(intptr_t x);
2382 #endif
2383 
2384 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2385 {
  // In order to get locks to work, we need to fake an in_VM state
  if (ShowMessageBoxOnError) {
2388     JavaThread* thread = JavaThread::current();
2389     JavaThreadState saved_state = thread->thread_state();
2390     thread->set_thread_state(_thread_in_vm);
2391 #ifndef PRODUCT
2392     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2393       ttyLocker ttyl;
2394       BytecodeCounter::print();
2395     }
2396 #endif
2397     if (os::message_box(msg, "Execution stopped, print registers?")) {
2398       ttyLocker ttyl;
2399       tty->print_cr(" pc = 0x%016lx", pc);
2400 #ifndef PRODUCT
2401       tty->cr();
2402       findpc(pc);
2403       tty->cr();
2404 #endif
2405       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2406       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2407       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2408       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2409       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2410       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2411       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2412       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2413       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2414       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2415       tty->print_cr("r10 = 0x%016lx", regs[10]);
2416       tty->print_cr("r11 = 0x%016lx", regs[11]);
2417       tty->print_cr("r12 = 0x%016lx", regs[12]);
2418       tty->print_cr("r13 = 0x%016lx", regs[13]);
2419       tty->print_cr("r14 = 0x%016lx", regs[14]);
2420       tty->print_cr("r15 = 0x%016lx", regs[15]);
2421       tty->print_cr("r16 = 0x%016lx", regs[16]);
2422       tty->print_cr("r17 = 0x%016lx", regs[17]);
2423       tty->print_cr("r18 = 0x%016lx", regs[18]);
2424       tty->print_cr("r19 = 0x%016lx", regs[19]);
2425       tty->print_cr("r20 = 0x%016lx", regs[20]);
2426       tty->print_cr("r21 = 0x%016lx", regs[21]);
2427       tty->print_cr("r22 = 0x%016lx", regs[22]);
2428       tty->print_cr("r23 = 0x%016lx", regs[23]);
2429       tty->print_cr("r24 = 0x%016lx", regs[24]);
2430       tty->print_cr("r25 = 0x%016lx", regs[25]);
2431       tty->print_cr("r26 = 0x%016lx", regs[26]);
2432       tty->print_cr("r27 = 0x%016lx", regs[27]);
2433       tty->print_cr("r28 = 0x%016lx", regs[28]);
2434       tty->print_cr("r30 = 0x%016lx", regs[30]);
2435       tty->print_cr("r31 = 0x%016lx", regs[31]);
2436       BREAKPOINT;
2437     }
2438     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2439   } else {
2440     ttyLocker ttyl;
2441     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2442                     msg);
2443     assert(false, "DEBUG MESSAGE: %s", msg);
2444   }
2445 }
2446 
2447 #ifdef BUILTIN_SIM
2448 // routine to generate an x86 prolog for a stub function which
2449 // bootstraps into the generated ARM code which directly follows the
2450 // stub
2451 //
// the argument encodes the number of general and fp registers
// passed by the caller and the calling convention (currently just
// the number of general registers and assumes C argument passing)
2455 
2456 extern "C" {
2457 int aarch64_stub_prolog_size();
2458 void aarch64_stub_prolog();
2459 void aarch64_prolog();
2460 }
2461 
2462 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
2463                                    address *prolog_ptr)
2464 {
2465   int calltype = (((ret_type & 0x3) << 8) |
2466                   ((fp_arg_count & 0xf) << 4) |
2467                   (gp_arg_count & 0xf));
2468 
2469   // the addresses for the x86 to ARM entry code we need to use
2470   address start = pc();
2471   // printf("start = %lx\n", start);
  int byteCount = aarch64_stub_prolog_size();
  // printf("byteCount = %x\n", byteCount);
  int instructionCount = (byteCount + 3) / 4;
2475   // printf("instructionCount = %x\n", instructionCount);
2476   for (int i = 0; i < instructionCount; i++) {
2477     nop();
2478   }
2479 
2480   memcpy(start, (void*)aarch64_stub_prolog, byteCount);
2481 
  // write the address of the setup routine and the call format at the
  // end of the copied code
2484   u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
2485   if (prolog_ptr)
2486     patch_end[-2] = (u_int64_t)prolog_ptr;
2487   patch_end[-1] = calltype;
2488 }
2489 #endif
2490 
2491 void MacroAssembler::push_call_clobbered_fp_registers() {
2492   // Push v0-v7, v16-v31.
2493   for (int i = 30; i >= 0; i -= 2) {
2494     if (i <= v7->encoding() || i >= v16->encoding()) {
2495         stpd(as_FloatRegister(i), as_FloatRegister(i+1),
2496              Address(pre(sp, -2 * wordSize)));
2497     }
2498   }
2499 }
2500 
2501 void MacroAssembler::pop_call_clobbered_fp_registers() {
2503   for (int i = 0; i < 32; i += 2) {
2504     if (i <= v7->encoding() || i >= v16->encoding()) {
2505       ldpd(as_FloatRegister(i), as_FloatRegister(i+1),
2506            Address(post(sp, 2 * wordSize)));
2507     }
2508   }
2509 }
2510 
2511 void MacroAssembler::push_call_clobbered_registers() {
2512   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2513 
2514   push_call_clobbered_fp_registers();
2515 }
2516 
2517 void MacroAssembler::pop_call_clobbered_registers() {
2518   pop_call_clobbered_fp_registers();
2519 
2520   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2521 }
2522 
2523 void MacroAssembler::push_CPU_state(bool save_vectors) {
2524   push(0x3fffffff, sp);         // integer registers except lr & sp
2525 
2526   if (!save_vectors) {
2527     for (int i = 30; i >= 0; i -= 2)
2528       stpd(as_FloatRegister(i), as_FloatRegister(i+1),
2529            Address(pre(sp, -2 * wordSize)));
2530   } else {
2531     for (int i = 30; i >= 0; i -= 2)
2532       stpq(as_FloatRegister(i), as_FloatRegister(i+1),
2533            Address(pre(sp, -4 * wordSize)));
2534   }
2535 }
2536 
2537 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2538   if (!restore_vectors) {
2539     for (int i = 0; i < 32; i += 2)
2540       ldpd(as_FloatRegister(i), as_FloatRegister(i+1),
2541            Address(post(sp, 2 * wordSize)));
2542   } else {
2543     for (int i = 0; i < 32; i += 2)
2544       ldpq(as_FloatRegister(i), as_FloatRegister(i+1),
2545            Address(post(sp, 4 * wordSize)));
2546   }
2547 
2548   pop(0x3fffffff, sp);         // integer registers except lr & sp
2549 }
2550 
2551 /**
2552  * Helpers for multiply_to_len().
2553  */
2554 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2555                                      Register src1, Register src2) {
2556   adds(dest_lo, dest_lo, src1);
2557   adc(dest_hi, dest_hi, zr);
2558   adds(dest_lo, dest_lo, src2);
2559   adc(final_dest_hi, dest_hi, zr);
2560 }
2561 
2562 // Generate an address from (r + r1 extend offset).  "size" is the
2563 // size of the operand.  The result may be in rscratch2.
2564 Address MacroAssembler::offsetted_address(Register r, Register r1,
2565                                           Address::extend ext, int offset, int size) {
2566   if (offset || (ext.shift() % size != 0)) {
2567     lea(rscratch2, Address(r, r1, ext));
2568     return Address(rscratch2, offset);
2569   } else {
2570     return Address(r, r1, ext);
2571   }
2572 }
2573 
2574 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2575 {
2576   assert(offset >= 0, "spill to negative address?");
  // Is the offset reachable?
  //   Not aligned - 9-bit signed offset
  //   Aligned - 12-bit unsigned offset, shifted
2580   Register base = sp;
2581   if ((offset & (size-1)) && offset >= (1<<8)) {
2582     add(tmp, base, offset & ((1<<12)-1));
2583     base = tmp;
2584     offset &= -1<<12;
2585   }
2586 
2587   if (offset >= (1<<12) * size) {
2588     add(tmp, base, offset & (((1<<12)-1)<<12));
2589     base = tmp;
2590     offset &= ~(((1<<12)-1)<<12);
2591   }
2592 
2593   return Address(base, offset);
2594 }
2595 
2596 /**
2597  * Multiply 64 bit by 64 bit first loop.
2598  */
2599 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2600                                            Register y, Register y_idx, Register z,
2601                                            Register carry, Register product,
2602                                            Register idx, Register kdx) {
2603   //
2604   //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2606   //    huge_128 product = y[idx] * x[xstart] + carry;
2607   //    z[kdx] = (jlong)product;
2608   //    carry  = (jlong)(product >>> 64);
2609   //  }
2610   //  z[xstart] = carry;
2611   //
2612 
2613   Label L_first_loop, L_first_loop_exit;
2614   Label L_one_x, L_one_y, L_multiply;
2615 
2616   subsw(xstart, xstart, 1);
2617   br(Assembler::MI, L_one_x);
2618 
2619   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2620   ldr(x_xstart, Address(rscratch1));
2621   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2622 
2623   bind(L_first_loop);
2624   subsw(idx, idx, 1);
2625   br(Assembler::MI, L_first_loop_exit);
2626   subsw(idx, idx, 1);
2627   br(Assembler::MI, L_one_y);
2628   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2629   ldr(y_idx, Address(rscratch1));
2630   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2631   bind(L_multiply);
2632 
2633   // AArch64 has a multiply-accumulate instruction that we can't use
2634   // here because it has no way to process carries, so we have to use
2635   // separate add and adc instructions.  Bah.
2636   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2637   mul(product, x_xstart, y_idx);
2638   adds(product, product, carry);
2639   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2640 
2641   subw(kdx, kdx, 2);
2642   ror(product, product, 32); // back to big-endian
2643   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2644 
2645   b(L_first_loop);
2646 
2647   bind(L_one_y);
2648   ldrw(y_idx, Address(y,  0));
2649   b(L_multiply);
2650 
2651   bind(L_one_x);
2652   ldrw(x_xstart, Address(x,  0));
2653   b(L_first_loop);
2654 
2655   bind(L_first_loop_exit);
2656 }
2657 
2658 /**
 * Multiply 128 bit by 128 bit. Unrolled inner loop.
2660  *
2661  */
2662 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2663                                              Register carry, Register carry2,
2664                                              Register idx, Register jdx,
2665                                              Register yz_idx1, Register yz_idx2,
2666                                              Register tmp, Register tmp3, Register tmp4,
2667                                              Register tmp6, Register product_hi) {
2668 
2669   //   jlong carry, x[], y[], z[];
2670   //   int kdx = ystart+1;
2671   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2672   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2673   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2674   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2675   //     carry  = (jlong)(tmp4 >>> 64);
2676   //     z[kdx+idx+1] = (jlong)tmp3;
2677   //     z[kdx+idx] = (jlong)tmp4;
2678   //   }
2679   //   idx += 2;
2680   //   if (idx > 0) {
2681   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2682   //     z[kdx+idx] = (jlong)yz_idx1;
2683   //     carry  = (jlong)(yz_idx1 >>> 64);
2684   //   }
2685   //
2686 
2687   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2688 
2689   lsrw(jdx, idx, 2);
2690 
2691   bind(L_third_loop);
2692 
2693   subsw(jdx, jdx, 1);
2694   br(Assembler::MI, L_third_loop_exit);
2695   subw(idx, idx, 4);
2696 
2697   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2698 
2699   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2700 
2701   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2702 
2703   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2704   ror(yz_idx2, yz_idx2, 32);
2705 
2706   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2707 
2708   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2709   umulh(tmp4, product_hi, yz_idx1);
2710 
2711   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2712   ror(rscratch2, rscratch2, 32);
2713 
2714   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2715   umulh(carry2, product_hi, yz_idx2);
2716 
2717   // propagate sum of both multiplications into carry:tmp4:tmp3
2718   adds(tmp3, tmp3, carry);
2719   adc(tmp4, tmp4, zr);
2720   adds(tmp3, tmp3, rscratch1);
2721   adcs(tmp4, tmp4, tmp);
2722   adc(carry, carry2, zr);
2723   adds(tmp4, tmp4, rscratch2);
2724   adc(carry, carry, zr);
2725 
2726   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2727   ror(tmp4, tmp4, 32);
2728   stp(tmp4, tmp3, Address(tmp6, 0));
2729 
2730   b(L_third_loop);
2731   bind (L_third_loop_exit);
2732 
2733   andw (idx, idx, 0x3);
2734   cbz(idx, L_post_third_loop_done);
2735 
2736   Label L_check_1;
2737   subsw(idx, idx, 2);
2738   br(Assembler::MI, L_check_1);
2739 
2740   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2741   ldr(yz_idx1, Address(rscratch1, 0));
2742   ror(yz_idx1, yz_idx1, 32);
2743   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2744   umulh(tmp4, product_hi, yz_idx1);
2745   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2746   ldr(yz_idx2, Address(rscratch1, 0));
2747   ror(yz_idx2, yz_idx2, 32);
2748 
2749   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2750 
2751   ror(tmp3, tmp3, 32);
2752   str(tmp3, Address(rscratch1, 0));
2753 
2754   bind (L_check_1);
2755 
2756   andw (idx, idx, 0x1);
2757   subsw(idx, idx, 1);
2758   br(Assembler::MI, L_post_third_loop_done);
2759   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2760   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
2761   umulh(carry2, tmp4, product_hi);
2762   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2763 
2764   add2_with_carry(carry2, tmp3, tmp4, carry);
2765 
2766   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2767   extr(carry, carry2, tmp3, 32);
2768 
2769   bind(L_post_third_loop_done);
2770 }
2771 
2772 /**
 * Code for BigInteger::multiplyToLen() intrinsic.
2774  *
2775  * r0: x
2776  * r1: xlen
2777  * r2: y
2778  * r3: ylen
 * r4: z
2780  * r5: zlen
2781  * r10: tmp1
2782  * r11: tmp2
2783  * r12: tmp3
2784  * r13: tmp4
2785  * r14: tmp5
2786  * r15: tmp6
2787  * r16: tmp7
2788  *
2789  */
2790 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
2791                                      Register z, Register zlen,
2792                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
2793                                      Register tmp5, Register tmp6, Register product_hi) {
2794 
2795   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2796 
2797   const Register idx = tmp1;
2798   const Register kdx = tmp2;
2799   const Register xstart = tmp3;
2800 
2801   const Register y_idx = tmp4;
2802   const Register carry = tmp5;
2803   const Register product  = xlen;
2804   const Register x_xstart = zlen;  // reuse register
2805 
2806   // First Loop.
2807   //
2808   //  final static long LONG_MASK = 0xffffffffL;
2809   //  int xstart = xlen - 1;
2810   //  int ystart = ylen - 1;
2811   //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2813   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
2814   //    z[kdx] = (int)product;
2815   //    carry = product >>> 32;
2816   //  }
2817   //  z[xstart] = (int)carry;
2818   //
2819 
2820   movw(idx, ylen);      // idx = ylen;
2821   movw(kdx, zlen);      // kdx = xlen+ylen;
2822   mov(carry, zr);       // carry = 0;
2823 
2824   Label L_done;
2825 
2826   movw(xstart, xlen);
2827   subsw(xstart, xstart, 1);
2828   br(Assembler::MI, L_done);
2829 
2830   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
2831 
2832   Label L_second_loop;
2833   cbzw(kdx, L_second_loop);
2834 
2835   Label L_carry;
2836   subw(kdx, kdx, 1);
2837   cbzw(kdx, L_carry);
2838 
2839   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
2840   lsr(carry, carry, 32);
2841   subw(kdx, kdx, 1);
2842 
2843   bind(L_carry);
2844   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
2845 
2846   // Second and third (nested) loops.
2847   //
2848   // for (int i = xstart-1; i >= 0; i--) { // Second loop
2849   //   carry = 0;
2850   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
2851   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
2852   //                    (z[k] & LONG_MASK) + carry;
2853   //     z[k] = (int)product;
2854   //     carry = product >>> 32;
2855   //   }
2856   //   z[i] = (int)carry;
2857   // }
2858   //
2859   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
2860 
2861   const Register jdx = tmp1;
2862 
2863   bind(L_second_loop);
2864   mov(carry, zr);                // carry = 0;
2865   movw(jdx, ylen);               // j = ystart+1
2866 
2867   subsw(xstart, xstart, 1);      // i = xstart-1;
2868   br(Assembler::MI, L_done);
2869 
2870   str(z, Address(pre(sp, -4 * wordSize)));
2871 
2872   Label L_last_x;
2873   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
2874   subsw(xstart, xstart, 1);       // i = xstart-1;
2875   br(Assembler::MI, L_last_x);
2876 
2877   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
2878   ldr(product_hi, Address(rscratch1));
2879   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
2880 
2881   Label L_third_loop_prologue;
2882   bind(L_third_loop_prologue);
2883 
2884   str(ylen, Address(sp, wordSize));
2885   stp(x, xstart, Address(sp, 2 * wordSize));
2886   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
2887                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
2888   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
2889   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
2890 
2891   addw(tmp3, xlen, 1);
2892   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
2893   subsw(tmp3, tmp3, 1);
2894   br(Assembler::MI, L_done);
2895 
2896   lsr(carry, carry, 32);
2897   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
2898   b(L_second_loop);
2899 
  // Infrequently executed code is placed out of line, after the loops.
2901   bind(L_last_x);
2902   ldrw(product_hi, Address(x,  0));
2903   b(L_third_loop_prologue);
2904 
2905   bind(L_done);
2906 }
2907 
2908 /**
 * Emits code to update CRC-32 with a byte value according to constants in the table
2910  *
2911  * @param [in,out]crc   Register containing the crc.
2912  * @param [in]val       Register containing the byte to fold into the CRC.
2913  * @param [in]table     Register containing the table of crc constants.
2914  *
2915  * uint32_t crc;
2916  * val = crc_table[(val ^ crc) & 0xFF];
2917  * crc = val ^ (crc >> 8);
2918  *
2919  */
2920 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
2921   eor(val, val, crc);
2922   andr(val, val, 0xff);
2923   ldrw(val, Address(table, val, Address::lsl(2)));
2924   eor(crc, val, crc, Assembler::LSR, 8);
2925 }
2926 
2927 /**
2928  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
2929  *
2930  * @param [in,out]crc   Register containing the crc.
 * @param [in]v         Register containing the 32-bit value to fold into the CRC.
2932  * @param [in]table0    Register containing table 0 of crc constants.
2933  * @param [in]table1    Register containing table 1 of crc constants.
2934  * @param [in]table2    Register containing table 2 of crc constants.
2935  * @param [in]table3    Register containing table 3 of crc constants.
2936  *
2937  * uint32_t crc;
2938  *   v = crc ^ v
2939  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
2940  *
2941  */
2942 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
2943         Register table0, Register table1, Register table2, Register table3,
2944         bool upper) {
  eor(v, crc, v, upper ? LSR : LSL, upper ? 32 : 0);
2946   uxtb(tmp, v);
2947   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
2948   ubfx(tmp, v, 8, 8);
2949   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
2950   eor(crc, crc, tmp);
2951   ubfx(tmp, v, 16, 8);
2952   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
2953   eor(crc, crc, tmp);
2954   ubfx(tmp, v, 24, 8);
2955   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
2956   eor(crc, crc, tmp);
2957 }
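
// The routine above is one step of the "slicing-by-4" technique: the four
// table lookups fold an entire 32-bit word into the CRC at once, instead of
// four dependent byte-at-a-time steps, trading table space for
// instruction-level parallelism.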
2958 
2959 /**
2960  * @param crc   register containing existing CRC (32-bit)
2961  * @param buf   register pointing to input byte buffer (byte*)
2962  * @param len   register containing number of bytes
 * @param table0..table3 registers that will contain the addresses of the CRC tables
 * @param tmp, tmp2, tmp3 scratch registers
2965  */
2966 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
2967         Register table0, Register table1, Register table2, Register table3,
2968         Register tmp, Register tmp2, Register tmp3) {
2969   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
2970   unsigned long offset;
2971 
2972     ornw(crc, zr, crc);
2973 
2974   if (UseCRC32) {
2975     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop;
2976 
2977       subs(len, len, 64);
2978       br(Assembler::GE, CRC_by64_loop);
2979       adds(len, len, 64-4);
2980       br(Assembler::GE, CRC_by4_loop);
2981       adds(len, len, 4);
2982       br(Assembler::GT, CRC_by1_loop);
2983       b(L_exit);
2984 
2985     BIND(CRC_by4_loop);
2986       ldrw(tmp, Address(post(buf, 4)));
2987       subs(len, len, 4);
2988       crc32w(crc, crc, tmp);
2989       br(Assembler::GE, CRC_by4_loop);
2990       adds(len, len, 4);
2991       br(Assembler::LE, L_exit);
2992     BIND(CRC_by1_loop);
2993       ldrb(tmp, Address(post(buf, 1)));
2994       subs(len, len, 1);
2995       crc32b(crc, crc, tmp);
2996       br(Assembler::GT, CRC_by1_loop);
2997       b(L_exit);
2998 
2999       align(CodeEntryAlignment);
3000     BIND(CRC_by64_loop);
3001       subs(len, len, 64);
3002       ldp(tmp, tmp3, Address(post(buf, 16)));
3003       crc32x(crc, crc, tmp);
3004       crc32x(crc, crc, tmp3);
3005       ldp(tmp, tmp3, Address(post(buf, 16)));
3006       crc32x(crc, crc, tmp);
3007       crc32x(crc, crc, tmp3);
3008       ldp(tmp, tmp3, Address(post(buf, 16)));
3009       crc32x(crc, crc, tmp);
3010       crc32x(crc, crc, tmp3);
3011       ldp(tmp, tmp3, Address(post(buf, 16)));
3012       crc32x(crc, crc, tmp);
3013       crc32x(crc, crc, tmp3);
3014       br(Assembler::GE, CRC_by64_loop);
3015       adds(len, len, 64-4);
3016       br(Assembler::GE, CRC_by4_loop);
3017       adds(len, len, 4);
3018       br(Assembler::GT, CRC_by1_loop);
3019     BIND(L_exit);
3020       ornw(crc, zr, crc);
3021       return;
3022   }
3023 
3024     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3025     if (offset) add(table0, table0, offset);
3026     add(table1, table0, 1*256*sizeof(juint));
3027     add(table2, table0, 2*256*sizeof(juint));
3028     add(table3, table0, 3*256*sizeof(juint));
3029 
3030   if (UseNeon) {
3031       cmp(len, 64);
3032       br(Assembler::LT, L_by16);
3033       eor(v16, T16B, v16, v16);
3034 
3035     Label L_fold;
3036 
3037       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3038 
3039       ld1(v0, v1, T2D, post(buf, 32));
3040       ld1r(v4, T2D, post(tmp, 8));
3041       ld1r(v5, T2D, post(tmp, 8));
3042       ld1r(v6, T2D, post(tmp, 8));
3043       ld1r(v7, T2D, post(tmp, 8));
3044       mov(v16, T4S, 0, crc);
3045 
3046       eor(v0, T16B, v0, v16);
3047       sub(len, len, 64);
3048 
3049     BIND(L_fold);
3050       pmull(v22, T8H, v0, v5, T8B);
3051       pmull(v20, T8H, v0, v7, T8B);
3052       pmull(v23, T8H, v0, v4, T8B);
3053       pmull(v21, T8H, v0, v6, T8B);
3054 
3055       pmull2(v18, T8H, v0, v5, T16B);
3056       pmull2(v16, T8H, v0, v7, T16B);
3057       pmull2(v19, T8H, v0, v4, T16B);
3058       pmull2(v17, T8H, v0, v6, T16B);
3059 
3060       uzp1(v24, v20, v22, T8H);
3061       uzp2(v25, v20, v22, T8H);
3062       eor(v20, T16B, v24, v25);
3063 
3064       uzp1(v26, v16, v18, T8H);
3065       uzp2(v27, v16, v18, T8H);
3066       eor(v16, T16B, v26, v27);
3067 
3068       ushll2(v22, T4S, v20, T8H, 8);
3069       ushll(v20, T4S, v20, T4H, 8);
3070 
3071       ushll2(v18, T4S, v16, T8H, 8);
3072       ushll(v16, T4S, v16, T4H, 8);
3073 
3074       eor(v22, T16B, v23, v22);
3075       eor(v18, T16B, v19, v18);
3076       eor(v20, T16B, v21, v20);
3077       eor(v16, T16B, v17, v16);
3078 
3079       uzp1(v17, v16, v20, T2D);
3080       uzp2(v21, v16, v20, T2D);
3081       eor(v17, T16B, v17, v21);
3082 
3083       ushll2(v20, T2D, v17, T4S, 16);
3084       ushll(v16, T2D, v17, T2S, 16);
3085 
3086       eor(v20, T16B, v20, v22);
3087       eor(v16, T16B, v16, v18);
3088 
3089       uzp1(v17, v20, v16, T2D);
3090       uzp2(v21, v20, v16, T2D);
3091       eor(v28, T16B, v17, v21);
3092 
3093       pmull(v22, T8H, v1, v5, T8B);
3094       pmull(v20, T8H, v1, v7, T8B);
3095       pmull(v23, T8H, v1, v4, T8B);
3096       pmull(v21, T8H, v1, v6, T8B);
3097 
3098       pmull2(v18, T8H, v1, v5, T16B);
3099       pmull2(v16, T8H, v1, v7, T16B);
3100       pmull2(v19, T8H, v1, v4, T16B);
3101       pmull2(v17, T8H, v1, v6, T16B);
3102 
3103       ld1(v0, v1, T2D, post(buf, 32));
3104 
3105       uzp1(v24, v20, v22, T8H);
3106       uzp2(v25, v20, v22, T8H);
3107       eor(v20, T16B, v24, v25);
3108 
3109       uzp1(v26, v16, v18, T8H);
3110       uzp2(v27, v16, v18, T8H);
3111       eor(v16, T16B, v26, v27);
3112 
3113       ushll2(v22, T4S, v20, T8H, 8);
3114       ushll(v20, T4S, v20, T4H, 8);
3115 
3116       ushll2(v18, T4S, v16, T8H, 8);
3117       ushll(v16, T4S, v16, T4H, 8);
3118 
3119       eor(v22, T16B, v23, v22);
3120       eor(v18, T16B, v19, v18);
3121       eor(v20, T16B, v21, v20);
3122       eor(v16, T16B, v17, v16);
3123 
3124       uzp1(v17, v16, v20, T2D);
3125       uzp2(v21, v16, v20, T2D);
3126       eor(v16, T16B, v17, v21);
3127 
3128       ushll2(v20, T2D, v16, T4S, 16);
3129       ushll(v16, T2D, v16, T2S, 16);
3130 
3131       eor(v20, T16B, v22, v20);
3132       eor(v16, T16B, v16, v18);
3133 
3134       uzp1(v17, v20, v16, T2D);
3135       uzp2(v21, v20, v16, T2D);
3136       eor(v20, T16B, v17, v21);
3137 
3138       shl(v16, T2D, v28, 1);
3139       shl(v17, T2D, v20, 1);
3140 
3141       eor(v0, T16B, v0, v16);
3142       eor(v1, T16B, v1, v17);
3143 
3144       subs(len, len, 32);
3145       br(Assembler::GE, L_fold);
3146 
3147       mov(crc, 0);
3148       mov(tmp, v0, T1D, 0);
3149       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3150       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3151       mov(tmp, v0, T1D, 1);
3152       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3153       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3154       mov(tmp, v1, T1D, 0);
3155       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3156       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3157       mov(tmp, v1, T1D, 1);
3158       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3159       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3160 
3161       add(len, len, 32);
3162   }
3163 
3164   BIND(L_by16);
3165     subs(len, len, 16);
3166     br(Assembler::GE, L_by16_loop);
3167     adds(len, len, 16-4);
3168     br(Assembler::GE, L_by4_loop);
3169     adds(len, len, 4);
3170     br(Assembler::GT, L_by1_loop);
3171     b(L_exit);
3172 
3173   BIND(L_by4_loop);
3174     ldrw(tmp, Address(post(buf, 4)));
3175     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3176     subs(len, len, 4);
3177     br(Assembler::GE, L_by4_loop);
3178     adds(len, len, 4);
3179     br(Assembler::LE, L_exit);
3180   BIND(L_by1_loop);
3181     subs(len, len, 1);
3182     ldrb(tmp, Address(post(buf, 1)));
3183     update_byte_crc32(crc, tmp, table0);
3184     br(Assembler::GT, L_by1_loop);
3185     b(L_exit);
3186 
3187     align(CodeEntryAlignment);
3188   BIND(L_by16_loop);
3189     subs(len, len, 16);
3190     ldp(tmp, tmp3, Address(post(buf, 16)));
3191     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3192     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3193     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3194     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3195     br(Assembler::GE, L_by16_loop);
3196     adds(len, len, 16-4);
3197     br(Assembler::GE, L_by4_loop);
3198     adds(len, len, 4);
3199     br(Assembler::GT, L_by1_loop);
3200   BIND(L_exit);
3201     ornw(crc, zr, crc);
3202 }
3203 
3204 /**
3205  * @param crc   register containing existing CRC (32-bit)
3206  * @param buf   register pointing to input byte buffer (byte*)
3207  * @param len   register containing number of bytes
 * @param table0..table3 unused here, kept for signature parity with kernel_crc32
 * @param tmp, tmp2, tmp3 scratch registers
3210  */
3211 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3212         Register table0, Register table1, Register table2, Register table3,
3213         Register tmp, Register tmp2, Register tmp3) {
3214   Label L_exit;
3215   Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop;
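
  // Unlike kernel_crc32 above, this routine relies unconditionally on the
  // ARMv8 crc32c* instructions and has no table-driven fallback; the table
  // registers are unused.  Note also the absence of kernel_crc32's ornw
  // pre/post inversion -- the caller is assumed to handle any required
  // inversion of the crc value.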
3216 
3217     subs(len, len, 64);
3218     br(Assembler::GE, CRC_by64_loop);
3219     adds(len, len, 64-4);
3220     br(Assembler::GE, CRC_by4_loop);
3221     adds(len, len, 4);
3222     br(Assembler::GT, CRC_by1_loop);
3223     b(L_exit);
3224 
3225   BIND(CRC_by4_loop);
3226     ldrw(tmp, Address(post(buf, 4)));
3227     subs(len, len, 4);
3228     crc32cw(crc, crc, tmp);
3229     br(Assembler::GE, CRC_by4_loop);
3230     adds(len, len, 4);
3231     br(Assembler::LE, L_exit);
3232   BIND(CRC_by1_loop);
3233     ldrb(tmp, Address(post(buf, 1)));
3234     subs(len, len, 1);
3235     crc32cb(crc, crc, tmp);
3236     br(Assembler::GT, CRC_by1_loop);
3237     b(L_exit);
3238 
3239     align(CodeEntryAlignment);
3240   BIND(CRC_by64_loop);
3241     subs(len, len, 64);
3242     ldp(tmp, tmp3, Address(post(buf, 16)));
3243     crc32cx(crc, crc, tmp);
3244     crc32cx(crc, crc, tmp3);
3245     ldp(tmp, tmp3, Address(post(buf, 16)));
3246     crc32cx(crc, crc, tmp);
3247     crc32cx(crc, crc, tmp3);
3248     ldp(tmp, tmp3, Address(post(buf, 16)));
3249     crc32cx(crc, crc, tmp);
3250     crc32cx(crc, crc, tmp3);
3251     ldp(tmp, tmp3, Address(post(buf, 16)));
3252     crc32cx(crc, crc, tmp);
3253     crc32cx(crc, crc, tmp3);
3254     br(Assembler::GE, CRC_by64_loop);
3255     adds(len, len, 64-4);
3256     br(Assembler::GE, CRC_by4_loop);
3257     adds(len, len, 4);
3258     br(Assembler::GT, CRC_by1_loop);
3259   BIND(L_exit);
3260     return;
3261 }
3262 
3263 SkipIfEqual::SkipIfEqual(
3264     MacroAssembler* masm, const bool* flag_addr, bool value) {
3265   _masm = masm;
3266   unsigned long offset;
3267   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3268   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3269   _masm->cbzw(rscratch1, _label);
3270 }
3271 
3272 SkipIfEqual::~SkipIfEqual() {
3273   _masm->bind(_label);
3274 }
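
// Typical use of SkipIfEqual (a sketch; note this AArch64 version hard-codes
// the value == false case: the cbzw skips when the flag byte is zero):
//
//   {
//     SkipIfEqual skip(this, &DTraceMethodProbes, false);
//     // ... code emitted here runs only when the flag is true ...
//   } // the destructor binds the label, so execution resumes here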
3275 
3276 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3277   Address adr;
3278   switch(dst.getMode()) {
3279   case Address::base_plus_offset:
3280     // This is the expected mode, although we allow all the other
3281     // forms below.
3282     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3283     break;
3284   default:
3285     lea(rscratch2, dst);
3286     adr = Address(rscratch2);
3287     break;
3288   }
3289   ldr(rscratch1, adr);
3290   add(rscratch1, rscratch1, src);
3291   str(rscratch1, adr);
3292 }
3293 
3294 void MacroAssembler::cmpptr(Register src1, Address src2) {
3295   unsigned long offset;
3296   adrp(rscratch1, src2, offset);
3297   ldr(rscratch1, Address(rscratch1, offset));
3298   cmp(src1, rscratch1);
3299 }
3300 
3301 void MacroAssembler::store_check(Register obj, Address dst) {
3302   store_check(obj);
3303 }
3304 
3305 void MacroAssembler::store_check(Register obj) {
3306   // Does a store check for the oop in register obj. The content of
3307   // register obj is destroyed afterwards.
3308 
3309   BarrierSet* bs = Universe::heap()->barrier_set();
3310   assert(bs->kind() == BarrierSet::CardTableForRS ||
3311          bs->kind() == BarrierSet::CardTableExtension,
3312          "Wrong barrier set kind");
3313 
3314   CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
3315   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
3316 
3317   lsr(obj, obj, CardTableModRefBS::card_shift);
3318 
3319   assert(CardTableModRefBS::dirty_card_val() == 0, "must be");
3320 
3321   load_byte_map_base(rscratch1);
3322 
3323   if (UseCondCardMark) {
3324     Label L_already_dirty;
3325     membar(StoreLoad);
3326     ldrb(rscratch2,  Address(obj, rscratch1));
3327     cbz(rscratch2, L_already_dirty);
3328     strb(zr, Address(obj, rscratch1));
3329     bind(L_already_dirty);
3330   } else {
3331     if (UseConcMarkSweepGC && CMSPrecleaningEnabled) {
3332       membar(StoreStore);
3333     }
3334     strb(zr, Address(obj, rscratch1));
3335   }
3336 }
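
// In C-like terms the store check above is the classic card-table mark
// (a sketch; dirty_card_val() == 0):
//
//   jbyte* card = byte_map_base + ((uintptr_t)obj >> card_shift);
//   if (UseCondCardMark) {
//     StoreLoad;
//     if (*card != dirty) *card = dirty;
//   } else {
//     *card = dirty;   // preceded by StoreStore when CMS precleaning is on
//   }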
3337 
3338 void MacroAssembler::load_klass(Register dst, Register src) {
3339   if (UseCompressedClassPointers) {
3340     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3341     decode_klass_not_null(dst);
3342   } else {
3343     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3344   }
3345 }
3346 
3347 void MacroAssembler::load_mirror(Register dst, Register method) {
3348   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  ldr(dst, Address(method, Method::const_offset()));
3350   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3351   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3352   ldr(dst, Address(dst, mirror_offset));
3353 }
3354 
3355 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3356   if (UseCompressedClassPointers) {
3357     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3358     if (Universe::narrow_klass_base() == NULL) {
3359       cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
3360       return;
3361     } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3362                && Universe::narrow_klass_shift() == 0) {
3363       // Only the bottom 32 bits matter
3364       cmpw(trial_klass, tmp);
3365       return;
3366     }
3367     decode_klass_not_null(tmp);
3368   } else {
3369     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3370   }
3371   cmp(trial_klass, tmp);
3372 }
3373 
3374 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3375   load_klass(dst, src);
3376   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3377 }
3378 
3379 void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store release?  Concurrent GCs assume the
  // klass length is valid if the klass field is not null.
3382   if (UseCompressedClassPointers) {
3383     encode_klass_not_null(src);
3384     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3385   } else {
3386     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3387   }
3388 }
3389 
3390 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3391   if (UseCompressedClassPointers) {
3392     // Store to klass gap in destination
3393     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3394   }
3395 }
3396 
3397 // Algorithm must match oop.inline.hpp encode_heap_oop.
3398 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3399 #ifdef ASSERT
3400   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3401 #endif
3402   verify_oop(s, "broken oop in encode_heap_oop");
3403   if (Universe::narrow_oop_base() == NULL) {
3404     if (Universe::narrow_oop_shift() != 0) {
3405       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3406       lsr(d, s, LogMinObjAlignmentInBytes);
3407     } else {
3408       mov(d, s);
3409     }
3410   } else {
3411     subs(d, s, rheapbase);
3412     csel(d, d, zr, Assembler::HS);
3413     lsr(d, d, LogMinObjAlignmentInBytes);
3414 
3415     /*  Old algorithm: is this any worse?
3416     Label nonnull;
3417     cbnz(r, nonnull);
3418     sub(r, r, rheapbase);
3419     bind(nonnull);
3420     lsr(r, r, LogMinObjAlignmentInBytes);
3421     */
3422   }
3423 }
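
// i.e. the encoding computed above is, conceptually:
//   narrow = (oop == NULL) ? 0 : (uint32_t)((oop - heap_base) >> shift)
// with the csel supplying the NULL case without a forward branch.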
3424 
3425 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3426 #ifdef ASSERT
3427   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3428   if (CheckCompressedOops) {
3429     Label ok;
3430     cbnz(r, ok);
3431     stop("null oop passed to encode_heap_oop_not_null");
3432     bind(ok);
3433   }
3434 #endif
3435   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3436   if (Universe::narrow_oop_base() != NULL) {
3437     sub(r, r, rheapbase);
3438   }
3439   if (Universe::narrow_oop_shift() != 0) {
3440     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3441     lsr(r, r, LogMinObjAlignmentInBytes);
3442   }
3443 }
3444 
3445 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3446 #ifdef ASSERT
3447   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3448   if (CheckCompressedOops) {
3449     Label ok;
3450     cbnz(src, ok);
3451     stop("null oop passed to encode_heap_oop_not_null2");
3452     bind(ok);
3453   }
3454 #endif
3455   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3456 
3457   Register data = src;
3458   if (Universe::narrow_oop_base() != NULL) {
3459     sub(dst, src, rheapbase);
3460     data = dst;
3461   }
3462   if (Universe::narrow_oop_shift() != 0) {
3463     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3464     lsr(dst, data, LogMinObjAlignmentInBytes);
3465     data = dst;
3466   }
3467   if (data == src)
3468     mov(dst, src);
3469 }
3470 
3471 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3472 #ifdef ASSERT
3473   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3474 #endif
3475   if (Universe::narrow_oop_base() == NULL) {
3476     if (Universe::narrow_oop_shift() != 0 || d != s) {
3477       lsl(d, s, Universe::narrow_oop_shift());
3478     }
3479   } else {
3480     Label done;
3481     if (d != s)
3482       mov(d, s);
3483     cbz(s, done);
3484     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3485     bind(done);
3486   }
3487   verify_oop(d, "broken oop in decode_heap_oop");
3488 }
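
// Decoding is the inverse (a sketch):
//   oop = (narrow == 0) ? NULL : heap_base + ((uint64_t)narrow << shift)
// the cbz above keeps a narrow value of zero decoding to NULL.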
3489 
3490 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3491   assert (UseCompressedOops, "should only be used for compressed headers");
3492   assert (Universe::heap() != NULL, "java heap should be initialized");
3493   // Cannot assert, unverified entry point counts instructions (see .ad file)
3494   // vtableStubs also counts instructions in pd_code_size_limit.
3495   // Also do not verify_oop as this is called by verify_oop.
3496   if (Universe::narrow_oop_shift() != 0) {
3497     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3498     if (Universe::narrow_oop_base() != NULL) {
3499       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3500     } else {
3501       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3502     }
3503   } else {
3504     assert (Universe::narrow_oop_base() == NULL, "sanity");
3505   }
3506 }
3507 
3508 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3509   assert (UseCompressedOops, "should only be used for compressed headers");
3510   assert (Universe::heap() != NULL, "java heap should be initialized");
3511   // Cannot assert, unverified entry point counts instructions (see .ad file)
3512   // vtableStubs also counts instructions in pd_code_size_limit.
3513   // Also do not verify_oop as this is called by verify_oop.
3514   if (Universe::narrow_oop_shift() != 0) {
3515     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3516     if (Universe::narrow_oop_base() != NULL) {
3517       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3518     } else {
3519       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3520     }
3521   } else {
3522     assert (Universe::narrow_oop_base() == NULL, "sanity");
3523     if (dst != src) {
3524       mov(dst, src);
3525     }
3526   }
3527 }
3528 
3529 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3530   if (Universe::narrow_klass_base() == NULL) {
3531     if (Universe::narrow_klass_shift() != 0) {
3532       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3533       lsr(dst, src, LogKlassAlignmentInBytes);
3534     } else {
3535       if (dst != src) mov(dst, src);
3536     }
3537     return;
3538   }
3539 
3540   if (use_XOR_for_compressed_class_base) {
3541     if (Universe::narrow_klass_shift() != 0) {
3542       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3543       lsr(dst, dst, LogKlassAlignmentInBytes);
3544     } else {
3545       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3546     }
3547     return;
3548   }
3549 
3550   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3551       && Universe::narrow_klass_shift() == 0) {
3552     movw(dst, src);
3553     return;
3554   }
3555 
3556 #ifdef ASSERT
3557   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3558 #endif
3559 
3560   Register rbase = dst;
3561   if (dst == src) rbase = rheapbase;
3562   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3563   sub(dst, src, rbase);
3564   if (Universe::narrow_klass_shift() != 0) {
3565     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3566     lsr(dst, dst, LogKlassAlignmentInBytes);
3567   }
3568   if (dst == src) reinit_heapbase();
3569 }
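
// The XOR form above is valid only when the compressed klass range fits
// within the alignment of narrow_klass_base(), so subtracting the base and
// XOR-ing with it produce the same bits; use_XOR_for_compressed_class_base
// is assumed to be computed under exactly that condition.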
3570 
3571 void MacroAssembler::encode_klass_not_null(Register r) {
3572   encode_klass_not_null(r, r);
3573 }
3574 
3575 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3576   Register rbase = dst;
3577   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3578 
3579   if (Universe::narrow_klass_base() == NULL) {
3580     if (Universe::narrow_klass_shift() != 0) {
3581       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3582       lsl(dst, src, LogKlassAlignmentInBytes);
3583     } else {
3584       if (dst != src) mov(dst, src);
3585     }
3586     return;
3587   }
3588 
3589   if (use_XOR_for_compressed_class_base) {
3590     if (Universe::narrow_klass_shift() != 0) {
3591       lsl(dst, src, LogKlassAlignmentInBytes);
3592       eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
3593     } else {
3594       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3595     }
3596     return;
3597   }
3598 
3599   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3600       && Universe::narrow_klass_shift() == 0) {
3601     if (dst != src)
3602       movw(dst, src);
3603     movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
3604     return;
3605   }
3606 
3607   // Cannot assert, unverified entry point counts instructions (see .ad file)
3608   // vtableStubs also counts instructions in pd_code_size_limit.
3609   // Also do not verify_oop as this is called by verify_oop.
3610   if (dst == src) rbase = rheapbase;
3611   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3612   if (Universe::narrow_klass_shift() != 0) {
3613     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3614     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
3615   } else {
3616     add(dst, rbase, src);
3617   }
3618   if (dst == src) reinit_heapbase();
3619 }
3620 
3621 void  MacroAssembler::decode_klass_not_null(Register r) {
3622   decode_klass_not_null(r, r);
3623 }
3624 
3625 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3626   assert (UseCompressedOops, "should only be used for compressed oops");
3627   assert (Universe::heap() != NULL, "java heap should be initialized");
3628   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3629 
3630   int oop_index = oop_recorder()->find_index(obj);
3631   assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3632 
3633   InstructionMark im(this);
3634   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3635   code_section()->relocate(inst_mark(), rspec);
3636   movz(dst, 0xDEAD, 16);
3637   movk(dst, 0xBEEF);
3638 }
3639 
3640 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3641   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3642   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3643   int index = oop_recorder()->find_index(k);
3644   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
3645 
3646   InstructionMark im(this);
3647   RelocationHolder rspec = metadata_Relocation::spec(index);
3648   code_section()->relocate(inst_mark(), rspec);
3649   narrowKlass nk = Klass::encode_klass(k);
3650   movz(dst, (nk >> 16), 16);
3651   movk(dst, nk & 0xffff);
3652 }
3653 
3654 void MacroAssembler::load_heap_oop(Register dst, Address src)
3655 {
3656   if (UseCompressedOops) {
3657     ldrw(dst, src);
3658     decode_heap_oop(dst);
3659   } else {
3660     ldr(dst, src);
3661   }
3662 }
3663 
3664 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src)
3665 {
3666   if (UseCompressedOops) {
3667     ldrw(dst, src);
3668     decode_heap_oop_not_null(dst);
3669   } else {
3670     ldr(dst, src);
3671   }
3672 }
3673 
3674 void MacroAssembler::store_heap_oop(Address dst, Register src) {
3675   if (UseCompressedOops) {
3676     assert(!dst.uses(src), "not enough registers");
3677     encode_heap_oop(src);
3678     strw(src, dst);
3679   } else
3680     str(src, dst);
3681 }
3682 
3683 // Used for storing NULLs.
3684 void MacroAssembler::store_heap_oop_null(Address dst) {
3685   if (UseCompressedOops) {
3686     strw(zr, dst);
3687   } else
3688     str(zr, dst);
3689 }
3690 
3691 #if INCLUDE_ALL_GCS
3692 void MacroAssembler::g1_write_barrier_pre(Register obj,
3693                                           Register pre_val,
3694                                           Register thread,
3695                                           Register tmp,
3696                                           bool tosca_live,
3697                                           bool expand_call) {
3698   // If expand_call is true then we expand the call_VM_leaf macro
3699   // directly to skip generating the check by
3700   // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
3701 
3702   assert(thread == rthread, "must be");
3703 
3704   Label done;
3705   Label runtime;
3706 
3707   assert(pre_val != noreg, "check this code");
3708 
3709   if (obj != noreg)
3710     assert_different_registers(obj, pre_val, tmp);
3711 
3712   Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
3713                                        SATBMarkQueue::byte_offset_of_active()));
3714   Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
3715                                        SATBMarkQueue::byte_offset_of_index()));
3716   Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
3717                                        SATBMarkQueue::byte_offset_of_buf()));
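
  // In pseudo-code, the SATB pre-barrier emitted below is (a sketch of the
  // G1 protocol, not new behavior):
  //
  //   if (thread->satb_mark_queue().is_active()) {
  //     if (obj != noreg) pre_val = *obj;            // load previous value
  //     if (pre_val != NULL) {
  //       if (index != 0) {
  //         index -= wordSize; buf[index] = pre_val; // fast path: enqueue
  //       } else {
  //         g1_wb_pre(pre_val, thread);              // slow path: buffer full
  //       }
  //     }
  //   }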
3718 
3719 
3720   // Is marking active?
3721   if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
3722     ldrw(tmp, in_progress);
3723   } else {
3724     assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption");
3725     ldrb(tmp, in_progress);
3726   }
3727   cbzw(tmp, done);
3728 
3729   // Do we need to load the previous value?
3730   if (obj != noreg) {
3731     load_heap_oop(pre_val, Address(obj, 0));
3732   }
3733 
3734   // Is the previous value null?
3735   cbz(pre_val, done);
3736 
3737   // Can we store original value in the thread's buffer?
3738   // Is index == 0?
3739   // (The index field is typed as size_t.)
3740 
3741   ldr(tmp, index);                      // tmp := *index_adr
3742   cbz(tmp, runtime);                    // tmp == 0?
3743                                         // If yes, goto runtime
3744 
3745   sub(tmp, tmp, wordSize);              // tmp := tmp - wordSize
3746   str(tmp, index);                      // *index_adr := tmp
3747   ldr(rscratch1, buffer);
3748   add(tmp, tmp, rscratch1);             // tmp := tmp + *buffer_adr
3749 
3750   // Record the previous value
3751   str(pre_val, Address(tmp, 0));
3752   b(done);
3753 
3754   bind(runtime);
3755   // save the live input values
3756   push(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
3757 
3758   // Calling the runtime using the regular call_VM_leaf mechanism generates
  // code (generated by InterpreterMacroAssembler::call_VM_leaf_base)
3760   // that checks that the *(rfp+frame::interpreter_frame_last_sp) == NULL.
3761   //
  // If we are generating the pre-barrier without a frame (e.g. in the
  // intrinsified Reference.get() routine) then rfp might be pointing to
  // the caller frame and so this check will most likely fail at runtime.
  //
  // Expanding the call directly bypasses the generation of the check.
  // So when we do not have a full interpreter frame on the stack
  // expand_call should be passed true.
3769 
3770   if (expand_call) {
3771     assert(pre_val != c_rarg1, "smashed arg");
3772     pass_arg1(this, thread);
3773     pass_arg0(this, pre_val);
3774     MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
3775   } else {
3776     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
3777   }
3778 
3779   pop(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
3780 
3781   bind(done);
3782 }
3783 
3784 void MacroAssembler::g1_write_barrier_post(Register store_addr,
3785                                            Register new_val,
3786                                            Register thread,
3787                                            Register tmp,
3788                                            Register tmp2) {
3789   assert(thread == rthread, "must be");
3790 
3791   if (UseShenandoahGC) {
3792     // No need for this in Shenandoah.
3793     return;
3794   }
3795 
3796   assert(UseG1GC, "expect G1 GC");
3797 
3798   Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
3799                                        DirtyCardQueue::byte_offset_of_index()));
3800   Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
3801                                        DirtyCardQueue::byte_offset_of_buf()));
3802 
3803   BarrierSet* bs = Universe::heap()->barrier_set();
3804   CardTableModRefBS* ct = (CardTableModRefBS*)bs;
3805   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
3806 
3807   Label done;
3808   Label runtime;
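
  // In pseudo-code, the post-barrier emitted below is (a sketch):
  //
  //   if (region(store_addr) == region(new_val)) return;  // same region
  //   if (new_val == NULL) return;
  //   jbyte* card = card_for(store_addr);
  //   if (*card == g1_young_card_val()) return;
  //   StoreLoad;
  //   if (*card == dirty_card_val()) return;
  //   *card = dirty_card_val();
  //   enqueue(card);                      // g1_wb_post when the buffer is full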
3809 
3810   // Does store cross heap regions?
3811 
3812   eor(tmp, store_addr, new_val);
3813   lsr(tmp, tmp, HeapRegion::LogOfHRGrainBytes);
3814   cbz(tmp, done);
3815 
3816   // crosses regions, storing NULL?
3817 
3818   cbz(new_val, done);
3819 
3820   // storing region crossing non-NULL, is card already dirty?
3821 
3822   ExternalAddress cardtable((address) ct->byte_map_base);
3823   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
3824   const Register card_addr = tmp;
3825 
3826   lsr(card_addr, store_addr, CardTableModRefBS::card_shift);
3827 
3828   // get the address of the card
3829   load_byte_map_base(tmp2);
3830   add(card_addr, card_addr, tmp2);
3831   ldrb(tmp2, Address(card_addr));
3832   cmpw(tmp2, (int)G1SATBCardTableModRefBS::g1_young_card_val());
3833   br(Assembler::EQ, done);
3834 
3835   assert((int)CardTableModRefBS::dirty_card_val() == 0, "must be 0");
3836 
3837   membar(Assembler::StoreLoad);
3838 
3839   ldrb(tmp2, Address(card_addr));
3840   cbzw(tmp2, done);
3841 
3842   // storing a region crossing, non-NULL oop, card is clean.
3843   // dirty card and log.
3844 
3845   strb(zr, Address(card_addr));
3846 
3847   ldr(rscratch1, queue_index);
3848   cbz(rscratch1, runtime);
3849   sub(rscratch1, rscratch1, wordSize);
3850   str(rscratch1, queue_index);
3851 
3852   ldr(tmp2, buffer);
3853   str(card_addr, Address(tmp2, rscratch1));
3854   b(done);
3855 
3856   bind(runtime);
3857   // save the live input values
3858   push(store_addr->bit(true) | new_val->bit(true), sp);
3859   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
3860   pop(store_addr->bit(true) | new_val->bit(true), sp);
3861 
3862   bind(done);
3863 }
3864 
3865 void MacroAssembler::shenandoah_write_barrier(Register dst) {
3866   assert(UseShenandoahGC, "must only be called with Shenandoah GC active");
3867   assert(dst != rscratch1, "need rscratch1");
3868   assert(dst != rscratch2, "need rscratch2");
3869 
  Label done;

  block_comment("Shenandoah write barrier {");

  // Check for evacuation-in-progress
3873   Address evacuation_in_progress = Address(rthread, in_bytes(JavaThread::evacuation_in_progress_offset()));
3874   ldrb(rscratch1, evacuation_in_progress);
3875   membar(Assembler::LoadLoad);
3876 
3877   // The read-barrier.
3878   ldr(dst, Address(dst, BrooksPointer::byte_offset()));
3879 
3880   // Evac-check ...
3881   cbzw(rscratch1, done);
3882 
3883   RegSet to_save = RegSet::of(r0);
3884   if (dst != r0) {
3885     push(to_save, sp);
3886     mov(r0, dst);
3887   }
3888 
3889   assert(StubRoutines::aarch64::shenandoah_wb() != NULL, "need write barrier stub");
3890   far_call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::aarch64::shenandoah_wb())));
3891 
3892   if (dst != r0) {
3893     mov(dst, r0);
3894     pop(to_save, sp);
3895   }
3896   block_comment("} Shenandoah write barrier");
3897 
3898   bind(done);
3899 }
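
// In outline, the barrier above is (a sketch of the Shenandoah protocol):
//
//   flag = thread->evacuation_in_progress();        // then LoadLoad
//   dst  = *(dst + BrooksPointer::byte_offset());   // read barrier: follow forwarding pointer
//   if (flag)
//     dst = shenandoah_wb(dst);  // stub may evacuate the object and return its new location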
3900 
3901 #endif // INCLUDE_ALL_GCS
3902 
3903 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
3904   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
3905   int index = oop_recorder()->allocate_metadata_index(obj);
3906   RelocationHolder rspec = metadata_Relocation::spec(index);
3907   return Address((address)obj, rspec);
3908 }
3909 
3910 // Move an oop into a register.  immediate is true if we want
// immediate instructions, i.e. we are not going to patch this
3912 // instruction while the code is being executed by another thread.  In
3913 // that case we can use move immediates rather than the constant pool.
3914 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
3915   int oop_index;
3916   if (obj == NULL) {
3917     oop_index = oop_recorder()->allocate_oop_index(obj);
3918   } else {
3919     oop_index = oop_recorder()->find_index(obj);
3920     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3921   }
3922   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3923   if (! immediate) {
3924     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
3925     ldr_constant(dst, Address(dummy, rspec));
3926   } else
3927     mov(dst, Address((address)obj, rspec));
3928 }
3929 
3930 // Move a metadata address into a register.
3931 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
3932   int oop_index;
3933   if (obj == NULL) {
3934     oop_index = oop_recorder()->allocate_metadata_index(obj);
3935   } else {
3936     oop_index = oop_recorder()->find_index(obj);
3937   }
3938   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
3939   mov(dst, Address((address)obj, rspec));
3940 }
3941 
3942 Address MacroAssembler::constant_oop_address(jobject obj) {
3943   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
3944   assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
3945   int oop_index = oop_recorder()->find_index(obj);
3946   return Address((address)obj, oop_Relocation::spec(oop_index));
3947 }
3948 
3949 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3950 void MacroAssembler::tlab_allocate(Register obj,
3951                                    Register var_size_in_bytes,
3952                                    int con_size_in_bytes,
3953                                    Register t1,
3954                                    Register t2,
3955                                    Label& slow_case) {
3956   assert_different_registers(obj, t2);
3957   assert_different_registers(obj, var_size_in_bytes);
3958   Register end = t2;
3959 
3960   // verify_tlab();
3961 
3962   int oop_extra_words = Universe::heap()->oop_extra_words();
3963 
3964   ldr(obj, Address(rthread, JavaThread::tlab_top_offset()));
3965   if (var_size_in_bytes == noreg) {
3966     lea(end, Address(obj, con_size_in_bytes + oop_extra_words * HeapWordSize));
3967   } else {
3968     if (oop_extra_words > 0) {
3969       add(var_size_in_bytes, var_size_in_bytes, oop_extra_words * HeapWordSize);
3970     }
3971     lea(end, Address(obj, var_size_in_bytes));
3972   }
3973   ldr(rscratch1, Address(rthread, JavaThread::tlab_end_offset()));
3974   cmp(end, rscratch1);
3975   br(Assembler::HI, slow_case);
3976 
3977   // update the tlab top pointer
3978   str(end, Address(rthread, JavaThread::tlab_top_offset()));
3979 
3980   Universe::heap()->compile_prepare_oop(this, obj);
3981 
3982   // recover var_size_in_bytes if necessary
3983   if (var_size_in_bytes == end) {
3984     sub(var_size_in_bytes, var_size_in_bytes, obj);
3985   }
3986   // verify_tlab();
3987 }
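
// In outline (a sketch):
//
//   obj = thread->tlab_top();
//   end = obj + size;   // size includes Shenandoah's extra oop words, if any
//   if (end > thread->tlab_end()) goto slow_case;
//   thread->set_tlab_top(end);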
3988 
3989 // Preserves r19, and r3.
3990 Register MacroAssembler::tlab_refill(Label& retry,
3991                                      Label& try_eden,
3992                                      Label& slow_case) {
3993   Register top = r0;
3994   Register t1  = r2;
3995   Register t2  = r4;
3996   assert_different_registers(top, rthread, t1, t2, /* preserve: */ r19, r3);
3997   Label do_refill, discard_tlab;
3998 
3999   if (!Universe::heap()->supports_inline_contig_alloc()) {
4000     // No allocation in the shared eden.
4001     b(slow_case);
4002   }
4003 
4004   ldr(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4005   ldr(t1,  Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4006 
4007   // calculate amount of free space
4008   sub(t1, t1, top);
4009   lsr(t1, t1, LogHeapWordSize);
4010 
4011   // Retain tlab and allocate object in shared space if
4012   // the amount free in the tlab is too large to discard.
4013 
4014   ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
4015   cmp(t1, rscratch1);
4016   br(Assembler::LE, discard_tlab);
4017 
4018   // Retain
4019   // ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
4020   mov(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
4021   add(rscratch1, rscratch1, t2);
4022   str(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
4023 
4024   if (TLABStats) {
4025     // increment number of slow_allocations
4026     addmw(Address(rthread, in_bytes(JavaThread::tlab_slow_allocations_offset())),
4027          1, rscratch1);
4028   }
4029   b(try_eden);
4030 
4031   bind(discard_tlab);
4032   if (TLABStats) {
4033     // increment number of refills
4034     addmw(Address(rthread, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1,
4035          rscratch1);
4036     // accumulate wastage -- t1 is amount free in tlab
4037     addmw(Address(rthread, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1,
4038          rscratch1);
4039   }
4040 
4041   // if tlab is currently allocated (top or end != null) then
4042   // fill [top, end + alignment_reserve) with array object
4043   cbz(top, do_refill);
4044 
4045   // set up the mark word
4046   mov(rscratch1, (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
4047   str(rscratch1, Address(top, oopDesc::mark_offset_in_bytes()));
4048   // set the length to the remaining space
4049   sub(t1, t1, typeArrayOopDesc::header_size(T_INT));
4050   add(t1, t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
4051   lsl(t1, t1, log2_intptr(HeapWordSize/sizeof(jint)));
4052   strw(t1, Address(top, arrayOopDesc::length_offset_in_bytes()));
4053   // set klass to intArrayKlass
4054   {
4055     unsigned long offset;
    // Dubious reloc: why not an oop reloc?
4057     adrp(rscratch1, ExternalAddress((address)Universe::intArrayKlassObj_addr()),
4058          offset);
4059     ldr(t1, Address(rscratch1, offset));
4060   }
  // store klass last.  Concurrent GCs assume the klass length is valid if
  // the klass field is not null.
4063   store_klass(top, t1);
4064 
4065   mov(t1, top);
4066   ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4067   sub(t1, t1, rscratch1);
4068   incr_allocated_bytes(rthread, t1, 0, rscratch1);
4069 
4070   // refill the tlab with an eden allocation
4071   bind(do_refill);
4072   ldr(t1, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
4073   lsl(t1, t1, LogHeapWordSize);
4074   // allocate new tlab, address returned in top
4075   eden_allocate(top, t1, 0, t2, slow_case);
4076 
4077   // Check that t1 was preserved in eden_allocate.
4078 #ifdef ASSERT
4079   if (UseTLAB) {
4080     Label ok;
4081     Register tsize = r4;
4082     assert_different_registers(tsize, rthread, t1);
4083     str(tsize, Address(pre(sp, -16)));
4084     ldr(tsize, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
4085     lsl(tsize, tsize, LogHeapWordSize);
4086     cmp(t1, tsize);
4087     br(Assembler::EQ, ok);
4088     STOP("assert(t1 != tlab size)");
4089     should_not_reach_here();
4090 
4091     bind(ok);
4092     ldr(tsize, Address(post(sp, 16)));
4093   }
4094 #endif
4095   str(top, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4096   str(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4097   add(top, top, t1);
4098   sub(top, top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
4099   str(top, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4100 
4101   if (ZeroTLAB) {
4102     // This is a fast TLAB refill, therefore the GC is not notified of it.
4103     // So compiled code must fill the new TLAB with zeroes.
4104     ldr(top, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4105     zero_memory(top,t1,t2);
4106   }
4107 
4108   verify_tlab();
4109   b(retry);
4110 
4111   return rthread; // for use by caller
4112 }
4113 
4114 // Zero words; len is in bytes
4115 // Destroys all registers except addr
4116 // len must be a nonzero multiple of wordSize
4117 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4118   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4119 
4120 #ifdef ASSERT
4121   { Label L;
4122     tst(len, BytesPerWord - 1);
4123     br(Assembler::EQ, L);
4124     stop("len is not a multiple of BytesPerWord");
4125     bind(L);
4126   }
4127 #endif
4128 
4129 #ifndef PRODUCT
4130   block_comment("zero memory");
4131 #endif
4132 
4133   Label loop;
4134   Label entry;
4135 
4136 //  Algorithm:
4137 //
4138 //    scratch1 = cnt & 7;
4139 //    cnt -= scratch1;
4140 //    p += scratch1;
4141 //    switch (scratch1) {
4142 //      do {
4143 //        cnt -= 8;
4144 //          p[-8] = 0;
4145 //        case 7:
4146 //          p[-7] = 0;
4147 //        case 6:
4148 //          p[-6] = 0;
4149 //          // ...
4150 //        case 1:
4151 //          p[-1] = 0;
4152 //        case 0:
4153 //          p += 8;
4154 //      } while (cnt);
4155 //    }
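//
//  This is the classic Duff's-device layout: the computed branch below jumps
//  into the middle of the unrolled run of stores, so the first (partial) pass
//  zeroes cnt % 8 words and every later pass zeroes a full 8.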
4156 
4157   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4158 
4159   lsr(len, len, LogBytesPerWord);
  andr(rscratch1, len, unroll - 1);  // rscratch1 = cnt % unroll
  sub(len, len, rscratch1);          // cnt -= (cnt % unroll)
4162   // t1 always points to the end of the region we're about to zero
4163   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
4164   adr(rscratch2, entry);
4165   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4166   br(rscratch2);
4167   bind(loop);
4168   sub(len, len, unroll);
4169   for (int i = -unroll; i < 0; i++)
4170     str(zr, Address(t1, i * wordSize));
4171   bind(entry);
4172   add(t1, t1, unroll * wordSize);
4173   cbnz(len, loop);
4174 }
4175 
4176 // Defines obj, preserves var_size_in_bytes
4177 void MacroAssembler::eden_allocate(Register obj,
4178                                    Register var_size_in_bytes,
4179                                    int con_size_in_bytes,
4180                                    Register t1,
4181                                    Label& slow_case) {
4182   assert_different_registers(obj, var_size_in_bytes, t1);
4183   if (!Universe::heap()->supports_inline_contig_alloc()) {
4184     b(slow_case);
4185   } else {
4186     Register end = t1;
4187     Register heap_end = rscratch2;
4188     Label retry;
4189     bind(retry);
4190     {
4191       unsigned long offset;
4192       adrp(rscratch1, ExternalAddress((address) Universe::heap()->end_addr()), offset);
4193       ldr(heap_end, Address(rscratch1, offset));
4194     }
4195 
4196     ExternalAddress heap_top((address) Universe::heap()->top_addr());
4197 
4198     // Get the current top of the heap
4199     {
4200       unsigned long offset;
4201       adrp(rscratch1, heap_top, offset);
      // Use add() here after ADRP, rather than lea().
4203       // lea() does not generate anything if its offset is zero.
4204       // However, relocs expect to find either an ADD or a load/store
4205       // insn after an ADRP.  add() always generates an ADD insn, even
4206       // for add(Rn, Rn, 0).
4207       add(rscratch1, rscratch1, offset);
4208       ldaxr(obj, rscratch1);
4209     }
4210 
    // Adjust it by the size of our new object
4212     if (var_size_in_bytes == noreg) {
4213       lea(end, Address(obj, con_size_in_bytes));
4214     } else {
4215       lea(end, Address(obj, var_size_in_bytes));
4216     }
4217 
4218     // if end < obj then we wrapped around high memory
4219     cmp(end, obj);
4220     br(Assembler::LO, slow_case);
4221 
4222     cmp(end, heap_end);
4223     br(Assembler::HI, slow_case);
4224 
4225     // If heap_top hasn't been changed by some other thread, update it.
4226     stlxr(rscratch2, end, rscratch1);
4227     cbnzw(rscratch2, retry);
4228   }
4229 }
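
// The retry loop above is equivalent to (a sketch; the ldaxr/stlxr pair
// provides the atomicity of a compare-and-swap on the heap top):
//
//   do {
//     obj = *heap_top;
//     end = obj + size;
//     if (end < obj) goto slow_case;       // wrapped around the address space
//     if (end > heap_end) goto slow_case;  // heap exhausted
//   } while (!CAS(heap_top, obj, end));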
4230 
4231 void MacroAssembler::verify_tlab() {
4232 #ifdef ASSERT
4233   if (UseTLAB && VerifyOops) {
4234     Label next, ok;
4235 
4236     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4237 
4238     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4239     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4240     cmp(rscratch2, rscratch1);
4241     br(Assembler::HS, next);
4242     STOP("assert(top >= start)");
4243     should_not_reach_here();
4244 
4245     bind(next);
4246     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4247     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4248     cmp(rscratch2, rscratch1);
4249     br(Assembler::HS, ok);
4250     STOP("assert(top <= end)");
4251     should_not_reach_here();
4252 
4253     bind(ok);
4254     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4255   }
4256 #endif
4257 }
4258 
4259 // Writes to stack successive pages until offset reached to check for
4260 // stack overflow + shadow pages.  This clobbers tmp.
4261 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4262   assert_different_registers(tmp, size, rscratch1);
4263   mov(tmp, sp);
4264   // Bang stack for total size given plus shadow page size.
4265   // Bang one page at a time because large size can bang beyond yellow and
4266   // red zones.
4267   Label loop;
4268   mov(rscratch1, os::vm_page_size());
4269   bind(loop);
4270   lea(tmp, Address(tmp, -os::vm_page_size()));
4271   subsw(size, size, rscratch1);
4272   str(size, Address(tmp));
4273   br(Assembler::GT, loop);
4274 
4275   // Bang down shadow pages too.
4276   // At this point, (tmp-0) is the last address touched, so don't
4277   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4278   // was post-decremented.)  Skip this address by starting at i=1, and
4279   // touch a few more pages below.  N.B.  It is important to touch all
4280   // the way down to and including i=StackShadowPages.
4281   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
    // This could be any sized move, but since it can serve as a debugging
    // crumb, the bigger the better.
4284     lea(tmp, Address(tmp, -os::vm_page_size()));
4285     str(size, Address(tmp));
4286   }
4287 }
4288 
4289 
4290 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4291   unsigned long off;
4292   adrp(r, Address(page, rtype), off);
4293   InstructionMark im(this);
4294   code_section()->relocate(inst_mark(), rtype);
4295   ldrw(zr, Address(r, off));
4296   return inst_mark();
4297 }
4298 
4299 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4300   InstructionMark im(this);
4301   code_section()->relocate(inst_mark(), rtype);
4302   ldrw(zr, Address(r, 0));
4303   return inst_mark();
4304 }
4305 
4306 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4307   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4308   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4309   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4310   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4311   long offset_low = dest_page - low_page;
4312   long offset_high = dest_page - high_page;
4313 
4314   assert(is_valid_AArch64_address(dest.target()), "bad address");
4315   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4316 
4317   InstructionMark im(this);
4318   code_section()->relocate(inst_mark(), dest.rspec());
4319   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4320   // the code cache so that if it is relocated we know it will still reach
4321   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4322     _adrp(reg1, dest.target());
4323   } else {
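    // Out of ADRP's +/- 4GB range: synthesize the 48-bit address in
    // two steps.  ADRP materialises the low 32 bits, borrowing the
    // PC's bits 32-47 so that its page offset stays in range, and MOVK
    // then overwrites bits 32-47 with those of the real target.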
4324     unsigned long target = (unsigned long)dest.target();
4325     unsigned long adrp_target
4326       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4327 
4328     _adrp(reg1, (address)adrp_target);
4329     movk(reg1, target >> 32, 32);
4330   }
4331   byte_offset = (unsigned long)dest.target() & 0xfff;
4332 }
4333 
4334 void MacroAssembler::load_byte_map_base(Register reg) {
4335   jbyte *byte_map_base =
4336     ((CardTableModRefBS*)(Universe::heap()->barrier_set()))->byte_map_base;
4337 
4338   if (is_valid_AArch64_address((address)byte_map_base)) {
4339     // Strictly speaking the byte_map_base isn't an address at all,
4340     // and it might even be negative.
4341     unsigned long offset;
4342     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4343     // We expect offset to be zero with most collectors.
4344     if (offset != 0) {
4345       add(reg, reg, offset);
4346     }
4347   } else {
4348     mov(reg, (uint64_t)byte_map_base);
4349   }
4350 }
4351 
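// Build a frame of framesize bytes, saving rfp and lr at its top.  The
// single-sub form applies while framesize - 2*wordSize still fits
// stp's scaled signed 7-bit offset (+/- 512 bytes); beyond that we
// push rfp/lr first and lower sp separately, going via rscratch1 once
// sub's 12-bit immediate is exceeded.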
4352 void MacroAssembler::build_frame(int framesize) {
4353   assert(framesize > 0, "framesize must be > 0");
4354   if (framesize < ((1 << 9) + 2 * wordSize)) {
4355     sub(sp, sp, framesize);
4356     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4357     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4358   } else {
4359     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4360     if (PreserveFramePointer) mov(rfp, sp);
4361     if (framesize < ((1 << 12) + 2 * wordSize))
4362       sub(sp, sp, framesize - 2 * wordSize);
4363     else {
4364       mov(rscratch1, framesize - 2 * wordSize);
4365       sub(sp, sp, rscratch1);
4366     }
4367   }
4368 }
4369 
4370 void MacroAssembler::remove_frame(int framesize) {
4371   assert(framesize > 0, "framesize must be > 0");
4372   if (framesize < ((1 << 9) + 2 * wordSize)) {
4373     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4374     add(sp, sp, framesize);
4375   } else {
4376     if (framesize < ((1 << 12) + 2 * wordSize))
4377       add(sp, sp, framesize - 2 * wordSize);
4378     else {
4379       mov(rscratch1, framesize - 2 * wordSize);
4380       add(sp, sp, rscratch1);
4381     }
4382     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4383   }
4384 }
4385 
4386 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4387 
4388 // Search for str1 in str2 and return index or -1
4389 void MacroAssembler::string_indexof(Register str2, Register str1,
4390                                     Register cnt2, Register cnt1,
4391                                     Register tmp1, Register tmp2,
4392                                     Register tmp3, Register tmp4,
4393                                     int icnt1, Register result, int ae) {
4394   Label BM, LINEARSEARCH, DONE, NOMATCH, MATCH;
4395 
4396   Register ch1 = rscratch1;
4397   Register ch2 = rscratch2;
4398   Register cnt1tmp = tmp1;
4399   Register cnt2tmp = tmp2;
4400   Register cnt1_neg = cnt1;
4401   Register cnt2_neg = cnt2;
4402   Register result_tmp = tmp4;
4403 
4404   bool isL = ae == StrIntrinsicNode::LL;
4405 
4406   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4407   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4408   int str1_chr_shift = str1_isL ? 0:1;
4409   int str2_chr_shift = str2_isL ? 0:1;
4410   int str1_chr_size = str1_isL ? 1:2;
4411   int str2_chr_size = str2_isL ? 1:2;
4412   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4413                                       (chr_insn)&MacroAssembler::ldrh;
4414   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4415                                       (chr_insn)&MacroAssembler::ldrh;
4416   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4417   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4418 
4419   // Note, inline_string_indexOf() generates checks:
4420   // if (substr.count > string.count) return -1;
4421   // if (substr.count == 0) return 0;
4422 
// We have two strings, a source string in str2, cnt2 and a pattern string
// in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

// For larger pattern and source we use a simplified Boyer-Moore algorithm.
// With a small pattern and source we use linear scan.
4428 
4429   if (icnt1 == -1) {
4430     cmp(cnt1, 256);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4431     ccmp(cnt1, 8, 0b0000, LO);  // Can't handle skip >= 256 because we use
4432     br(LO, LINEARSEARCH);       // a byte array.
4433     cmp(cnt1, cnt2, LSR, 2);    // Source must be 4 * pattern for BM
4434     br(HS, LINEARSEARCH);
4435   }
4436 
// The Boyer-Moore algorithm is based on the description here:-
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules: the 'Bad Character'
// rule and the 'Good Suffix' rule.
4443 //
4444 // These rules are essentially heuristics for how far we can shift the
4445 // pattern along the search string.
4446 //
4447 // The implementation here uses the 'Bad Character' rule only because of the
4448 // complexity of initialisation for the 'Good Suffix' rule.
4449 //
4450 // This is also known as the Boyer-Moore-Horspool algorithm:-
4451 //
4452 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4453 //
4454 // #define ASIZE 128
4455 //
4456 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4457 //       int i, j;
4458 //       unsigned c;
4459 //       unsigned char bc[ASIZE];
4460 //
4461 //       /* Preprocessing */
4462 //       for (i = 0; i < ASIZE; ++i)
4463 //          bc[i] = 0;
4464 //       for (i = 0; i < m - 1; ) {
4465 //          c = x[i];
4466 //          ++i;
4467 //          if (c < ASIZE) bc[c] = i;
4468 //       }
4469 //
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[j+m-1];
//          if (x[m-1] == c) {
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//            if (i < 0) return j;
//          }
//          if (c < ASIZE)
//            j = j - bc[c] + m;  // bc[c] == 0 if c does not occur in x
//          else
//            j += 1;             // Advance by 1 only if char >= ASIZE
//       }
//       return -1;
//    }
4483 
4484   if (icnt1 == -1) {
4485     BIND(BM);
4486 
4487     Label ZLOOP, BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP;
4488     Label BMADV, BMMATCH, BMCHECKEND;
4489 
4490     Register cnt1end = tmp2;
4491     Register str2end = cnt2;
4492     Register skipch = tmp2;
4493 
4494     // Restrict ASIZE to 128 to reduce stack space/initialisation.
4495     // The presence of chars >= ASIZE in the target string does not affect
4496     // performance, but we must be careful not to initialise them in the stack
4497     // array.
4498     // The presence of chars >= ASIZE in the source string may adversely affect
4499     // performance since we can only advance by one when we encounter one.
4500 
4501       stp(zr, zr, pre(sp, -128));
4502       for (int i = 1; i < 8; i++)
4503           stp(zr, zr, Address(sp, i*16));
4504 
4505       mov(cnt1tmp, 0);
4506       sub(cnt1end, cnt1, 1);
4507     BIND(BCLOOP);
4508       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4509       cmp(ch1, 128);
4510       add(cnt1tmp, cnt1tmp, 1);
4511       br(HS, BCSKIP);
4512       strb(cnt1tmp, Address(sp, ch1));
4513     BIND(BCSKIP);
4514       cmp(cnt1tmp, cnt1end);
4515       br(LT, BCLOOP);
4516 
4517       mov(result_tmp, str2);
4518 
4519       sub(cnt2, cnt2, cnt1);
4520       add(str2end, str2, cnt2, LSL, str2_chr_shift);
4521     BIND(BMLOOPSTR2);
4522       sub(cnt1tmp, cnt1, 1);
4523       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4524       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4525       cmp(ch1, skipch);
4526       br(NE, BMSKIP);
4527       subs(cnt1tmp, cnt1tmp, 1);
4528       br(LT, BMMATCH);
4529     BIND(BMLOOPSTR1);
4530       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4531       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4532       cmp(ch1, ch2);
4533       br(NE, BMSKIP);
4534       subs(cnt1tmp, cnt1tmp, 1);
4535       br(GE, BMLOOPSTR1);
4536     BIND(BMMATCH);
4537       sub(result, str2, result_tmp);
4538       if (!str2_isL) lsr(result, result, 1);
4539       add(sp, sp, 128);
4540       b(DONE);
4541     BIND(BMADV);
4542       add(str2, str2, str2_chr_size);
4543       b(BMCHECKEND);
4544     BIND(BMSKIP);
4545       cmp(skipch, 128);
4546       br(HS, BMADV);
4547       ldrb(ch2, Address(sp, skipch));
4548       add(str2, str2, cnt1, LSL, str2_chr_shift);
4549       sub(str2, str2, ch2, LSL, str2_chr_shift);
4550     BIND(BMCHECKEND);
4551       cmp(str2, str2end);
4552       br(LE, BMLOOPSTR2);
4553       add(sp, sp, 128);
4554       b(NOMATCH);
4555   }
4556 
4557   BIND(LINEARSEARCH);
4558   {
4559     Label DO1, DO2, DO3;
4560 
4561     Register str2tmp = tmp2;
4562     Register first = tmp3;
4563 
4564     if (icnt1 == -1)
4565     {
4566         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4567 
4568         cmp(cnt1, str1_isL == str2_isL ? 4 : 2);
4569         br(LT, DOSHORT);
4570 
4571         sub(cnt2, cnt2, cnt1);
4572         mov(result_tmp, cnt2);
4573 
4574         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4575         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4576         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4577         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4578         (this->*str1_load_1chr)(first, Address(str1, cnt1_neg));
4579 
4580       BIND(FIRST_LOOP);
4581         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4582         cmp(first, ch2);
4583         br(EQ, STR1_LOOP);
4584       BIND(STR2_NEXT);
4585         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4586         br(LE, FIRST_LOOP);
4587         b(NOMATCH);
4588 
4589       BIND(STR1_LOOP);
4590         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4591         add(cnt2tmp, cnt2_neg, str2_chr_size);
4592         br(GE, MATCH);
4593 
4594       BIND(STR1_NEXT);
4595         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4596         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4597         cmp(ch1, ch2);
4598         br(NE, STR2_NEXT);
4599         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4600         add(cnt2tmp, cnt2tmp, str2_chr_size);
4601         br(LT, STR1_NEXT);
4602         b(MATCH);
4603 
4604       BIND(DOSHORT);
4605       if (str1_isL == str2_isL) {
4606         cmp(cnt1, 2);
4607         br(LT, DO1);
4608         br(GT, DO3);
4609       }
4610     }
4611 
4612     if (icnt1 == 4) {
4613       Label CH1_LOOP;
4614 
4615         (this->*load_4chr)(ch1, str1);
4616         sub(cnt2, cnt2, 4);
4617         mov(result_tmp, cnt2);
4618         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4619         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4620 
4621       BIND(CH1_LOOP);
4622         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4623         cmp(ch1, ch2);
4624         br(EQ, MATCH);
4625         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4626         br(LE, CH1_LOOP);
4627         b(NOMATCH);
4628     }
4629 
4630     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4631       Label CH1_LOOP;
4632 
4633       BIND(DO2);
4634         (this->*load_2chr)(ch1, str1);
4635         sub(cnt2, cnt2, 2);
4636         mov(result_tmp, cnt2);
4637         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4638         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4639 
4640       BIND(CH1_LOOP);
4641         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4642         cmp(ch1, ch2);
4643         br(EQ, MATCH);
4644         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4645         br(LE, CH1_LOOP);
4646         b(NOMATCH);
4647     }
4648 
4649     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4650       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4651 
4652       BIND(DO3);
4653         (this->*load_2chr)(first, str1);
4654         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4655 
4656         sub(cnt2, cnt2, 3);
4657         mov(result_tmp, cnt2);
4658         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4659         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4660 
4661       BIND(FIRST_LOOP);
4662         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4663         cmpw(first, ch2);
4664         br(EQ, STR1_LOOP);
4665       BIND(STR2_NEXT);
4666         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4667         br(LE, FIRST_LOOP);
4668         b(NOMATCH);
4669 
4670       BIND(STR1_LOOP);
4671         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4672         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4673         cmp(ch1, ch2);
4674         br(NE, STR2_NEXT);
4675         b(MATCH);
4676     }
4677 
4678     if (icnt1 == -1 || icnt1 == 1) {
4679       Label CH1_LOOP, HAS_ZERO;
4680       Label DO1_SHORT, DO1_LOOP;
4681 
4682       BIND(DO1);
4683         (this->*str1_load_1chr)(ch1, str1);
4684         cmp(cnt2, 8);
4685         br(LT, DO1_SHORT);
4686 
4687         if (str2_isL) {
4688           if (!str1_isL) {
4689             tst(ch1, 0xff00);
4690             br(NE, NOMATCH);
4691           }
4692           orr(ch1, ch1, ch1, LSL, 8);
4693         }
4694         orr(ch1, ch1, ch1, LSL, 16);
4695         orr(ch1, ch1, ch1, LSL, 32);
4696 
4697         sub(cnt2, cnt2, 8/str2_chr_size);
4698         mov(result_tmp, cnt2);
4699         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4700         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4701 
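        // ch1 now has the sought char replicated into every byte (or
        // halfword) of a 64-bit word, so the loop can test 8 bytes at
        // a time: eor leaves a zero element wherever str2 matches ch1,
        // and the classic SWAR zero-detect
        //     (v - 0x0101..) & ~(v | 0x7f7f..)
        // sets the high bit of exactly the zero elements, so bics
        // yields a non-zero result iff there is a match.  rev + clz in
        // HAS_ZERO then converts the first such bit into the byte
        // offset of the match.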
4702         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4703       BIND(CH1_LOOP);
4704         ldr(ch2, Address(str2, cnt2_neg));
4705         eor(ch2, ch1, ch2);
4706         sub(tmp1, ch2, tmp3);
4707         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4708         bics(tmp1, tmp1, tmp2);
4709         br(NE, HAS_ZERO);
4710         adds(cnt2_neg, cnt2_neg, 8);
4711         br(LT, CH1_LOOP);
4712 
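        // The loop is done and cnt2_neg is now in [0, 8).  The final
        // 8-byte chunk, which ends exactly at the end of str2, may
        // still hold untested characters, so run the loop once more
        // from offset 0; note that mov does not disturb the flags set
        // by cmp.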
4713         cmp(cnt2_neg, 8);
4714         mov(cnt2_neg, 0);
4715         br(LT, CH1_LOOP);
4716         b(NOMATCH);
4717 
4718       BIND(HAS_ZERO);
4719         rev(tmp1, tmp1);
4720         clz(tmp1, tmp1);
4721         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4722         b(MATCH);
4723 
4724       BIND(DO1_SHORT);
4725         mov(result_tmp, cnt2);
4726         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4727         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4728       BIND(DO1_LOOP);
4729         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4730         cmpw(ch1, ch2);
4731         br(EQ, MATCH);
4732         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4733         br(LT, DO1_LOOP);
4734     }
4735   }
4736   BIND(NOMATCH);
4737     mov(result, -1);
4738     b(DONE);
4739   BIND(MATCH);
4740     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4741   BIND(DONE);
4742 }
4743 
4744 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4745 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4746 
4747 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4748                                          Register ch, Register result,
4749                                          Register tmp1, Register tmp2, Register tmp3)
4750 {
4751   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4752   Register cnt1_neg = cnt1;
4753   Register ch1 = rscratch1;
4754   Register result_tmp = rscratch2;
4755 
4756   cmp(cnt1, 4);
4757   br(LT, DO1_SHORT);
4758 
4759   orr(ch, ch, ch, LSL, 16);
4760   orr(ch, ch, ch, LSL, 32);
4761 
4762   sub(cnt1, cnt1, 4);
4763   mov(result_tmp, cnt1);
4764   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4765   sub(cnt1_neg, zr, cnt1, LSL, 1);
4766 
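  // ch is replicated into all four halfwords; the loop below uses the
  // same SWAR zero-detect as string_indexof to test four chars at a
  // time.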
4767   mov(tmp3, 0x0001000100010001);
4768 
4769   BIND(CH1_LOOP);
4770     ldr(ch1, Address(str1, cnt1_neg));
4771     eor(ch1, ch, ch1);
4772     sub(tmp1, ch1, tmp3);
4773     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4774     bics(tmp1, tmp1, tmp2);
4775     br(NE, HAS_ZERO);
4776     adds(cnt1_neg, cnt1_neg, 8);
4777     br(LT, CH1_LOOP);
4778 
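    // As in string_indexof: run the loop once more over the final
    // 8-byte chunk from offset 0 (mov leaves the cmp flags intact).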
4779     cmp(cnt1_neg, 8);
4780     mov(cnt1_neg, 0);
4781     br(LT, CH1_LOOP);
4782     b(NOMATCH);
4783 
4784   BIND(HAS_ZERO);
4785     rev(tmp1, tmp1);
4786     clz(tmp1, tmp1);
4787     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4788     b(MATCH);
4789 
4790   BIND(DO1_SHORT);
4791     mov(result_tmp, cnt1);
4792     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4793     sub(cnt1_neg, zr, cnt1, LSL, 1);
4794   BIND(DO1_LOOP);
4795     ldrh(ch1, Address(str1, cnt1_neg));
4796     cmpw(ch, ch1);
4797     br(EQ, MATCH);
4798     adds(cnt1_neg, cnt1_neg, 2);
4799     br(LT, DO1_LOOP);
4800   BIND(NOMATCH);
4801     mov(result, -1);
4802     b(DONE);
4803   BIND(MATCH);
4804     add(result, result_tmp, cnt1_neg, ASR, 1);
4805   BIND(DONE);
4806 }
4807 
4808 // Compare strings.
4809 void MacroAssembler::string_compare(Register str1, Register str2,
4810                                     Register cnt1, Register cnt2, Register result,
4811                                     Register tmp1,
4812                                     FloatRegister vtmp, FloatRegister vtmpZ, int ae) {
4813   Label LENGTH_DIFF, DONE, SHORT_LOOP, SHORT_STRING,
4814     NEXT_WORD, DIFFERENCE;
4815 
4816   bool isLL = ae == StrIntrinsicNode::LL;
4817   bool isLU = ae == StrIntrinsicNode::LU;
4818   bool isUL = ae == StrIntrinsicNode::UL;
4819 
4820   bool str1_isL = isLL || isLU;
4821   bool str2_isL = isLL || isUL;
4822 
4823   int str1_chr_shift = str1_isL ? 0 : 1;
4824   int str2_chr_shift = str2_isL ? 0 : 1;
4825   int str1_chr_size = str1_isL ? 1 : 2;
4826   int str2_chr_size = str2_isL ? 1 : 2;
4827 
4828   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4829                                       (chr_insn)&MacroAssembler::ldrh;
4830   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4831                                       (chr_insn)&MacroAssembler::ldrh;
4832   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4833                             (uxt_insn)&MacroAssembler::uxthw;
4834 
4835   BLOCK_COMMENT("string_compare {");
4836 
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
4839   if (!str1_isL) asrw(cnt1, cnt1, 1);
4840   if (!str2_isL) asrw(cnt2, cnt2, 1);
4841 
4842   // Compute the minimum of the string lengths and save the difference.
4843   subsw(tmp1, cnt1, cnt2);
4844   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4845 
4846   // A very short string
4847   cmpw(cnt2, isLL ? 8:4);
4848   br(Assembler::LT, SHORT_STRING);
4849 
4850   // Check if the strings start at the same location.
4851   cmp(str1, str2);
4852   br(Assembler::EQ, LENGTH_DIFF);
4853 
4854   // Compare longwords
4855   {
4856     subw(cnt2, cnt2, isLL ? 8:4); // The last longword is a special case
4857 
4858     // Move both string pointers to the last longword of their
4859     // strings, negate the remaining count, and convert it to bytes.
4860     lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4861     lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4862     if (isLU || isUL) {
4863       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4864       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4865     }
4866     sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4867 
4868     // Loop, loading longwords and comparing them into rscratch2.
4869     bind(NEXT_WORD);
4870     if (isLU) {
4871       ldrs(vtmp, Address(str1, cnt1));
4872       zip1(vtmp, T8B, vtmp, vtmpZ);
4873       umov(result, vtmp, D, 0);
4874     } else {
4875       ldr(result, Address(str1, isUL ? cnt1:cnt2));
4876     }
4877     if (isUL) {
4878       ldrs(vtmp, Address(str2, cnt2));
4879       zip1(vtmp, T8B, vtmp, vtmpZ);
4880       umov(rscratch1, vtmp, D, 0);
4881     } else {
4882       ldr(rscratch1, Address(str2, cnt2));
4883     }
4884     adds(cnt2, cnt2, isUL ? 4:8);
4885     if (isLU || isUL) add(cnt1, cnt1, isLU ? 4:8);
4886     eor(rscratch2, result, rscratch1);
4887     cbnz(rscratch2, DIFFERENCE);
4888     br(Assembler::LT, NEXT_WORD);
4889 
4890     // Last longword.  In the case where length == 4 we compare the
4891     // same longword twice, but that's still faster than another
4892     // conditional branch.
4893 
4894     if (isLU) {
4895       ldrs(vtmp, Address(str1));
4896       zip1(vtmp, T8B, vtmp, vtmpZ);
4897       umov(result, vtmp, D, 0);
4898     } else {
4899       ldr(result, Address(str1));
4900     }
4901     if (isUL) {
4902       ldrs(vtmp, Address(str2));
4903       zip1(vtmp, T8B, vtmp, vtmpZ);
4904       umov(rscratch1, vtmp, D, 0);
4905     } else {
4906       ldr(rscratch1, Address(str2));
4907     }
4908     eor(rscratch2, result, rscratch1);
4909     cbz(rscratch2, LENGTH_DIFF);
4910 
4911     // Find the first different characters in the longwords and
4912     // compute their difference.
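    // rev makes the lowest-addressed differing byte the most
    // significant, so clz counts the identical bits before it; andr
    // rounds that count down to a whole number of characters, and lsrv
    // then shifts the first differing character of each word into the
    // low bits for extraction.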
4913     bind(DIFFERENCE);
4914     rev(rscratch2, rscratch2);
4915     clz(rscratch2, rscratch2);
4916     andr(rscratch2, rscratch2, isLL ? -8 : -16);
4917     lsrv(result, result, rscratch2);
4918     (this->*ext_chr)(result, result);
4919     lsrv(rscratch1, rscratch1, rscratch2);
4920     (this->*ext_chr)(rscratch1, rscratch1);
4921     subw(result, result, rscratch1);
4922     b(DONE);
4923   }
4924 
4925   bind(SHORT_STRING);
4926   // Is the minimum length zero?
4927   cbz(cnt2, LENGTH_DIFF);
4928 
4929   bind(SHORT_LOOP);
4930   (this->*str1_load_chr)(result, Address(post(str1, str1_chr_size)));
4931   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
4932   subw(result, result, cnt1);
4933   cbnz(result, DONE);
4934   sub(cnt2, cnt2, 1);
4935   cbnz(cnt2, SHORT_LOOP);
4936 
4937   // Strings are equal up to min length.  Return the length difference.
4938   bind(LENGTH_DIFF);
4939   mov(result, tmp1);
4940 
4941   // That's it
4942   bind(DONE);
4943 
4944   BLOCK_COMMENT("} string_compare");
4945 }
4946 
4947 // Compare Strings or char/byte arrays.
4948 
4949 // is_string is true iff this is a string comparison.
4950 
4951 // For Strings we're passed the address of the first characters in a1
4952 // and a2 and the length in cnt1.
4953 
4954 // For byte and char arrays we're passed the arrays themselves and we
4955 // have to extract length fields and do null checks here.
4956 
4957 // elem_size is the element size in bytes: either 1 or 2.
4958 
// There are two implementations.  For arrays >= 8 bytes, all
// comparisons (including the final one, which may overlap) are
// performed 8 bytes at a time.  For arrays < 8 bytes, we compare
// 4 bytes, then 2, and then 1, as the remaining length requires.
4963 
4964 void MacroAssembler::arrays_equals(Register a1, Register a2,
4965                                    Register result, Register cnt1,
4966                                    int elem_size, bool is_string)
4967 {
4968   Label SAME, DONE, SHORT, NEXT_WORD, ONE;
4969   Register tmp1 = rscratch1;
4970   Register tmp2 = rscratch2;
4971   Register cnt2 = tmp2;  // cnt2 only used in array length compare
4972   int elem_per_word = wordSize/elem_size;
4973   int log_elem_size = exact_log2(elem_size);
4974   int length_offset = arrayOopDesc::length_offset_in_bytes();
4975   int base_offset
4976     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
4977 
4978   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
4979   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
4980 
4981 #ifndef PRODUCT
4982   {
4983     const char kind = (elem_size == 2) ? 'U' : 'L';
4984     char comment[64];
    snprintf(comment, sizeof comment, "%s%c {",
             is_string ? "string_equals" : "array_equals",
             kind);
4988     BLOCK_COMMENT(comment);
4989   }
4990 #endif
4991 
4992   mov(result, false);
4993 
4994   if (!is_string) {
    // if (a1 == a2)
    //     return true;
4997     cmp(a1, a2);
4998     oopDesc::bs()->asm_acmp_barrier(this, a1, a2);
4999     br(Assembler::EQ, SAME);
    // if (a1 == null || a2 == null)
    //     return false;
5002     cbz(a1, DONE);
5003     cbz(a2, DONE);
5004     // if (a1.length != a2.length)
5005     //      return false;
5006     ldrw(cnt1, Address(a1, length_offset));
5007     ldrw(cnt2, Address(a2, length_offset));
5008     eorw(tmp1, cnt1, cnt2);
5009     cbnzw(tmp1, DONE);
5010 
5011     lea(a1, Address(a1, base_offset));
5012     lea(a2, Address(a2, base_offset));
5013   }
5014 
5015   // Check for short strings, i.e. smaller than wordSize.
5016   subs(cnt1, cnt1, elem_per_word);
5017   br(Assembler::LT, SHORT);
5018   // Main 8 byte comparison loop.
5019   bind(NEXT_WORD); {
5020     ldr(tmp1, Address(post(a1, wordSize)));
5021     ldr(tmp2, Address(post(a2, wordSize)));
5022     subs(cnt1, cnt1, elem_per_word);
5023     eor(tmp1, tmp1, tmp2);
5024     cbnz(tmp1, DONE);
5025   } br(GT, NEXT_WORD);
5026   // Last longword.  In the case where length == 4 we compare the
5027   // same longword twice, but that's still faster than another
5028   // conditional branch.
5029   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5030   // length == 4.
5031   if (log_elem_size > 0)
5032     lsl(cnt1, cnt1, log_elem_size);
5033   ldr(tmp1, Address(a1, cnt1));
5034   ldr(tmp2, Address(a2, cnt1));
5035   eor(tmp1, tmp1, tmp2);
5036   cbnz(tmp1, DONE);
5037   b(SAME);
5038 
5039   bind(SHORT);
5040   Label TAIL03, TAIL01;
5041 
5042   tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5043   {
5044     ldrw(tmp1, Address(post(a1, 4)));
5045     ldrw(tmp2, Address(post(a2, 4)));
5046     eorw(tmp1, tmp1, tmp2);
5047     cbnzw(tmp1, DONE);
5048   }
5049   bind(TAIL03);
5050   tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5051   {
5052     ldrh(tmp1, Address(post(a1, 2)));
5053     ldrh(tmp2, Address(post(a2, 2)));
5054     eorw(tmp1, tmp1, tmp2);
5055     cbnzw(tmp1, DONE);
5056   }
5057   bind(TAIL01);
5058   if (elem_size == 1) { // Only needed when comparing byte arrays.
5059     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5060     {
5061       ldrb(tmp1, a1);
5062       ldrb(tmp2, a2);
5063       eorw(tmp1, tmp1, tmp2);
5064       cbnzw(tmp1, DONE);
5065     }
5066   }
5067   // Arrays are equal.
5068   bind(SAME);
5069   mov(result, true);
5070 
5071   // That's it.
5072   bind(DONE);
5073   BLOCK_COMMENT(is_string ? "} string_equals" : "} array_equals");
5074 }
5075 
5076 
// base:     Address of a buffer to be zeroed, 8 bytes aligned.
// cnt:      Count in HeapWords.
5080 void MacroAssembler::zero_words(Register base, Register cnt)
5081 {
5082   if (UseBlockZeroing) {
5083     block_zero(base, cnt);
5084   } else {
5085     fill_words(base, cnt, zr);
5086   }
5087 }
5088 
5089 // r10 = base:   Address of a buffer to be zeroed, 8 bytes aligned.
5090 // cnt:          Immediate count in HeapWords.
5091 // r11 = tmp:    For use as cnt if we need to call out
5092 #define ShortArraySize (18 * BytesPerLong)
5093 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
5094 {
5095   Register tmp = r11;
5096   int i = cnt & 1;  // store any odd word to start
5097   if (i) str(zr, Address(base));
5098 
5099   if (cnt <= ShortArraySize / BytesPerLong) {
5100     for (; i < (int)cnt; i += 2)
5101       stp(zr, zr, Address(base, i * wordSize));
5102   } else if (UseBlockZeroing && cnt >= (u_int64_t)(BlockZeroingLowLimit >> LogBytesPerWord)) {
5103     mov(tmp, cnt);
5104     block_zero(base, tmp, true);
5105   } else {
5106     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
5107     int remainder = cnt % (2 * unroll);
5108     for (; i < remainder; i += 2)
5109       stp(zr, zr, Address(base, i * wordSize));
5110 
5111     Label loop;
5112     Register cnt_reg = rscratch1;
5113     Register loop_base = rscratch2;
5114     cnt = cnt - remainder;
5115     mov(cnt_reg, cnt);
5116     // adjust base and prebias by -2 * wordSize so we can pre-increment
5117     add(loop_base, base, (remainder - 2) * wordSize);
5118     bind(loop);
5119     sub(cnt_reg, cnt_reg, 2 * unroll);
5120     for (i = 1; i < unroll; i++)
5121       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
5122     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
5123     cbnz(cnt_reg, loop);
5124   }
5125 }
5126 
5127 // base:   Address of a buffer to be filled, 8 bytes aligned.
5128 // cnt:    Count in 8-byte unit.
5129 // value:  Value to be filled with.
5130 // base will point to the end of the buffer after filling.
5131 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5132 {
5133 //  Algorithm:
5134 //
5135 //    scratch1 = cnt & 7;
5136 //    cnt -= scratch1;
5137 //    p += scratch1;
5138 //    switch (scratch1) {
5139 //      do {
5140 //        cnt -= 8;
5141 //          p[-8] = v;
5142 //        case 7:
5143 //          p[-7] = v;
5144 //        case 6:
5145 //          p[-6] = v;
5146 //          // ...
5147 //        case 1:
5148 //          p[-1] = v;
5149 //        case 0:
5150 //          p += 8;
5151 //      } while (cnt);
5152 //    }
5153 
5154   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5155 
5156   Label fini, skip, entry, loop;
5157   const int unroll = 8; // Number of stp instructions we'll unroll
5158 
5159   cbz(cnt, fini);
5160   tbz(base, 3, skip);
5161   str(value, Address(post(base, 8)));
5162   sub(cnt, cnt, 1);
5163   bind(skip);
5164 
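  // Computed jump into the unrolled store sequence below: rscratch1
  // holds the residual word count (even, 0..14), and each stp is one
  // 4-byte instruction that stores two words, so backing up
  // rscratch1 << 1 code bytes from 'entry' executes exactly the stores
  // needed before the main loop is entered.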
5165   andr(rscratch1, cnt, (unroll-1) * 2);
5166   sub(cnt, cnt, rscratch1);
5167   add(base, base, rscratch1, Assembler::LSL, 3);
5168   adr(rscratch2, entry);
5169   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5170   br(rscratch2);
5171 
5172   bind(loop);
5173   add(base, base, unroll * 16);
5174   for (int i = -unroll; i < 0; i++)
5175     stp(value, value, Address(base, i * 16));
5176   bind(entry);
5177   subs(cnt, cnt, unroll * 2);
5178   br(Assembler::GE, loop);
5179 
5180   tbz(cnt, 0, fini);
5181   str(value, Address(post(base, 8)));
5182   bind(fini);
5183 }
5184 
5185 // Use DC ZVA to do fast zeroing.
5186 // base:   Address of a buffer to be zeroed, 8 bytes aligned.
5187 // cnt:    Count in HeapWords.
5188 // is_large: True when 'cnt' is known to be >= BlockZeroingLowLimit.
5189 void MacroAssembler::block_zero(Register base, Register cnt, bool is_large)
5190 {
5191   Label small;
5192   Label store_pair, loop_store_pair, done;
5193   Label base_aligned;
5194 
5195   assert_different_registers(base, cnt, rscratch1);
5196   guarantee(base == r10 && cnt == r11, "fix register usage");
5197 
5198   Register tmp = rscratch1;
5199   Register tmp2 = rscratch2;
5200   int zva_length = VM_Version::zva_length();
5201 
  // Ensure the ZVA length is a multiple of 16. This is required by
  // the subsequent operations.
  assert (zva_length % 16 == 0, "Unexpected ZVA Length");
5205 
5206   if (!is_large) cbz(cnt, done);
5207   tbz(base, 3, base_aligned);
5208   str(zr, Address(post(base, 8)));
5209   sub(cnt, cnt, 1);
5210   bind(base_aligned);
5211 
5212   // Ensure count >= zva_length * 2 so that it still deserves a zva after
5213   // alignment.
5214   if (!is_large || !(BlockZeroingLowLimit >= zva_length * 2)) {
5215     int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
5216     subs(tmp, cnt, low_limit >> 3);
5217     br(Assembler::LT, small);
5218   }
5219 
5220   far_call(StubRoutines::aarch64::get_zero_longs());
5221 
5222   bind(small);
5223 
5224   const int unroll = 8; // Number of stp instructions we'll unroll
5225   Label small_loop, small_table_end;
5226 
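  // Residual dispatch: the same computed jump into an unrolled stp
  // sequence as in fill_words, here storing pairs of zeros.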
5227   andr(tmp, cnt, (unroll-1) * 2);
5228   sub(cnt, cnt, tmp);
5229   add(base, base, tmp, Assembler::LSL, 3);
5230   adr(tmp2, small_table_end);
5231   sub(tmp2, tmp2, tmp, Assembler::LSL, 1);
5232   br(tmp2);
5233 
5234   bind(small_loop);
5235   add(base, base, unroll * 16);
5236   for (int i = -unroll; i < 0; i++)
5237     stp(zr, zr, Address(base, i * 16));
5238   bind(small_table_end);
5239   subs(cnt, cnt, unroll * 2);
5240   br(Assembler::GE, small_loop);
5241 
5242   tbz(cnt, 0, done);
5243   str(zr, Address(post(base, 8)));
5244 
5245   bind(done);
5246 }
5247 
5248 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5249 // java/lang/StringUTF16.compress.
5250 void MacroAssembler::encode_iso_array(Register src, Register dst,
5251                       Register len, Register result,
5252                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5253                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5254 {
5255     Label DONE, NEXT_32, LOOP_8, NEXT_8, LOOP_1, NEXT_1;
5256     Register tmp1 = rscratch1;
5257 
5258       mov(result, len); // Save initial len
5259 
5260 #ifndef BUILTIN_SIM
5261       subs(len, len, 32);
5262       br(LT, LOOP_8);
5263 
// The following code uses the SIMD 'uqxtn' and 'uqxtn2' instructions
// to convert chars to bytes.  These set the 'QC' bit in the FPSR if
// any char could not fit in a byte, so clear the FPSR first so that
// we can test it afterwards.
5267       clear_fpsr();
5268 
5269     BIND(NEXT_32);
5270       ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5271       uqxtn(Vtmp1, T8B, Vtmp1, T8H);  // uqxtn  - write bottom half
5272       uqxtn(Vtmp1, T16B, Vtmp2, T8H); // uqxtn2 - write top half
5273       uqxtn(Vtmp2, T8B, Vtmp3, T8H);
5274       uqxtn(Vtmp2, T16B, Vtmp4, T8H); // uqxtn2
5275       get_fpsr(tmp1);
5276       cbnzw(tmp1, LOOP_8);
5277       st1(Vtmp1, Vtmp2, T16B, post(dst, 32));
5278       subs(len, len, 32);
5279       add(src, src, 64);
5280       br(GE, NEXT_32);
5281 
5282     BIND(LOOP_8);
5283       adds(len, len, 32-8);
5284       br(LT, LOOP_1);
5285       clear_fpsr(); // QC may be set from loop above, clear again
5286     BIND(NEXT_8);
5287       ld1(Vtmp1, T8H, src);
5288       uqxtn(Vtmp1, T8B, Vtmp1, T8H);
5289       get_fpsr(tmp1);
5290       cbnzw(tmp1, LOOP_1);
5291       st1(Vtmp1, T8B, post(dst, 8));
5292       subs(len, len, 8);
5293       add(src, src, 16);
5294       br(GE, NEXT_8);
5295 
5296     BIND(LOOP_1);
5297       adds(len, len, 8);
5298       br(LE, DONE);
5299 #else
5300       cbz(len, DONE);
5301 #endif
5302     BIND(NEXT_1);
5303       ldrh(tmp1, Address(post(src, 2)));
5304       tst(tmp1, 0xff00);
5305       br(NE, DONE);
5306       strb(tmp1, Address(post(dst, 1)));
5307       subs(len, len, 1);
5308       br(GT, NEXT_1);
5309 
5310     BIND(DONE);
      sub(result, result, len); // Return the index where we stopped,
                                // which equals the initial len iff all
                                // the characters were processed.
5314 }
5315 
5316 
5317 // Inflate byte[] array to char[].
5318 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5319                                         FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
5320                                         Register tmp4) {
5321   Label big, done;
5322 
5323   assert_different_registers(src, dst, len, tmp4, rscratch1);
5324 
5325   fmovd(vtmp1 , zr);
5326   lsrw(rscratch1, len, 3);
5327 
5328   cbnzw(rscratch1, big);
5329 
5330   // Short string: less than 8 bytes.
5331   {
5332     Label loop, around, tiny;
5333 
5334     subsw(len, len, 4);
5335     andw(len, len, 3);
5336     br(LO, tiny);
5337 
5338     // Use SIMD to do 4 bytes.
5339     ldrs(vtmp2, post(src, 4));
5340     zip1(vtmp3, T8B, vtmp2, vtmp1);
5341     strd(vtmp3, post(dst, 8));
5342 
5343     cbzw(len, done);
5344 
5345     // Do the remaining bytes by steam.
5346     bind(loop);
5347     ldrb(tmp4, post(src, 1));
5348     strh(tmp4, post(dst, 2));
5349     subw(len, len, 1);
5350 
5351     bind(tiny);
5352     cbnz(len, loop);
5353 
5354     bind(around);
5355     b(done);
5356   }
5357 
5358   // Unpack the bytes 8 at a time.
5359   bind(big);
5360   andw(len, len, 7);
5361 
5362   {
5363     Label loop, around;
5364 
5365     bind(loop);
5366     ldrd(vtmp2, post(src, 8));
5367     sub(rscratch1, rscratch1, 1);
5368     zip1(vtmp3, T16B, vtmp2, vtmp1);
5369     st1(vtmp3, T8H, post(dst, 16));
5370     cbnz(rscratch1, loop);
5371 
5372     bind(around);
5373   }
5374 
5375   // Do the tail of up to 8 bytes.
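  // Position src and dst so that this final 8-byte chunk ends exactly
  // at the end of each array; some bytes may be inflated twice, which
  // is harmless and cheaper than a byte-at-a-time tail loop.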
5376   sub(src, src, 8);
5377   add(src, src, len, ext::uxtw, 0);
5378   ldrd(vtmp2, Address(src));
5379   sub(dst, dst, 16);
5380   add(dst, dst, len, ext::uxtw, 1);
5381   zip1(vtmp3, T16B, vtmp2, vtmp1);
5382   st1(vtmp3, T8H, Address(dst));
5383 
5384   bind(done);
5385 }
5386 
5387 // Compress char[] array to byte[].
5388 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5389                                          FloatRegister tmp1Reg, FloatRegister tmp2Reg,
5390                                          FloatRegister tmp3Reg, FloatRegister tmp4Reg,
5391                                          Register result) {
5392   encode_iso_array(src, dst, len, result,
5393                    tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
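  // Compression must be all-or-nothing: encode_iso_array stops early,
  // leaving len != 0, if it meets a char that does not fit in a byte.
  // In that case return 0 rather than a partial count.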
5394   cmp(len, zr);
5395   csel(result, result, zr, EQ);
5396 }
5397 
5398 // get_thread() can be called anywhere inside generated code so we
5399 // need to save whatever non-callee save context might get clobbered
5400 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5401 // the call setup code.
5402 //
5403 // aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5404 //
5405 void MacroAssembler::get_thread(Register dst) {
5406   RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
5407   push(saved_regs, sp);
5408 
5409   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5410   blrt(lr, 1, 0, 1);
5411   if (dst != c_rarg0) {
5412     mov(dst, c_rarg0);
5413   }
5414 
5415   pop(saved_regs, sp);
5416 }
5417 
5418 // Shenandoah requires that all objects are evacuated before being
5419 // written to, and that fromspace pointers are not written into
5420 // objects during concurrent marking.  These methods check for that.
5421 
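// in_heap_check(r) branches to 'nope' unless
//   first_region_bottom <= r < first_region_bottom + region_size * max_regions,
// i.e. unless r points into the Shenandoah heap.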
5422 void MacroAssembler::in_heap_check(Register r, Register tmp, Label &nope) {
5423   ShenandoahHeap *h = (ShenandoahHeap *)Universe::heap();
5424 
5425   HeapWord* first_region_bottom = h->first_region_bottom();
5426   HeapWord* last_region_end = first_region_bottom + (ShenandoahHeapRegion::region_size_bytes() / HeapWordSize) * h->max_regions();
5427 
5428   mov(tmp, (uintptr_t)first_region_bottom);
5429   cmp(r, tmp);
5430   br(Assembler::LO, nope);
5431   mov(tmp, (uintptr_t)last_region_end);
5432   cmp(r, tmp);
5433   br(Assembler::HS, nope);
5434 }
5435 
5436 void MacroAssembler::shenandoah_cset_check(Register obj, Register tmp1, Register tmp2, Label& done) {
5437 
  // Test whether the oop is in the collection set (i.e. is still a
  // from-space copy); if it is not, the write can proceed.
5439   lsr(tmp1, obj, ShenandoahHeapRegion::region_size_shift_jint());
5440   assert(ShenandoahHeap::in_cset_fast_test_addr() != 0, "sanity");
5441   mov(tmp2, ShenandoahHeap::in_cset_fast_test_addr());
5442   ldrb(tmp2, Address(tmp2, tmp1));
5443   tbz(tmp2, 0, done);
5444 
5445   // Check for cancelled GC.
5446   assert(ShenandoahHeap::cancelled_concgc_addr() != 0, "sanity");
5447   mov(tmp2, ShenandoahHeap::cancelled_concgc_addr());
5448   ldrb(tmp2, Address(tmp2));
5449   cbnz(tmp2, done);
5450 }
5451 
5452 void MacroAssembler::_shenandoah_store_check(Address addr, Register value, const char* msg, const char* file, int line) {
5453   _shenandoah_store_check(addr.base(), value, msg, file, line);
5454 }
5455 
5456 void MacroAssembler::_shenandoah_store_check(Register addr, Register value, const char* msg, const char* file, int line) {
5457 
5458   if (! UseShenandoahGC || ! ShenandoahStoreCheck) return;
5459   if (addr == r31_sp || addr == sp) return; // Stack-based target
5460 
5461   Register raddr = r8;
5462   Register rval = r9;
5463   Register tmp1 = r10;
5464   Register tmp2 = r11;
5465 
5466   RegSet to_save = RegSet::of(raddr, rval, tmp1, tmp2);
5467 
5468   // Push tmp regs and flags.
5469   push(to_save, sp);
5470   get_nzcv(tmp1);
5471   push(RegSet::of(tmp1), sp);
5472 
5473   mov(rval, value);
5474   mov(raddr, addr);
5475 
5476   Label done;
5477 
5478   // If not in-heap target, skip check.
5479   in_heap_check(raddr, tmp1, done);
5480 
5481   // Test that target oop is not in to-space.
5482   shenandoah_cset_check(raddr, tmp1, tmp2, done);
5483 
5484   // Do value-check only when concurrent mark is in progress.
5485   mov(tmp1, ShenandoahHeap::concurrent_mark_in_progress_addr());
5486   ldrw(tmp1, Address(tmp1));
5487   cbzw(tmp1, done);
5488 
5489   // Null-check value.
5490   cbz(rval, done);
5491 
5492   // Test that value oop is not in to-space.
5493   shenandoah_cset_check(rval, tmp1, tmp2, done);
5494 
5495   // Failure.
5496   // Pop tmp regs and flags.
5497   pop(RegSet::of(tmp1), sp);
5498   set_nzcv(tmp1);
5499   pop(to_save, sp);
5500   const char* b = NULL;
5501   {
5502     ResourceMark rm;
5503     stringStream ss;
5504     ss.print("shenandoah_store_check: %s in file: %s line: %i", msg, file, line);
5505     b = code_string(ss.as_string());
5506   }
5507   // hlt(0);
5508 
5509   stop(b);
5510 
5511   bind(done);
5512   // Pop tmp regs and flags.
5513   pop(RegSet::of(tmp1), sp);
5514   set_nzcv(tmp1);
5515   pop(to_save, sp);
5516 }
5517 
5518 void MacroAssembler::_shenandoah_store_addr_check(Address addr, const char* msg, const char* file, int line) {
5519   _shenandoah_store_addr_check(addr.base(), msg, file, line);
5520 }
5521 
5522 void MacroAssembler::_shenandoah_store_addr_check(Register dst, const char* msg, const char* file, int line) {
5523 
5524   if (! UseShenandoahGC || ! ShenandoahStoreCheck) return;
5525   if (dst == r31_sp || dst == sp) return; // Stack-based target
5526 
5527   Register addr = r8;
5528   Register tmp1 = r9;
5529   Register tmp2 = r10;
5530 
5531   Label done;
5532   RegSet to_save = RegSet::of(addr, tmp1, tmp2);
5533 
5534   // Push tmp regs and flags.
5535   push(to_save, sp);
5536   get_nzcv(tmp1);
5537   push(RegSet::of(tmp1), sp);
5538 
5539   orr(addr, zr, dst);
5540   // mov(addr, dst);
5541 
5542   // Check null.
5543   cbz(addr, done);
5544 
5545   in_heap_check(addr, tmp1, done);
5546 
5547   shenandoah_cset_check(addr, tmp1, tmp2, done);
5548 
5549   // Fail.
5550   // Pop tmp regs and flags.
5551   pop(RegSet::of(tmp1), sp);
5552   set_nzcv(tmp1);
5553   pop(to_save, sp);
5554   const char* b = NULL;
5555   {
5556     ResourceMark rm;
5557     stringStream ss;
5558     ss.print("shenandoah_store_check: %s in file: %s line: %i", msg, file, line);
5559     b = code_string(ss.as_string());
5560   }
5561   // hlt(0);
5562   stop(b);
5563   // should_not_reach_here();
5564 
5565   bind(done);
5566   // Pop tmp regs and flags.
5567   pop(RegSet::of(tmp1), sp);
5568   set_nzcv(tmp1);
5569   pop(to_save, sp);
5570 
5571 }