1 /*
   2  * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include <sys/types.h>
  27 
  28 #include "precompiled.hpp"
  29 #include "jvm.h"
  30 #include "asm/assembler.hpp"
  31 #include "asm/assembler.inline.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/cardTable.hpp"
  34 #include "gc/shared/barrierSetAssembler.hpp"
  35 #include "gc/shared/cardTableBarrierSet.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "compiler/disassembler.hpp"
  38 #include "memory/resourceArea.hpp"
  39 #include "memory/universe.hpp"
  40 #include "nativeInst_aarch64.hpp"
  41 #include "oops/accessDecorators.hpp"
  42 #include "oops/compressedOops.inline.hpp"
  43 #include "oops/klass.inline.hpp"
  44 #include "runtime/biasedLocking.hpp"
  45 #include "runtime/icache.hpp"
  46 #include "runtime/interfaceSupport.inline.hpp"
  47 #include "runtime/jniHandles.inline.hpp"
  48 #include "runtime/sharedRuntime.hpp"
  49 #include "runtime/thread.hpp"
  50 #include "utilities/powerOfTwo.hpp"
  51 #ifdef COMPILER1
  52 #include "c1/c1_LIRAssembler.hpp"
  53 #endif
  54 #ifdef COMPILER2
  55 #include "oops/oop.hpp"
  56 #include "opto/compile.hpp"
  57 #include "opto/node.hpp"
  58 #include "opto/output.hpp"
  59 #endif
  60 
  61 #ifdef PRODUCT
  62 #define BLOCK_COMMENT(str) /* nothing */
  63 #define STOP(error) stop(error)
  64 #else
  65 #define BLOCK_COMMENT(str) block_comment(str)
  66 #define STOP(error) block_comment(error); stop(error)
  67 #endif
  68 
  69 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  70 
  71 // Patch any kind of instruction; there may be several instructions.
  72 // Return the total length (in bytes) of the instructions.
  73 int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  74   int instructions = 1;
  75   assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  76   long offset = (target - branch) >> 2;
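  // Branch and load-literal forms encode this offset in units of 4-byte
  // instructions (hence the >> 2); the PC-relative adr/adrp case below
  // recomputes a byte or page offset instead.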
  77   unsigned insn = *(unsigned*)branch;
  78   if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
  79     // Load register (literal)
  80     Instruction_aarch64::spatch(branch, 23, 5, offset);
  81   } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
  82     // Unconditional branch (immediate)
  83     Instruction_aarch64::spatch(branch, 25, 0, offset);
  84   } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
  85     // Conditional branch (immediate)
  86     Instruction_aarch64::spatch(branch, 23, 5, offset);
  87   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
  88     // Compare & branch (immediate)
  89     Instruction_aarch64::spatch(branch, 23, 5, offset);
  90   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
  91     // Test & branch (immediate)
  92     Instruction_aarch64::spatch(branch, 18, 5, offset);
  93   } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
  94     // PC-rel. addressing
  95     offset = target-branch;
  96     int shift = Instruction_aarch64::extract(insn, 31, 31);
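    // Bit 31 distinguishes adrp (1, page-relative) from adr (0, byte-relative).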
  97     if (shift) {
      uint64_t dest = (uint64_t)target;
  99       uint64_t pc_page = (uint64_t)branch >> 12;
 100       uint64_t adr_page = (uint64_t)target >> 12;
 101       unsigned offset_lo = dest & 0xfff;
 102       offset = adr_page - pc_page;
 103 
 104       // We handle 4 types of PC relative addressing
 105       //   1 - adrp    Rx, target_page
 106       //       ldr/str Ry, [Rx, #offset_in_page]
 107       //   2 - adrp    Rx, target_page
 108       //       add     Ry, Rx, #offset_in_page
 109       //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 110       //       movk    Rx, #imm16<<32
 111       //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 112       // In the first 3 cases we must check that Rx is the same in the adrp and the
 113       // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
 114       // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
 115       // to be followed by a random unrelated ldr/str, add or movk instruction.
 116       //
 117       unsigned insn2 = ((unsigned*)branch)[1];
 118       if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
 119                 Instruction_aarch64::extract(insn, 4, 0) ==
 120                         Instruction_aarch64::extract(insn2, 9, 5)) {
 121         // Load/store register (unsigned immediate)
 122         unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
 123         Instruction_aarch64::patch(branch + sizeof (unsigned),
 124                                     21, 10, offset_lo >> size);
 125         guarantee(((dest >> size) << size) == dest, "misaligned target");
 126         instructions = 2;
 127       } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
 128                 Instruction_aarch64::extract(insn, 4, 0) ==
 129                         Instruction_aarch64::extract(insn2, 4, 0)) {
 130         // add (immediate)
 131         Instruction_aarch64::patch(branch + sizeof (unsigned),
 132                                    21, 10, offset_lo);
 133         instructions = 2;
 134       } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
 135                    Instruction_aarch64::extract(insn, 4, 0) ==
 136                      Instruction_aarch64::extract(insn2, 4, 0)) {
 137         // movk #imm16<<32
 138         Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
 139         long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
 140         long pc_page = (long)branch >> 12;
 141         long adr_page = (long)dest >> 12;
 142         offset = adr_page - pc_page;
 143         instructions = 2;
 144       }
 145     }
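    // Split the offset into the adr/adrp immlo (bits 30:29) and immhi (bits 23:5) fields.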
 146     int offset_lo = offset & 3;
 147     offset >>= 2;
 148     Instruction_aarch64::spatch(branch, 23, 5, offset);
 149     Instruction_aarch64::patch(branch, 30, 29, offset_lo);
 150   } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    uint64_t dest = (uint64_t)target;
 152     // Move wide constant
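    // A movz/movk/movk sequence materializes a 48-bit constant, 16 bits at a time (see movptr()).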
 153     assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
 154     assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
 155     Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
 156     Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
 157     Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
 158     assert(target_addr_for_insn(branch) == target, "should be");
 159     instructions = 3;
 160   } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
 161              Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
 162     // nothing to do
 163     assert(target == 0, "did not expect to relocate target for polling page load");
 164   } else {
 165     ShouldNotReachHere();
 166   }
 167   return instructions * NativeInstruction::instruction_size;
 168 }
 169 
 170 int MacroAssembler::patch_oop(address insn_addr, address o) {
 171   int instructions;
 172   unsigned insn = *(unsigned*)insn_addr;
 173   assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
 174 
 175   // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
 176   // narrow OOPs by setting the upper 16 bits in the first
 177   // instruction.
 178   if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
 179     // Move narrow OOP
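    // i.e. movz Rd, #(n >> 16), lsl #16 followed by movk Rd, #(n & 0xffff).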
 180     narrowOop n = CompressedOops::encode((oop)o);
 181     Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
 182     Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
 183     instructions = 2;
 184   } else {
 185     // Move wide OOP
 186     assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
 187     uintptr_t dest = (uintptr_t)o;
 188     Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
 189     Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
 190     Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
 191     instructions = 3;
 192   }
 193   return instructions * NativeInstruction::instruction_size;
 194 }
 195 
 196 int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
 198   // We encode narrow ones by setting the upper 16 bits in the first
 199   // instruction.
 200   NativeInstruction *insn = nativeInstruction_at(insn_addr);
 201   assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
 202          nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
 203 
 204   Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
 205   Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
 206   return 2 * NativeInstruction::instruction_size;
 207 }
 208 
 209 address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
 210   long offset = 0;
 211   if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
 212     // Load register (literal)
 213     offset = Instruction_aarch64::sextract(insn, 23, 5);
 214     return address(((uint64_t)insn_addr + (offset << 2)));
 215   } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
 216     // Unconditional branch (immediate)
 217     offset = Instruction_aarch64::sextract(insn, 25, 0);
 218   } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
 219     // Conditional branch (immediate)
 220     offset = Instruction_aarch64::sextract(insn, 23, 5);
 221   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
 222     // Compare & branch (immediate)
 223     offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
 225     // Test & branch (immediate)
 226     offset = Instruction_aarch64::sextract(insn, 18, 5);
 227   } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
 228     // PC-rel. addressing
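    // Reassemble the signed 21-bit adr/adrp immediate from immlo (bits 30:29) and immhi (bits 23:5).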
 229     offset = Instruction_aarch64::extract(insn, 30, 29);
 230     offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
 231     int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
 232     if (shift) {
 233       offset <<= shift;
 234       uint64_t target_page = ((uint64_t)insn_addr) + offset;
 235       target_page &= ((uint64_t)-1) << shift;
 236       // Return the target address for the following sequences
 237       //   1 - adrp    Rx, target_page
 238       //       ldr/str Ry, [Rx, #offset_in_page]
 239       //   2 - adrp    Rx, target_page
 240       //       add     Ry, Rx, #offset_in_page
 241       //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
 243       //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 244       //
 245       // In the first two cases  we check that the register is the same and
 246       // return the target_page + the offset within the page.
 247       // Otherwise we assume it is a page aligned relocation and return
 248       // the target page only.
 249       //
 250       unsigned insn2 = ((unsigned*)insn_addr)[1];
 251       if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
 252                 Instruction_aarch64::extract(insn, 4, 0) ==
 253                         Instruction_aarch64::extract(insn2, 9, 5)) {
 254         // Load/store register (unsigned immediate)
 255         unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
 256         unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
 257         return address(target_page + (byte_offset << size));
 258       } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
 259                 Instruction_aarch64::extract(insn, 4, 0) ==
 260                         Instruction_aarch64::extract(insn2, 4, 0)) {
 261         // add (immediate)
 262         unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
 263         return address(target_page + byte_offset);
 264       } else {
 265         if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110  &&
 266                Instruction_aarch64::extract(insn, 4, 0) ==
 267                  Instruction_aarch64::extract(insn2, 4, 0)) {
 268           target_page = (target_page & 0xffffffff) |
 269                          ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
 270         }
 271         return (address)target_page;
 272       }
 273     } else {
 274       ShouldNotReachHere();
 275     }
 276   } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    uint32_t *insns = (uint32_t *)insn_addr;
 278     // Move wide constant: movz, movk, movk.  See movptr().
 279     assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
 280     assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(uint64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (uint64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (uint64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
 284   } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
 285              Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
 286     return 0;
 287   } else {
 288     ShouldNotReachHere();
 289   }
 290   return address(((uint64_t)insn_addr + (offset << 2)));
 291 }
 292 
 293 void MacroAssembler::safepoint_poll(Label& slow_path) {
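  // The per-thread polling page pointer has the poll bit set when a safepoint or handshake is pending.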
 294   ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
 295   tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
 296 }
 297 
 298 // Just like safepoint_poll, but use an acquiring load for thread-
 299 // local polling.
 300 //
 301 // We need an acquire here to ensure that any subsequent load of the
 302 // global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling_page.  We don't want this poll to
 304 // return false (i.e. not safepointing) and a later poll of the global
 305 // SafepointSynchronize::_state spuriously to return true.
 306 //
 307 // This is to avoid a race when we're in a native->Java transition
 308 // racing the code which wakes up from a safepoint.
 309 //
 310 void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
 311   lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
 312   ldar(rscratch1, rscratch1);
 313   tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
 314 }
 315 
 316 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 317   // we must set sp to zero to clear frame
 318   str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));
 319 
 320   // must clear fp, so that compiled frames are not confused; it is
 321   // possible that we need it only for debugging
 322   if (clear_fp) {
 323     str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
 324   }
 325 
 326   // Always clear the pc because it could have been set by make_walkable()
 327   str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
 328 }
 329 
 330 // Calls to C land
 331 //
// When entering C land, the rfp & sp of the last Java frame have to be recorded
 333 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
 334 // has to be reset to 0. This is required to allow proper stack traversal.
 335 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 336                                          Register last_java_fp,
 337                                          Register last_java_pc,
 338                                          Register scratch) {
 339 
  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }
 345 
 346   // determine last_java_sp register
 347   if (last_java_sp == sp) {
 348     mov(scratch, sp);
 349     last_java_sp = scratch;
 350   } else if (!last_java_sp->is_valid()) {
 351     last_java_sp = esp;
 352   }
 353 
 354   str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));
 355 
 356   // last_java_fp is optional
 357   if (last_java_fp->is_valid()) {
 358     str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
 359   }
 360 }
 361 
 362 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 363                                          Register last_java_fp,
 364                                          address  last_java_pc,
 365                                          Register scratch) {
 366   assert(last_java_pc != NULL, "must provide a valid PC");
 367 
 368   adr(scratch, last_java_pc);
 369   str(scratch, Address(rthread,
 370                        JavaThread::frame_anchor_offset()
 371                        + JavaFrameAnchor::last_Java_pc_offset()));
 372 
 373   set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
 374 }
 375 
 376 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 377                                          Register last_java_fp,
 378                                          Label &L,
 379                                          Register scratch) {
 380   if (L.is_bound()) {
 381     set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
 382   } else {
 383     InstructionMark im(this);
 384     L.add_patch_at(code(), locator());
 385     set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
 386   }
 387 }
 388 
 389 void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
 390   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
 391   assert(CodeCache::find_blob(entry.target()) != NULL,
 392          "destination of far call not found in code cache");
 393   if (far_branches()) {
 394     unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb (see the assert above; ADRP has a +/- 4Gb range).
 397     adrp(tmp, entry, offset);
 398     add(tmp, tmp, offset);
 399     if (cbuf) cbuf->set_insts_mark();
 400     blr(tmp);
 401   } else {
 402     if (cbuf) cbuf->set_insts_mark();
 403     bl(entry);
 404   }
 405 }
 406 
 407 void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
 408   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
 411   if (far_branches()) {
 412     unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb (see the assert above; ADRP has a +/- 4Gb range).
 415     adrp(tmp, entry, offset);
 416     add(tmp, tmp, offset);
 417     if (cbuf) cbuf->set_insts_mark();
 418     br(tmp);
 419   } else {
 420     if (cbuf) cbuf->set_insts_mark();
 421     b(entry);
 422   }
 423 }
 424 
 425 void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
 447 }
 448 
 449 int MacroAssembler::biased_locking_enter(Register lock_reg,
 450                                          Register obj_reg,
 451                                          Register swap_reg,
 452                                          Register tmp_reg,
 453                                          bool swap_reg_contains_mark,
 454                                          Label& done,
 455                                          Label* slow_case,
 456                                          BiasedLockingCounters* counters) {
 457   assert(UseBiasedLocking, "why call this otherwise?");
 458   assert_different_registers(lock_reg, obj_reg, swap_reg);
 459 
 460   if (PrintBiasedLockingStatistics && counters == NULL)
 461     counters = BiasedLocking::counters();
 462 
 463   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
 464   assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, "biased locking makes assumptions about bit layout");
 465   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
 466   Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
 467   Address saved_mark_addr(lock_reg, 0);
 468 
 469   // Biased locking
 470   // See whether the lock is currently biased toward our thread and
 471   // whether the epoch is still valid
 472   // Note that the runtime guarantees sufficient alignment of JavaThread
 473   // pointers to allow age to be placed into low bits
 474   // First check to see whether biasing is even enabled for this object
 475   Label cas_label;
 476   int null_check_offset = -1;
 477   if (!swap_reg_contains_mark) {
 478     null_check_offset = offset();
 479     ldr(swap_reg, mark_addr);
 480   }
 481   andr(tmp_reg, swap_reg, markWord::biased_lock_mask_in_place);
 482   cmp(tmp_reg, (u1)markWord::biased_lock_pattern);
 483   br(Assembler::NE, cas_label);
 484   // The bias pattern is present in the object's header. Need to check
 485   // whether the bias owner and the epoch are both still current.
 486   load_prototype_header(tmp_reg, obj_reg);
 487   orr(tmp_reg, tmp_reg, rthread);
 488   eor(tmp_reg, swap_reg, tmp_reg);
 489   andr(tmp_reg, tmp_reg, ~((int) markWord::age_mask_in_place));
 490   if (counters != NULL) {
 491     Label around;
 492     cbnz(tmp_reg, around);
 493     atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
 494     b(done);
 495     bind(around);
 496   } else {
 497     cbz(tmp_reg, done);
 498   }
 499 
 500   Label try_revoke_bias;
 501   Label try_rebias;
 502 
 503   // At this point we know that the header has the bias pattern and
 504   // that we are not the bias owner in the current epoch. We need to
 505   // figure out more details about the state of the header in order to
 506   // know what operations can be legally performed on the object's
 507   // header.
 508 
 509   // If the low three bits in the xor result aren't clear, that means
 510   // the prototype header is no longer biased and we have to revoke
 511   // the bias on this object.
 512   andr(rscratch1, tmp_reg, markWord::biased_lock_mask_in_place);
 513   cbnz(rscratch1, try_revoke_bias);
 514 
 515   // Biasing is still enabled for this data type. See whether the
 516   // epoch of the current bias is still valid, meaning that the epoch
 517   // bits of the mark word are equal to the epoch bits of the
 518   // prototype header. (Note that the prototype header's epoch bits
 519   // only change at a safepoint.) If not, attempt to rebias the object
 520   // toward the current thread. Note that we must be absolutely sure
 521   // that the current epoch is invalid in order to do this because
 522   // otherwise the manipulations it performs on the mark word are
 523   // illegal.
 524   andr(rscratch1, tmp_reg, markWord::epoch_mask_in_place);
 525   cbnz(rscratch1, try_rebias);
 526 
 527   // The epoch of the current bias is still valid but we know nothing
 528   // about the owner; it might be set or it might be clear. Try to
 529   // acquire the bias of the object using an atomic operation. If this
 530   // fails we will go in to the runtime to revoke the object's bias.
 531   // Note that we first construct the presumed unbiased header so we
 532   // don't accidentally blow away another thread's valid bias.
 533   {
 534     Label here;
 535     mov(rscratch1, markWord::biased_lock_mask_in_place | markWord::age_mask_in_place | markWord::epoch_mask_in_place);
 536     andr(swap_reg, swap_reg, rscratch1);
 537     orr(tmp_reg, swap_reg, rthread);
 538     cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
 539     // If the biasing toward our thread failed, this means that
 540     // another thread succeeded in biasing it toward itself and we
 541     // need to revoke that bias. The revocation will occur in the
 542     // interpreter runtime in the slow case.
 543     bind(here);
 544     if (counters != NULL) {
 545       atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
 546                   tmp_reg, rscratch1, rscratch2);
 547     }
 548   }
 549   b(done);
 550 
 551   bind(try_rebias);
 552   // At this point we know the epoch has expired, meaning that the
 553   // current "bias owner", if any, is actually invalid. Under these
 554   // circumstances _only_, we are allowed to use the current header's
 555   // value as the comparison value when doing the cas to acquire the
 556   // bias in the current epoch. In other words, we allow transfer of
 557   // the bias from one thread to another directly in this situation.
 558   //
 559   // FIXME: due to a lack of registers we currently blow away the age
 560   // bits in this situation. Should attempt to preserve them.
 561   {
 562     Label here;
 563     load_prototype_header(tmp_reg, obj_reg);
 564     orr(tmp_reg, rthread, tmp_reg);
 565     cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
 566     // If the biasing toward our thread failed, then another thread
 567     // succeeded in biasing it toward itself and we need to revoke that
 568     // bias. The revocation will occur in the runtime in the slow case.
 569     bind(here);
 570     if (counters != NULL) {
 571       atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
 572                   tmp_reg, rscratch1, rscratch2);
 573     }
 574   }
 575   b(done);
 576 
 577   bind(try_revoke_bias);
 578   // The prototype mark in the klass doesn't have the bias bit set any
 579   // more, indicating that objects of this data type are not supposed
 580   // to be biased any more. We are going to try to reset the mark of
 581   // this object to the prototype value and fall through to the
 582   // CAS-based locking scheme. Note that if our CAS fails, it means
 583   // that another thread raced us for the privilege of revoking the
 584   // bias of this particular object, so it's okay to continue in the
 585   // normal locking code.
 586   //
 587   // FIXME: due to a lack of registers we currently blow away the age
 588   // bits in this situation. Should attempt to preserve them.
 589   {
 590     Label here, nope;
 591     load_prototype_header(tmp_reg, obj_reg);
 592     cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
 593     bind(here);
 594 
 595     // Fall through to the normal CAS-based lock, because no matter what
 596     // the result of the above CAS, some thread must have succeeded in
 597     // removing the bias bit from the object's header.
 598     if (counters != NULL) {
 599       atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
 600                   rscratch1, rscratch2);
 601     }
 602     bind(nope);
 603   }
 604 
 605   bind(cas_label);
 606 
 607   return null_check_offset;
 608 }
 609 
 610 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
 611   assert(UseBiasedLocking, "why call this otherwise?");
 612 
 613   // Check for biased locking unlock case, which is a no-op
 614   // Note: we do not have to check the thread ID for two reasons.
 615   // First, the interpreter checks for IllegalMonitorStateException at
 616   // a higher level. Second, if the bias was revoked while we held the
 617   // lock, the object could not be rebiased toward another thread, so
 618   // the bias bit would be clear.
 619   ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
 620   andr(temp_reg, temp_reg, markWord::biased_lock_mask_in_place);
 621   cmp(temp_reg, (u1)markWord::biased_lock_pattern);
 622   br(Assembler::EQ, done);
 623 }
 624 
 625 static void pass_arg0(MacroAssembler* masm, Register arg) {
 626   if (c_rarg0 != arg ) {
 627     masm->mov(c_rarg0, arg);
 628   }
 629 }
 630 
 631 static void pass_arg1(MacroAssembler* masm, Register arg) {
 632   if (c_rarg1 != arg ) {
 633     masm->mov(c_rarg1, arg);
 634   }
 635 }
 636 
 637 static void pass_arg2(MacroAssembler* masm, Register arg) {
 638   if (c_rarg2 != arg ) {
 639     masm->mov(c_rarg2, arg);
 640   }
 641 }
 642 
 643 static void pass_arg3(MacroAssembler* masm, Register arg) {
 644   if (c_rarg3 != arg ) {
 645     masm->mov(c_rarg3, arg);
 646   }
 647 }
 648 
 649 void MacroAssembler::call_VM_base(Register oop_result,
 650                                   Register java_thread,
 651                                   Register last_java_sp,
 652                                   address  entry_point,
 653                                   int      number_of_arguments,
 654                                   bool     check_exceptions) {
 655    // determine java_thread register
 656   if (!java_thread->is_valid()) {
 657     java_thread = rthread;
 658   }
 659 
 660   // determine last_java_sp register
 661   if (!last_java_sp->is_valid()) {
 662     last_java_sp = esp;
 663   }
 664 
 665   // debugging support
 666   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
 667   assert(java_thread == rthread, "unexpected register");
 668 #ifdef ASSERT
 669   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
 670   // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
 671 #endif // ASSERT
 672 
 673   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
 674   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
 675 
 676   // push java thread (becomes first argument of C function)
 677 
 678   mov(c_rarg0, java_thread);
 679 
 680   // set last Java frame before call
 681   assert(last_java_sp != rfp, "can't use rfp");
 682 
 683   Label l;
 684   set_last_Java_frame(last_java_sp, rfp, l, rscratch1);
 685 
 686   // do the call, remove parameters
 687   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
 688 
 689   // reset last Java frame
 690   // Only interpreter should have to clear fp
 691   reset_last_Java_frame(true);
 692 
  // check_and_handle_popframe/earlyret are no-ops here; InterpreterMacroAssembler overrides them
 694   check_and_handle_popframe(java_thread);
 695   check_and_handle_earlyret(java_thread);
 696 
 697   if (check_exceptions) {
 698     // check for pending exceptions (java_thread is set upon return)
 699     ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
 700     Label ok;
 701     cbz(rscratch1, ok);
 702     lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
 703     br(rscratch1);
 704     bind(ok);
 705   }
 706 
 707   // get oop result if there is one and reset the value in the thread
 708   if (oop_result->is_valid()) {
 709     get_vm_result(oop_result, java_thread);
 710   }
 711 }
 712 
 713 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
 714   call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
 715 }
 716 
 717 // Maybe emit a call via a trampoline.  If the code cache is small
 718 // trampolines won't be emitted.
 719 
 720 address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
 721   assert(JavaThread::current()->is_Compiler_thread(), "just checking");
 722   assert(entry.rspec().type() == relocInfo::runtime_call_type
 723          || entry.rspec().type() == relocInfo::opt_virtual_call_type
 724          || entry.rspec().type() == relocInfo::static_call_type
 725          || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
 726 
 727   // We need a trampoline if branches are far.
 728   if (far_branches()) {
 729     bool in_scratch_emit_size = false;
 730 #ifdef COMPILER2
 731     // We don't want to emit a trampoline if C2 is generating dummy
 732     // code during its branch shortening phase.
 733     CompileTask* task = ciEnv::current()->task();
 734     in_scratch_emit_size =
 735       (task != NULL && is_c2_compile(task->comp_level()) &&
 736        Compile::current()->output()->in_scratch_emit_size());
 737 #endif
 738     if (!in_scratch_emit_size) {
 739       address stub = emit_trampoline_stub(offset(), entry.target());
 740       if (stub == NULL) {
 741         return NULL; // CodeCache is full
 742       }
 743     }
 744   }
 745 
 746   if (cbuf) cbuf->set_insts_mark();
 747   relocate(entry.rspec());
 748   if (!far_branches()) {
 749     bl(entry.target());
 750   } else {
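    // Placeholder branch-to-self; the real destination (or a branch to the
    // trampoline stub) is filled in later when the call relocation is processed.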
 751     bl(pc());
 752   }
 753   // just need to return a non-null address
 754   return pc();
 755 }
 756 
 757 
 758 // Emit a trampoline stub for a call to a target which is too far away.
 759 //
 760 // code sequences:
 761 //
 762 // call-site:
 763 //   branch-and-link to <destination> or <trampoline stub>
 764 //
 765 // Related trampoline stub for this call site in the stub section:
 766 //   load the call target from the constant pool
 767 //   branch (LR still points to the call site above)
 768 
 769 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
 770                                              address dest) {
 771   // Max stub size: alignment nop, TrampolineStub.
 772   address stub = start_a_stub(NativeInstruction::instruction_size
 773                    + NativeCallTrampolineStub::instruction_size);
 774   if (stub == NULL) {
 775     return NULL;  // CodeBuffer::expand failed
 776   }
 777 
 778   // Create a trampoline stub relocation which relates this trampoline stub
 779   // with the call instruction at insts_call_instruction_offset in the
 780   // instructions code-section.
 781   align(wordSize);
 782   relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
 783                                             + insts_call_instruction_offset));
 784   const int stub_start_offset = offset();
 785 
  // Now, create the trampoline stub's code:
  // - load the call target address from the 64-bit constant emitted below
  // - branch to it
 789   Label target;
 790   ldr(rscratch1, target);
 791   br(rscratch1);
 792   bind(target);
 793   assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
 794          "should be");
 795   emit_int64((int64_t)dest);
 796 
 797   const address stub_start_addr = addr_at(stub_start_offset);
 798 
 799   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
 800 
 801   end_a_stub();
 802   return stub_start_addr;
 803 }
 804 
 805 void MacroAssembler::emit_static_call_stub() {
 806   // CompiledDirectStaticCall::set_to_interpreted knows the
 807   // exact layout of this stub.
 808 
 809   isb();
 810   mov_metadata(rmethod, (Metadata*)NULL);
 811 
 812   // Jump to the entry point of the i2c stub.
 813   movptr(rscratch1, 0);
 814   br(rscratch1);
 815 }
 816 
 817 void MacroAssembler::c2bool(Register x) {
 818   // implements x == 0 ? 0 : 1
 819   // note: must only look at least-significant byte of x
 820   //       since C-style booleans are stored in one byte
 821   //       only! (was bug)
 822   tst(x, 0xff);
 823   cset(x, Assembler::NE);
 824 }
 825 
 826 address MacroAssembler::ic_call(address entry, jint method_index) {
 827   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
 828   // address const_ptr = long_constant((jlong)Universe::non_oop_word());
 829   // unsigned long offset;
 830   // ldr_constant(rscratch2, const_ptr);
 831   movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
 832   return trampoline_call(Address(entry, rh));
 833 }
 834 
 835 // Implementation of call_VM versions
 836 
 837 void MacroAssembler::call_VM(Register oop_result,
 838                              address entry_point,
 839                              bool check_exceptions) {
 840   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
 841 }
 842 
 843 void MacroAssembler::call_VM(Register oop_result,
 844                              address entry_point,
 845                              Register arg_1,
 846                              bool check_exceptions) {
 847   pass_arg1(this, arg_1);
 848   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
 849 }
 850 
 851 void MacroAssembler::call_VM(Register oop_result,
 852                              address entry_point,
 853                              Register arg_1,
 854                              Register arg_2,
 855                              bool check_exceptions) {
 856   assert(arg_1 != c_rarg2, "smashed arg");
 857   pass_arg2(this, arg_2);
 858   pass_arg1(this, arg_1);
 859   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
 860 }
 861 
 862 void MacroAssembler::call_VM(Register oop_result,
 863                              address entry_point,
 864                              Register arg_1,
 865                              Register arg_2,
 866                              Register arg_3,
 867                              bool check_exceptions) {
 868   assert(arg_1 != c_rarg3, "smashed arg");
 869   assert(arg_2 != c_rarg3, "smashed arg");
 870   pass_arg3(this, arg_3);
 871 
 872   assert(arg_1 != c_rarg2, "smashed arg");
 873   pass_arg2(this, arg_2);
 874 
 875   pass_arg1(this, arg_1);
 876   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
 877 }
 878 
 879 void MacroAssembler::call_VM(Register oop_result,
 880                              Register last_java_sp,
 881                              address entry_point,
 882                              int number_of_arguments,
 883                              bool check_exceptions) {
 884   call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
 885 }
 886 
 887 void MacroAssembler::call_VM(Register oop_result,
 888                              Register last_java_sp,
 889                              address entry_point,
 890                              Register arg_1,
 891                              bool check_exceptions) {
 892   pass_arg1(this, arg_1);
 893   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
 894 }
 895 
 896 void MacroAssembler::call_VM(Register oop_result,
 897                              Register last_java_sp,
 898                              address entry_point,
 899                              Register arg_1,
 900                              Register arg_2,
 901                              bool check_exceptions) {
 902 
 903   assert(arg_1 != c_rarg2, "smashed arg");
 904   pass_arg2(this, arg_2);
 905   pass_arg1(this, arg_1);
 906   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
 907 }
 908 
 909 void MacroAssembler::call_VM(Register oop_result,
 910                              Register last_java_sp,
 911                              address entry_point,
 912                              Register arg_1,
 913                              Register arg_2,
 914                              Register arg_3,
 915                              bool check_exceptions) {
 916   assert(arg_1 != c_rarg3, "smashed arg");
 917   assert(arg_2 != c_rarg3, "smashed arg");
 918   pass_arg3(this, arg_3);
 919   assert(arg_1 != c_rarg2, "smashed arg");
 920   pass_arg2(this, arg_2);
 921   pass_arg1(this, arg_1);
 922   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
 923 }
 924 
 925 
 926 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
 927   ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
 928   str(zr, Address(java_thread, JavaThread::vm_result_offset()));
 929   verify_oop(oop_result, "broken oop in call_VM_base");
 930 }
 931 
 932 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
 933   ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
 934   str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
 935 }
 936 
 937 void MacroAssembler::align(int modulus) {
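  // Pad with nops until the current code offset is a multiple of modulus.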
 938   while (offset() % modulus != 0) nop();
 939 }
 940 
 941 // these are no-ops overridden by InterpreterMacroAssembler
 942 
 943 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }
 944 
 945 void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
 946 
 947 
 948 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 949                                                       Register tmp,
 950                                                       int offset) {
 951   intptr_t value = *delayed_value_addr;
 952   if (value != 0)
 953     return RegisterOrConstant(value + offset);
 954 
 955   // load indirectly to solve generation ordering problem
 956   ldr(tmp, ExternalAddress((address) delayed_value_addr));
 957 
 958   if (offset != 0)
 959     add(tmp, tmp, offset);
 960 
 961   return RegisterOrConstant(tmp);
 962 }
 963 
 964 // Look up the method for a megamorphic invokeinterface call.
 965 // The target method is determined by <intf_klass, itable_index>.
 966 // The receiver klass is in recv_klass.
 967 // On success, the result will be in method_result, and execution falls through.
 968 // On failure, execution transfers to the given label.
 969 void MacroAssembler::lookup_interface_method(Register recv_klass,
 970                                              Register intf_klass,
 971                                              RegisterOrConstant itable_index,
 972                                              Register method_result,
 973                                              Register scan_temp,
 974                                              Label& L_no_such_interface,
                                             bool return_method) {
 976   assert_different_registers(recv_klass, intf_klass, scan_temp);
 977   assert_different_registers(method_result, intf_klass, scan_temp);
 978   assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
 980   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
 981          "caller must use same register for non-constant itable index as for method");
 982 
 983   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
 984   int vtable_base = in_bytes(Klass::vtable_start_offset());
 985   int itentry_off = itableMethodEntry::method_offset_in_bytes();
 986   int scan_step   = itableOffsetEntry::size() * wordSize;
 987   int vte_size    = vtableEntry::size_in_bytes();
 988   assert(vte_size == wordSize, "else adjust times_vte_scale");
 989 
 990   ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
 991 
 992   // %%% Could store the aligned, prescaled offset in the klassoop.
 993   // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
 994   lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
 995   add(scan_temp, scan_temp, vtable_base);
 996 
 997   if (return_method) {
 998     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
 999     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1000     // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
1001     lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
1002     if (itentry_off)
1003       add(recv_klass, recv_klass, itentry_off);
1004   }
1005 
1006   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1007   //   if (scan->interface() == intf) {
1008   //     result = (klass + scan->offset() + itable_index);
1009   //   }
1010   // }
1011   Label search, found_method;
1012 
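  // The first itable entry is checked by peeled code so the common hit takes a
  // single forward branch instead of entering the scan loop.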
1013   for (int peel = 1; peel >= 0; peel--) {
1014     ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
1015     cmp(intf_klass, method_result);
1016 
1017     if (peel) {
1018       br(Assembler::EQ, found_method);
1019     } else {
1020       br(Assembler::NE, search);
1021       // (invert the test to fall through to found_method...)
1022     }
1023 
1024     if (!peel)  break;
1025 
1026     bind(search);
1027 
1028     // Check that the previous entry is non-null.  A null entry means that
1029     // the receiver class doesn't implement the interface, and wasn't the
1030     // same as when the caller was compiled.
1031     cbz(method_result, L_no_such_interface);
1032     add(scan_temp, scan_temp, scan_step);
1033   }
1034 
1035   bind(found_method);
1036 
1037   // Got a hit.
1038   if (return_method) {
1039     ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
1040     ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
1041   }
1042 }
1043 
1044 // virtual method calling
1045 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1046                                            RegisterOrConstant vtable_index,
1047                                            Register method_result) {
1048   const int base = in_bytes(Klass::vtable_start_offset());
1049   assert(vtableEntry::size() * wordSize == 8,
1050          "adjust the scaling in the code below");
1051   int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();
1052 
1053   if (vtable_index.is_register()) {
1054     lea(method_result, Address(recv_klass,
1055                                vtable_index.as_register(),
1056                                Address::lsl(LogBytesPerWord)));
1057     ldr(method_result, Address(method_result, vtable_offset_in_bytes));
1058   } else {
1059     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
1060     ldr(method_result,
1061         form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
1062   }
1063 }
1064 
1065 void MacroAssembler::check_klass_subtype(Register sub_klass,
1066                            Register super_klass,
1067                            Register temp_reg,
1068                            Label& L_success) {
1069   Label L_failure;
1070   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
1071   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
1072   bind(L_failure);
1073 }
1074 
1075 
1076 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1077                                                    Register super_klass,
1078                                                    Register temp_reg,
1079                                                    Label* L_success,
1080                                                    Label* L_failure,
1081                                                    Label* L_slow_path,
1082                                         RegisterOrConstant super_check_offset) {
1083   assert_different_registers(sub_klass, super_klass, temp_reg);
1084   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1085   if (super_check_offset.is_register()) {
1086     assert_different_registers(sub_klass, super_klass,
1087                                super_check_offset.as_register());
1088   } else if (must_load_sco) {
1089     assert(temp_reg != noreg, "supply either a temp or a register offset");
1090   }
1091 
1092   Label L_fallthrough;
1093   int label_nulls = 0;
1094   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1095   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1096   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1097   assert(label_nulls <= 1, "at most one NULL in the batch");
1098 
1099   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1100   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1101   Address super_check_offset_addr(super_klass, sco_offset);
1102 
1103   // Hacked jmp, which may only be used just before L_fallthrough.
1104 #define final_jmp(label)                                                \
1105   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
1106   else                            b(label)                /*omit semi*/
1107 
1108   // If the pointers are equal, we are done (e.g., String[] elements).
1109   // This self-check enables sharing of secondary supertype arrays among
1110   // non-primary types such as array-of-interface.  Otherwise, each such
1111   // type would need its own customized SSA.
1112   // We move this check to the front of the fast path because many
1113   // type checks are in fact trivially successful in this manner,
1114   // so we get a nicely predicted branch right at the start of the check.
1115   cmp(sub_klass, super_klass);
1116   br(Assembler::EQ, *L_success);
1117 
1118   // Check the supertype display:
1119   if (must_load_sco) {
1120     ldrw(temp_reg, super_check_offset_addr);
1121     super_check_offset = RegisterOrConstant(temp_reg);
1122   }
1123   Address super_check_addr(sub_klass, super_check_offset);
1124   ldr(rscratch1, super_check_addr);
1125   cmp(super_klass, rscratch1); // load displayed supertype
1126 
1127   // This check has worked decisively for primary supers.
1128   // Secondary supers are sought in the super_cache ('super_cache_addr').
1129   // (Secondary supers are interfaces and very deeply nested subtypes.)
1130   // This works in the same check above because of a tricky aliasing
1131   // between the super_cache and the primary super display elements.
1132   // (The 'super_check_addr' can address either, as the case requires.)
1133   // Note that the cache is updated below if it does not help us find
1134   // what we need immediately.
1135   // So if it was a primary super, we can just fail immediately.
1136   // Otherwise, it's the slow path for us (no success at this point).
1137 
1138   if (super_check_offset.is_register()) {
1139     br(Assembler::EQ, *L_success);
1140     subs(zr, super_check_offset.as_register(), sc_offset);
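    // subs into zr is a flags-only compare of the offset against the secondary-super-cache offset.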
1141     if (L_failure == &L_fallthrough) {
1142       br(Assembler::EQ, *L_slow_path);
1143     } else {
1144       br(Assembler::NE, *L_failure);
1145       final_jmp(*L_slow_path);
1146     }
1147   } else if (super_check_offset.as_constant() == sc_offset) {
1148     // Need a slow path; fast failure is impossible.
1149     if (L_slow_path == &L_fallthrough) {
1150       br(Assembler::EQ, *L_success);
1151     } else {
1152       br(Assembler::NE, *L_slow_path);
1153       final_jmp(*L_success);
1154     }
1155   } else {
1156     // No slow path; it's a fast decision.
1157     if (L_failure == &L_fallthrough) {
1158       br(Assembler::EQ, *L_success);
1159     } else {
1160       br(Assembler::NE, *L_failure);
1161       final_jmp(*L_success);
1162     }
1163   }
1164 
1165   bind(L_fallthrough);
1166 
1167 #undef final_jmp
1168 }
1169 
1170 // These two are taken from x86, but they look generally useful
1171 
// scans count pointer sized words at [addr] for occurrence of value,
1173 // generic
1174 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
1175                                 Register scratch) {
1176   Label Lloop, Lexit;
1177   cbz(count, Lexit);
1178   bind(Lloop);
1179   ldr(scratch, post(addr, wordSize));
1180   cmp(value, scratch);
1181   br(EQ, Lexit);
1182   sub(count, count, 1);
1183   cbnz(count, Lloop);
1184   bind(Lexit);
1185 }
1186 
// scans count 4 byte words at [addr] for occurrence of value,
1188 // generic
1189 void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
1190                                 Register scratch) {
1191   Label Lloop, Lexit;
1192   cbz(count, Lexit);
1193   bind(Lloop);
1194   ldrw(scratch, post(addr, wordSize));
1195   cmpw(value, scratch);
1196   br(EQ, Lexit);
1197   sub(count, count, 1);
1198   cbnz(count, Lloop);
1199   bind(Lexit);
1200 }
1201 
1202 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1203                                                    Register super_klass,
1204                                                    Register temp_reg,
1205                                                    Register temp2_reg,
1206                                                    Label* L_success,
1207                                                    Label* L_failure,
1208                                                    bool set_cond_codes) {
1209   assert_different_registers(sub_klass, super_klass, temp_reg);
1210   if (temp2_reg != noreg)
1211     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1212 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1213 
1214   Label L_fallthrough;
1215   int label_nulls = 0;
1216   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1217   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1218   assert(label_nulls <= 1, "at most one NULL in the batch");
1219 
1220   // a couple of useful fields in sub_klass:
1221   int ss_offset = in_bytes(Klass::secondary_supers_offset());
1222   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1223   Address secondary_supers_addr(sub_klass, ss_offset);
1224   Address super_cache_addr(     sub_klass, sc_offset);
1225 
1226   BLOCK_COMMENT("check_klass_subtype_slow_path");
1227 
1228   // Do a linear scan of the secondary super-klass chain.
1229   // This code is rarely used, so simplicity is a virtue here.
1230   // The repne_scan instruction uses fixed registers, which we must spill.
1231   // Don't worry too much about pre-existing connections with the input regs.
1232 
1233   assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by the array-length load into r2 below
1235 
1236   RegSet pushed_registers;
1237   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1238   if (!IS_A_TEMP(r5))    pushed_registers += r5;
1239 
1240   if (super_klass != r0 || UseCompressedOops) {
1241     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1242   }
1243 
1244   push(pushed_registers, sp);
1245 
1246   // Get super_klass value into r0 (even if it was in r5 or r2).
1247   if (super_klass != r0) {
1248     mov(r0, super_klass);
1249   }
1250 
1251 #ifndef PRODUCT
1252   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1253   Address pst_counter_addr(rscratch2);
1254   ldr(rscratch1, pst_counter_addr);
1255   add(rscratch1, rscratch1, 1);
1256   str(rscratch1, pst_counter_addr);
1257 #endif //PRODUCT
1258 
1259   // We will consult the secondary-super array.
1260   ldr(r5, secondary_supers_addr);
1261   // Load the array length.
1262   ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1263   // Skip to start of data.
1264   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1265 
1266   cmp(sp, zr); // Clear Z flag; SP is never zero
1267   // Scan R2 words at [R5] for an occurrence of R0.
1268   // Set NZ/Z based on last compare.
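       // If the array is empty (r2 == 0) the scan leaves the flags untouched,
       // so the NE condition established above sends us to the failure branch.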
1269   repne_scan(r5, r0, r2, rscratch1);
1270 
1271   // Unspill the temp. registers:
1272   pop(pushed_registers, sp);
1273 
1274   br(Assembler::NE, *L_failure);
1275 
1276   // Success.  Cache the super we found and proceed in triumph.
1277   str(super_klass, super_cache_addr);
1278 
1279   if (L_success != &L_fallthrough) {
1280     b(*L_success);
1281   }
1282 
1283 #undef IS_A_TEMP
1284 
1285   bind(L_fallthrough);
1286 }
1287 
1288 void MacroAssembler::clinit_barrier(Register klass, Register scratch, Label* L_fast_path, Label* L_slow_path) {
1289   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
1290   assert_different_registers(klass, rthread, scratch);
1291 
1292   Label L_fallthrough, L_tmp;
1293   if (L_fast_path == NULL) {
1294     L_fast_path = &L_fallthrough;
1295   } else if (L_slow_path == NULL) {
1296     L_slow_path = &L_fallthrough;
1297   }
1298   // Fast path check: class is fully initialized
1299   ldrb(scratch, Address(klass, InstanceKlass::init_state_offset()));
1300   subs(zr, scratch, InstanceKlass::fully_initialized);
1301   br(Assembler::EQ, *L_fast_path);
1302 
1303   // Fast path check: current thread is initializer thread
1304   ldr(scratch, Address(klass, InstanceKlass::init_thread_offset()));
1305   cmp(rthread, scratch);
1306 
1307   if (L_slow_path == &L_fallthrough) {
1308     br(Assembler::EQ, *L_fast_path);
1309     bind(*L_slow_path);
1310   } else if (L_fast_path == &L_fallthrough) {
1311     br(Assembler::NE, *L_slow_path);
1312     bind(*L_fast_path);
1313   } else {
1314     Unimplemented();
1315   }
1316 }
1317 
1318 void MacroAssembler::verify_oop(Register reg, const char* s) {
1319   if (!VerifyOops) return;
1320 
1321   // Pass the oop (in r0) and a message string (in rscratch1) to verify_oop_subroutine
1322   const char* b = NULL;
1323   {
1324     ResourceMark rm;
1325     stringStream ss;
1326     ss.print("verify_oop: %s: %s", reg->name(), s);
1327     b = code_string(ss.as_string());
1328   }
1329   BLOCK_COMMENT("verify_oop {");
1330 
1331   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1332   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1333 
1334   mov(r0, reg);
1335   mov(rscratch1, (address)b);
1336 
1337   // call indirectly to solve generation ordering problem
1338   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1339   ldr(rscratch2, Address(rscratch2));
1340   blr(rscratch2);
1341 
1342   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1343   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1344 
1345   BLOCK_COMMENT("} verify_oop");
1346 }
1347 
1348 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1349   if (!VerifyOops) return;
1350 
1351   const char* b = NULL;
1352   {
1353     ResourceMark rm;
1354     stringStream ss;
1355     ss.print("verify_oop_addr: %s", s);
1356     b = code_string(ss.as_string());
1357   }
1358   BLOCK_COMMENT("verify_oop_addr {");
1359 
1360   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1361   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1362 
1363   // addr may contain sp so we will have to adjust it based on the
1364   // pushes that we just did.
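       // (the two stp instructions above pushed 4 words, hence the
       // 4 * wordSize rebias below)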
1365   if (addr.uses(sp)) {
1366     lea(r0, addr);
1367     ldr(r0, Address(r0, 4 * wordSize));
1368   } else {
1369     ldr(r0, addr);
1370   }
1371   mov(rscratch1, (address)b);
1372 
1373   // call indirectly to solve generation ordering problem
1374   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1375   ldr(rscratch2, Address(rscratch2));
1376   blr(rscratch2);
1377 
1378   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1379   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1380 
1381   BLOCK_COMMENT("} verify_oop_addr");
1382 }
1383 
1384 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1385                                          int extra_slot_offset) {
1386   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1387   int stackElementSize = Interpreter::stackElementSize;
1388   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1389 #ifdef ASSERT
1390   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1391   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1392 #endif
1393   if (arg_slot.is_constant()) {
1394     return Address(esp, arg_slot.as_constant() * stackElementSize
1395                    + offset);
1396   } else {
1397     add(rscratch1, esp, arg_slot.as_register(),
1398         ext::uxtx, exact_log2(stackElementSize));
1399     return Address(rscratch1, offset);
1400   }
1401 }
1402 
1403 void MacroAssembler::call_VM_leaf_base(address entry_point,
1404                                        int number_of_arguments,
1405                                        Label *retaddr) {
1406   Label E, L;
1407 
1408   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1409 
1410   mov(rscratch1, entry_point);
1411   blr(rscratch1);
1412   if (retaddr)
1413     bind(*retaddr);
1414 
1415   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1416   maybe_isb();
1417 }
1418 
1419 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1420   call_VM_leaf_base(entry_point, number_of_arguments);
1421 }
1422 
1423 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1424   pass_arg0(this, arg_0);
1425   call_VM_leaf_base(entry_point, 1);
1426 }
1427 
1428 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1429   pass_arg0(this, arg_0);
1430   pass_arg1(this, arg_1);
1431   call_VM_leaf_base(entry_point, 2);
1432 }
1433 
1434 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1435                                   Register arg_1, Register arg_2) {
1436   pass_arg0(this, arg_0);
1437   pass_arg1(this, arg_1);
1438   pass_arg2(this, arg_2);
1439   call_VM_leaf_base(entry_point, 3);
1440 }
1441 
1442 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1443   pass_arg0(this, arg_0);
1444   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1445 }
1446 
1447 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1448 
1449   assert(arg_0 != c_rarg1, "smashed arg");
1450   pass_arg1(this, arg_1);
1451   pass_arg0(this, arg_0);
1452   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1453 }
1454 
1455 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1456   assert(arg_0 != c_rarg2, "smashed arg");
1457   assert(arg_1 != c_rarg2, "smashed arg");
1458   pass_arg2(this, arg_2);
1459   assert(arg_0 != c_rarg1, "smashed arg");
1460   pass_arg1(this, arg_1);
1461   pass_arg0(this, arg_0);
1462   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1463 }
1464 
1465 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1466   assert(arg_0 != c_rarg3, "smashed arg");
1467   assert(arg_1 != c_rarg3, "smashed arg");
1468   assert(arg_2 != c_rarg3, "smashed arg");
1469   pass_arg3(this, arg_3);
1470   assert(arg_0 != c_rarg2, "smashed arg");
1471   assert(arg_1 != c_rarg2, "smashed arg");
1472   pass_arg2(this, arg_2);
1473   assert(arg_0 != c_rarg1, "smashed arg");
1474   pass_arg1(this, arg_1);
1475   pass_arg0(this, arg_0);
1476   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1477 }
1478 
1479 void MacroAssembler::null_check(Register reg, int offset) {
1480   if (needs_explicit_null_check(offset)) {
1481     // provoke OS NULL exception if reg = NULL by
1482     // accessing M[reg] w/o changing any registers
1483     // NOTE: this is plenty to provoke a segv
1484     ldr(zr, Address(reg));
1485   } else {
1486     // nothing to do, (later) access of M[reg + offset]
1487     // will provoke OS NULL exception if reg = NULL
1488   }
1489 }
1490 
1491 // MacroAssembler protected routines needed to implement
1492 // public methods
1493 
1494 void MacroAssembler::mov(Register r, Address dest) {
1495   code_section()->relocate(pc(), dest.rspec());
1496   u_int64_t imm64 = (u_int64_t)dest.target();
1497   movptr(r, imm64);
1498 }
1499 
1500 // Move a constant pointer into r.  In AArch64 mode the virtual
1501 // address space is 48 bits in size, so we only need three
1502 // instructions to create a patchable instruction sequence that can
1503 // reach anywhere.
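     // For example (illustrative address only), imm64 == 0x00007ffd12345678
     // expands to:
     //   movz(r, 0x5678);  movk(r, 0x1234, 16);  movk(r, 0x7ffd, 32);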
1504 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1505 #ifndef PRODUCT
1506   {
1507     char buffer[64];
1508     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1509     block_comment(buffer);
1510   }
1511 #endif
1512   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1513   movz(r, imm64 & 0xffff);
1514   imm64 >>= 16;
1515   movk(r, imm64 & 0xffff, 16);
1516   imm64 >>= 16;
1517   movk(r, imm64 & 0xffff, 32);
1518 }
1519 
1520 // Macro to mov replicated immediate to vector register.
1521 //  Vd will get the following values for different arrangements in T
1522 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1523 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1524 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1525 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1526 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1527 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1528 //   T1D/T2D: invalid
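     // For example (illustrative values only), a T4S move of imm32 == 0x0000ff00
     // needs a single movi(Vd, T4S, 0xff, 8), while imm32 == 0x00ff00ff takes the
     // two-instruction path: movi(Vd, T4S, 0xff, 0) followed by orri(Vd, T4S, 0xff, 16).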
1529 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1530   assert(T != T1D && T != T2D, "invalid arrangement");
1531   if (T == T8B || T == T16B) {
1532     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1533     movi(Vd, T, imm32 & 0xff, 0);
1534     return;
1535   }
1536   u_int32_t nimm32 = ~imm32;
1537   if (T == T4H || T == T8H) {
1538     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1539     imm32 &= 0xffff;
1540     nimm32 &= 0xffff;
1541   }
1542   u_int32_t x = imm32;
1543   int movi_cnt = 0;
1544   int movn_cnt = 0;
1545   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1546   x = nimm32;
1547   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1548   if (movn_cnt < movi_cnt) imm32 = nimm32;
1549   unsigned lsl = 0;
1550   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1551   if (movn_cnt < movi_cnt)
1552     mvni(Vd, T, imm32 & 0xff, lsl);
1553   else
1554     movi(Vd, T, imm32 & 0xff, lsl);
1555   imm32 >>= 8; lsl += 8;
1556   while (imm32) {
1557     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1558     if (movn_cnt < movi_cnt)
1559       bici(Vd, T, imm32 & 0xff, lsl);
1560     else
1561       orri(Vd, T, imm32 & 0xff, lsl);
1562     lsl += 8; imm32 >>= 8;
1563   }
1564 }
1565 
1566 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1567 {
1568 #ifndef PRODUCT
1569   {
1570     char buffer[64];
1571     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1572     block_comment(buffer);
1573   }
1574 #endif
1575   if (operand_valid_for_logical_immediate(false, imm64)) {
1576     orr(dst, zr, imm64);
1577   } else {
1578     // we can use a combination of MOVZ or MOVN with
1579     // MOVK to build up the constant
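         // For example (illustrative value only), imm64 == 0x0000001200003456 has
         // two zero halfwords, so it is built with movz(dst, 0x3456, 0) followed
         // by movk(dst, 0x12, 32).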
1580     u_int64_t imm_h[4];
1581     int zero_count = 0;
1582     int neg_count = 0;
1583     int i;
1584     for (i = 0; i < 4; i++) {
1585       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1586       if (imm_h[i] == 0) {
1587         zero_count++;
1588       } else if (imm_h[i] == 0xffffL) {
1589         neg_count++;
1590       }
1591     }
1592     if (zero_count == 4) {
1593       // one MOVZ will do
1594       movz(dst, 0);
1595     } else if (neg_count == 4) {
1596       // one MOVN will do
1597       movn(dst, 0);
1598     } else if (zero_count == 3) {
1599       for (i = 0; i < 4; i++) {
1600         if (imm_h[i] != 0L) {
1601           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1602           break;
1603         }
1604       }
1605     } else if (neg_count == 3) {
1606       // one MOVN will do
1607       for (int i = 0; i < 4; i++) {
1608         if (imm_h[i] != 0xffffL) {
1609           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1610           break;
1611         }
1612       }
1613     } else if (zero_count == 2) {
1614       // one MOVZ and one MOVK will do
1615       for (i = 0; i < 3; i++) {
1616         if (imm_h[i] != 0L) {
1617           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1618           i++;
1619           break;
1620         }
1621       }
1622       for (;i < 4; i++) {
1623         if (imm_h[i] != 0L) {
1624           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1625         }
1626       }
1627     } else if (neg_count == 2) {
1628       // one MOVN and one MOVK will do
1629       for (i = 0; i < 4; i++) {
1630         if (imm_h[i] != 0xffffL) {
1631           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1632           i++;
1633           break;
1634         }
1635       }
1636       for (;i < 4; i++) {
1637         if (imm_h[i] != 0xffffL) {
1638           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1639         }
1640       }
1641     } else if (zero_count == 1) {
1642       // one MOVZ and two MOVKs will do
1643       for (i = 0; i < 4; i++) {
1644         if (imm_h[i] != 0L) {
1645           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1646           i++;
1647           break;
1648         }
1649       }
1650       for (;i < 4; i++) {
1651         if (imm_h[i] != 0x0L) {
1652           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1653         }
1654       }
1655     } else if (neg_count == 1) {
1656       // one MOVN and two MOVKs will do
1657       for (i = 0; i < 4; i++) {
1658         if (imm_h[i] != 0xffffL) {
1659           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1660           i++;
1661           break;
1662         }
1663       }
1664       for (;i < 4; i++) {
1665         if (imm_h[i] != 0xffffL) {
1666           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1667         }
1668       }
1669     } else {
1670       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1671       movz(dst, (u_int32_t)imm_h[0], 0);
1672       for (i = 1; i < 4; i++) {
1673         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1674       }
1675     }
1676   }
1677 }
1678 
1679 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1680 {
1681 #ifndef PRODUCT
1682     {
1683       char buffer[64];
1684       snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1685       block_comment(buffer);
1686     }
1687 #endif
1688   if (operand_valid_for_logical_immediate(true, imm32)) {
1689     orrw(dst, zr, imm32);
1690   } else {
1691     // we can use a single MOVZ or MOVN, or a MOVZ/MOVK pair, to build
1692     // up the constant
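         // For example (illustrative values only), imm32 == 0xffff1234 becomes a
         // single movnw(dst, 0x1234 ^ 0xffff, 0), while imm32 == 0x12345678 needs
         // movzw(dst, 0x5678, 0) followed by movkw(dst, 0x1234, 16).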
1693     u_int32_t imm_h[2];
1694     imm_h[0] = imm32 & 0xffff;
1695     imm_h[1] = ((imm32 >> 16) & 0xffff);
1696     if (imm_h[0] == 0) {
1697       movzw(dst, imm_h[1], 16);
1698     } else if (imm_h[0] == 0xffff) {
1699       movnw(dst, imm_h[1] ^ 0xffff, 16);
1700     } else if (imm_h[1] == 0) {
1701       movzw(dst, imm_h[0], 0);
1702     } else if (imm_h[1] == 0xffff) {
1703       movnw(dst, imm_h[0] ^ 0xffff, 0);
1704     } else {
1705       // use a MOVZ and MOVK (makes it easier to debug)
1706       movzw(dst, imm_h[0], 0);
1707       movkw(dst, imm_h[1], 16);
1708     }
1709   }
1710 }
1711 
1712 // Form an address from base + offset in Rd.  Rd may or may
1713 // not actually be used: you must use the Address that is returned.
1714 // It is up to you to ensure that the shift provided matches the size
1715 // of your data.
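     // For example (illustrative values only), byte_offset == 0x40010 with shift == 3
     // does not fit an immediate, but splits into add(Rd, base, 0x40000) plus an
     // immediate offset of 0x10 on the returned Address.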
1716 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1717   if (Address::offset_ok_for_immed(byte_offset, shift))
1718     // It fits; no need for any heroics
1719     return Address(base, byte_offset);
1720 
1721   // Don't do anything clever with negative or misaligned offsets
1722   unsigned mask = (1 << shift) - 1;
1723   if (byte_offset < 0 || byte_offset & mask) {
1724     mov(Rd, byte_offset);
1725     add(Rd, base, Rd);
1726     return Address(Rd);
1727   }
1728 
1729   // See if we can do this with two 12-bit offsets
1730   {
1731     unsigned long word_offset = byte_offset >> shift;
1732     unsigned long masked_offset = word_offset & 0xfff000;
1733     if (Address::offset_ok_for_immed(word_offset - masked_offset, 0)
1734         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1735       add(Rd, base, masked_offset << shift);
1736       word_offset -= masked_offset;
1737       return Address(Rd, word_offset << shift);
1738     }
1739   }
1740 
1741   // Do it the hard way
1742   mov(Rd, byte_offset);
1743   add(Rd, base, Rd);
1744   return Address(Rd);
1745 }
1746 
1747 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1748   if (UseLSE) {
1749     mov(tmp, 1);
1750     ldadd(Assembler::word, tmp, zr, counter_addr);
1751     return;
1752   }
1753   Label retry_load;
1754   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1755     prfm(Address(counter_addr), PSTL1STRM);
1756   bind(retry_load);
1757   // flush and load exclusive from the memory location
1758   ldxrw(tmp, counter_addr);
1759   addw(tmp, tmp, 1);
1760   // if we store+flush with no intervening write tmp2 will be zero
1761   stxrw(tmp2, tmp, counter_addr);
1762   cbnzw(tmp2, retry_load);
1763 }
1764 
1765 
1766 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1767                                     bool want_remainder, Register scratch)
1768 {
1769   // Full implementation of Java idiv and irem.  The function
1770   // returns the (pc) offset of the div instruction - may be needed
1771   // for implicit exceptions.
1772   //
1773   // constraint : ra/rb =/= scratch
1774   //         normal case
1775   //
1776   // input : ra: dividend
1777   //         rb: divisor
1778   //
1779   // result: either
1780   //         quotient  (= ra idiv rb)
1781   //         remainder (= ra irem rb)
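       // The remainder case below uses msubw: result = ra - (ra / rb) * rb, which
       // matches Java irem semantics (the remainder takes the sign of the dividend).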
1782 
1783   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1784 
1785   int idivl_offset = offset();
1786   if (! want_remainder) {
1787     sdivw(result, ra, rb);
1788   } else {
1789     sdivw(scratch, ra, rb);
1790     Assembler::msubw(result, scratch, rb, ra);
1791   }
1792 
1793   return idivl_offset;
1794 }
1795 
1796 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1797                                     bool want_remainder, Register scratch)
1798 {
1799   // Full implementation of Java ldiv and lrem.  The function
1800   // returns the (pc) offset of the div instruction - may be needed
1801   // for implicit exceptions.
1802   //
1803   // constraint : ra/rb =/= scratch
1804   //         normal case
1805   //
1806   // input : ra: dividend
1807   //         rb: divisor
1808   //
1809   // result: either
1810   //         quotient  (= ra idiv rb)
1811   //         remainder (= ra irem rb)
1812 
1813   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1814 
1815   int idivq_offset = offset();
1816   if (! want_remainder) {
1817     sdiv(result, ra, rb);
1818   } else {
1819     sdiv(scratch, ra, rb);
1820     Assembler::msub(result, scratch, rb, ra);
1821   }
1822 
1823   return idivq_offset;
1824 }
1825 
1826 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1827   address prev = pc() - NativeMembar::instruction_size;
1828   address last = code()->last_insn();
1829   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1830     NativeMembar *bar = NativeMembar_at(prev);
1831     // We are merging two memory barrier instructions.  On AArch64 we
1832     // can do this simply by ORing them together.
1833     bar->set_kind(bar->get_kind() | order_constraint);
1834     BLOCK_COMMENT("merged membar");
1835   } else {
1836     code()->set_last_insn(pc());
1837     dmb(Assembler::barrier(order_constraint));
1838   }
1839 }
1840 
1841 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1842   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1843     merge_ldst(rt, adr, size_in_bytes, is_store);
1844     code()->clear_last_insn();
1845     return true;
1846   } else {
1847     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8-byte or 4-byte loads/stores are supported.");
1848     const unsigned mask = size_in_bytes - 1;
1849     if (adr.getMode() == Address::base_plus_offset &&
1850         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1851       code()->set_last_insn(pc());
1852     }
1853     return false;
1854   }
1855 }
1856 
1857 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1858   // We always try to merge two adjacent loads into one ldp.
1859   if (!try_merge_ldst(Rx, adr, 8, false)) {
1860     Assembler::ldr(Rx, adr);
1861   }
1862 }
1863 
1864 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1865   // We always try to merge two adjacent loads into one ldp.
1866   if (!try_merge_ldst(Rw, adr, 4, false)) {
1867     Assembler::ldrw(Rw, adr);
1868   }
1869 }
1870 
1871 void MacroAssembler::str(Register Rx, const Address &adr) {
1872   // We always try to merge two adjacent stores into one stp.
1873   if (!try_merge_ldst(Rx, adr, 8, true)) {
1874     Assembler::str(Rx, adr);
1875   }
1876 }
1877 
1878 void MacroAssembler::strw(Register Rw, const Address &adr) {
1879   // We always try to merge two adjacent stores into one stp.
1880   if (!try_merge_ldst(Rw, adr, 4, true)) {
1881     Assembler::strw(Rw, adr);
1882   }
1883 }
1884 
1885 // MacroAssembler routines actually found to be needed
1886 
1887 void MacroAssembler::push(Register src)
1888 {
1889   str(src, Address(pre(esp, -1 * wordSize)));
1890 }
1891 
1892 void MacroAssembler::pop(Register dst)
1893 {
1894   ldr(dst, Address(post(esp, 1 * wordSize)));
1895 }
1896 
1897 // Note: load_unsigned_short used to be called load_unsigned_word.
1898 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1899   int off = offset();
1900   ldrh(dst, src);
1901   return off;
1902 }
1903 
1904 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1905   int off = offset();
1906   ldrb(dst, src);
1907   return off;
1908 }
1909 
1910 int MacroAssembler::load_signed_short(Register dst, Address src) {
1911   int off = offset();
1912   ldrsh(dst, src);
1913   return off;
1914 }
1915 
1916 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1917   int off = offset();
1918   ldrsb(dst, src);
1919   return off;
1920 }
1921 
1922 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1923   int off = offset();
1924   ldrshw(dst, src);
1925   return off;
1926 }
1927 
1928 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1929   int off = offset();
1930   ldrsbw(dst, src);
1931   return off;
1932 }
1933 
1934 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1935   switch (size_in_bytes) {
1936   case  8:  ldr(dst, src); break;
1937   case  4:  ldrw(dst, src); break;
1938   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1939   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1940   default:  ShouldNotReachHere();
1941   }
1942 }
1943 
1944 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1945   switch (size_in_bytes) {
1946   case  8:  str(src, dst); break;
1947   case  4:  strw(src, dst); break;
1948   case  2:  strh(src, dst); break;
1949   case  1:  strb(src, dst); break;
1950   default:  ShouldNotReachHere();
1951   }
1952 }
1953 
1954 void MacroAssembler::decrementw(Register reg, int value)
1955 {
1956   if (value < 0)  { incrementw(reg, -value);      return; }
1957   if (value == 0) {                               return; }
1958   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1959   /* else */ {
1960     guarantee(reg != rscratch2, "invalid dst for register decrement");
1961     movw(rscratch2, (unsigned)value);
1962     subw(reg, reg, rscratch2);
1963   }
1964 }
1965 
1966 void MacroAssembler::decrement(Register reg, int value)
1967 {
1968   if (value < 0)  { increment(reg, -value);      return; }
1969   if (value == 0) {                              return; }
1970   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1971   /* else */ {
1972     assert(reg != rscratch2, "invalid dst for register decrement");
1973     mov(rscratch2, (unsigned long)value);
1974     sub(reg, reg, rscratch2);
1975   }
1976 }
1977 
1978 void MacroAssembler::decrementw(Address dst, int value)
1979 {
1980   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1981   if (dst.getMode() == Address::literal) {
1982     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1983     lea(rscratch2, dst);
1984     dst = Address(rscratch2);
1985   }
1986   ldrw(rscratch1, dst);
1987   decrementw(rscratch1, value);
1988   strw(rscratch1, dst);
1989 }
1990 
1991 void MacroAssembler::decrement(Address dst, int value)
1992 {
1993   assert(!dst.uses(rscratch1), "invalid address for decrement");
1994   if (dst.getMode() == Address::literal) {
1995     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1996     lea(rscratch2, dst);
1997     dst = Address(rscratch2);
1998   }
1999   ldr(rscratch1, dst);
2000   decrement(rscratch1, value);
2001   str(rscratch1, dst);
2002 }
2003 
2004 void MacroAssembler::incrementw(Register reg, int value)
2005 {
2006   if (value < 0)  { decrementw(reg, -value);      return; }
2007   if (value == 0) {                               return; }
2008   if (value < (1 << 12)) { addw(reg, reg, value); return; }
2009   /* else */ {
2010     assert(reg != rscratch2, "invalid dst for register increment");
2011     movw(rscratch2, (unsigned)value);
2012     addw(reg, reg, rscratch2);
2013   }
2014 }
2015 
2016 void MacroAssembler::increment(Register reg, int value)
2017 {
2018   if (value < 0)  { decrement(reg, -value);      return; }
2019   if (value == 0) {                              return; }
2020   if (value < (1 << 12)) { add(reg, reg, value); return; }
2021   /* else */ {
2022     assert(reg != rscratch2, "invalid dst for register increment");
2023     movw(rscratch2, (unsigned)value);
2024     add(reg, reg, rscratch2);
2025   }
2026 }
2027 
2028 void MacroAssembler::incrementw(Address dst, int value)
2029 {
2030   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2031   if (dst.getMode() == Address::literal) {
2032     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2033     lea(rscratch2, dst);
2034     dst = Address(rscratch2);
2035   }
2036   ldrw(rscratch1, dst);
2037   incrementw(rscratch1, value);
2038   strw(rscratch1, dst);
2039 }
2040 
2041 void MacroAssembler::increment(Address dst, int value)
2042 {
2043   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2044   if (dst.getMode() == Address::literal) {
2045     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2046     lea(rscratch2, dst);
2047     dst = Address(rscratch2);
2048   }
2049   ldr(rscratch1, dst);
2050   increment(rscratch1, value);
2051   str(rscratch1, dst);
2052 }
2053 
2054 
2055 void MacroAssembler::pusha() {
2056   push(0x7fffffff, sp);
2057 }
2058 
2059 void MacroAssembler::popa() {
2060   pop(0x7fffffff, sp);
2061 }
2062 
2063 // Push lots of registers in the bit set supplied.  Don't push sp.
2064 // Return the number of words pushed
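     // For example (illustrative bitset), pushing { r0, r1, r3 } pads the odd count
     // with zr and emits:
     //   stp(r0, r1, Address(pre(stack, -4 * wordSize)));
     //   stp(r3, zr, Address(stack, 2 * wordSize));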
2065 int MacroAssembler::push(unsigned int bitset, Register stack) {
2066   int words_pushed = 0;
2067 
2068   // Scan bitset to accumulate register pairs
2069   unsigned char regs[32];
2070   int count = 0;
2071   for (int reg = 0; reg <= 30; reg++) {
2072     if (1 & bitset)
2073       regs[count++] = reg;
2074     bitset >>= 1;
2075   }
2076   regs[count++] = zr->encoding_nocheck();
2077   count &= ~1;  // Only push an even number of regs
2078 
2079   if (count) {
2080     stp(as_Register(regs[0]), as_Register(regs[1]),
2081        Address(pre(stack, -count * wordSize)));
2082     words_pushed += 2;
2083   }
2084   for (int i = 2; i < count; i += 2) {
2085     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2086        Address(stack, i * wordSize));
2087     words_pushed += 2;
2088   }
2089 
2090   assert(words_pushed == count, "oops, pushed != count");
2091 
2092   return count;
2093 }
2094 
2095 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2096   int words_pushed = 0;
2097 
2098   // Scan bitset to accumulate register pairs
2099   unsigned char regs[32];
2100   int count = 0;
2101   for (int reg = 0; reg <= 30; reg++) {
2102     if (1 & bitset)
2103       regs[count++] = reg;
2104     bitset >>= 1;
2105   }
2106   regs[count++] = zr->encoding_nocheck();
2107   count &= ~1;
2108 
2109   for (int i = 2; i < count; i += 2) {
2110     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2111        Address(stack, i * wordSize));
2112     words_pushed += 2;
2113   }
2114   if (count) {
2115     ldp(as_Register(regs[0]), as_Register(regs[1]),
2116        Address(post(stack, count * wordSize)));
2117     words_pushed += 2;
2118   }
2119 
2120   assert(words_pushed == count, "oops, pushed != count");
2121 
2122   return count;
2123 }
2124 
2125 // Push lots of registers in the bit set supplied.  Don't push sp.
2126 // Return the number of words pushed
2127 int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
2128   int words_pushed = 0;
2129 
2130   // Scan bitset to accumulate register pairs
2131   unsigned char regs[32];
2132   int count = 0;
2133   for (int reg = 0; reg <= 31; reg++) {
2134     if (1 & bitset)
2135       regs[count++] = reg;
2136     bitset >>= 1;
2137   }
2138   regs[count++] = zr->encoding_nocheck();
2139   count &= ~1;  // Only push an even number of regs
2140 
2141   // Always pushing full 128 bit registers.
2142   if (count) {
2143     stpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(pre(stack, -count * wordSize * 2)));
2144     words_pushed += 2;
2145   }
2146   for (int i = 2; i < count; i += 2) {
2147     stpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2));
2148     words_pushed += 2;
2149   }
2150 
2151   assert(words_pushed == count, "oops, pushed != count");
2152   return count;
2153 }
2154 
2155 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
2156   int words_pushed = 0;
2157 
2158   // Scan bitset to accumulate register pairs
2159   unsigned char regs[32];
2160   int count = 0;
2161   for (int reg = 0; reg <= 31; reg++) {
2162     if (1 & bitset)
2163       regs[count++] = reg;
2164     bitset >>= 1;
2165   }
2166   regs[count++] = zr->encoding_nocheck();
2167   count &= ~1;
2168 
2169   for (int i = 2; i < count; i += 2) {
2170     ldpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2));
2171     words_pushed += 2;
2172   }
2173   if (count) {
2174     ldpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(post(stack, count * wordSize * 2)));
2175     words_pushed += 2;
2176   }
2177 
2178   assert(words_pushed == count, "oops, pushed != count");
2179 
2180   return count;
2181 }
2182 
2183 #ifdef ASSERT
2184 void MacroAssembler::verify_heapbase(const char* msg) {
2185 #if 0
2186   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2187   assert (Universe::heap() != NULL, "java heap should be initialized");
2188   if (!UseCompressedOops || Universe::ptr_base() == NULL) {
2189     // rheapbase is allocated as general register
2190     return;
2191   }
2192   if (CheckCompressedOops) {
2193     Label ok;
2194     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2195     cmpptr(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
2196     br(Assembler::EQ, ok);
2197     stop(msg);
2198     bind(ok);
2199     pop(1 << rscratch1->encoding(), sp);
2200   }
2201 #endif
2202 }
2203 #endif
2204 
2205 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2206   Label done, not_weak;
2207   cbz(value, done);           // Use NULL as-is.
2208 
2209   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
2210   tbz(value, 0, not_weak); // Test for jweak tag.
2211 
2212   // Resolve jweak.
2213   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2214                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2215   verify_oop(value);
2216   b(done);
2217 
2218   bind(not_weak);
2219   // Resolve (untagged) jobject.
2220   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2221   verify_oop(value);
2222   bind(done);
2223 }
2224 
2225 void MacroAssembler::stop(const char* msg) {
2226   address ip = pc();
2227   pusha();
2228   mov(c_rarg0, (address)msg);
2229   mov(c_rarg1, (address)ip);
2230   mov(c_rarg2, sp);
2231   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2232   blr(c_rarg3);
2233   hlt(0);
2234 }
2235 
2236 void MacroAssembler::warn(const char* msg) {
2237   pusha();
2238   mov(c_rarg0, (address)msg);
2239   mov(lr, CAST_FROM_FN_PTR(address, warning));
2240   blr(lr);
2241   popa();
2242 }
2243 
2244 void MacroAssembler::unimplemented(const char* what) {
2245   const char* buf = NULL;
2246   {
2247     ResourceMark rm;
2248     stringStream ss;
2249     ss.print("unimplemented: %s", what);
2250     buf = code_string(ss.as_string());
2251   }
2252   stop(buf);
2253 }
2254 
2255 // If a constant does not fit in an immediate field, generate some
2256 // number of MOV instructions and then perform the operation.
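     // For example (illustrative value), an immediate of 0x123456 does not fit the
     // 12-bit (optionally shifted) field, but since it is below 1 << 24 it is split
     // into two applications of insn1: one with 0x123000 and one with 0x456.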
2257 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2258                                            add_sub_imm_insn insn1,
2259                                            add_sub_reg_insn insn2) {
2260   assert(Rd != zr, "Rd = zr and not setting flags?");
2261   if (operand_valid_for_add_sub_immediate((int)imm)) {
2262     (this->*insn1)(Rd, Rn, imm);
2263   } else {
2264     if (uabs(imm) < (1 << 24)) {
2265        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2266        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2267     } else {
2268        assert_different_registers(Rd, Rn);
2269        mov(Rd, (uint64_t)imm);
2270        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2271     }
2272   }
2273 }
2274 
2275 // Separate version which sets the flags. Optimisations are more restricted
2276 // because we must set the flags correctly.
2277 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2278                                            add_sub_imm_insn insn1,
2279                                            add_sub_reg_insn insn2) {
2280   if (operand_valid_for_add_sub_immediate((int)imm)) {
2281     (this->*insn1)(Rd, Rn, imm);
2282   } else {
2283     assert_different_registers(Rd, Rn);
2284     assert(Rd != zr, "overflow in immediate operand");
2285     mov(Rd, (uint64_t)imm);
2286     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2287   }
2288 }
2289 
2290 
2291 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2292   if (increment.is_register()) {
2293     add(Rd, Rn, increment.as_register());
2294   } else {
2295     add(Rd, Rn, increment.as_constant());
2296   }
2297 }
2298 
2299 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2300   if (increment.is_register()) {
2301     addw(Rd, Rn, increment.as_register());
2302   } else {
2303     addw(Rd, Rn, increment.as_constant());
2304   }
2305 }
2306 
2307 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2308   if (decrement.is_register()) {
2309     sub(Rd, Rn, decrement.as_register());
2310   } else {
2311     sub(Rd, Rn, decrement.as_constant());
2312   }
2313 }
2314 
2315 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2316   if (decrement.is_register()) {
2317     subw(Rd, Rn, decrement.as_register());
2318   } else {
2319     subw(Rd, Rn, decrement.as_constant());
2320   }
2321 }
2322 
2323 void MacroAssembler::reinit_heapbase()
2324 {
2325   if (UseCompressedOops) {
2326     if (Universe::is_fully_initialized()) {
2327       mov(rheapbase, CompressedOops::ptrs_base());
2328     } else {
2329       lea(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
2330       ldr(rheapbase, Address(rheapbase));
2331     }
2332   }
2333 }
2334 
2335 // this simulates the behaviour of the x86 cmpxchg instruction using a
2336 // load linked/store conditional pair. we use the acquire/release
2337 // versions of these instructions so that we flush pending writes as
2338 // per Java semantics.
2339 
2340 // n.b the x86 version assumes the old value to be compared against is
2341 // in rax and updates rax with the value located in memory if the
2342 // cmpxchg fails. we supply a register for the old value explicitly
2343 
2344 // the aarch64 load linked/store conditional instructions do not
2345 // accept an offset. so, unlike x86, we must provide a plain register
2346 // to identify the memory word to be compared/exchanged rather than a
2347 // register+offset Address.
2348 
2349 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2350                                 Label &succeed, Label *fail) {
2351   // oldv holds comparison value
2352   // newv holds value to write in exchange
2353   // addr identifies memory word to compare against/update
2354   if (UseLSE) {
2355     mov(tmp, oldv);
2356     casal(Assembler::xword, oldv, newv, addr);
2357     cmp(tmp, oldv);
2358     br(Assembler::EQ, succeed);
2359     membar(AnyAny);
2360   } else {
2361     Label retry_load, nope;
2362     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2363       prfm(Address(addr), PSTL1STRM);
2364     bind(retry_load);
2365     // flush and load exclusive from the memory location
2366     // and fail if it is not what we expect
2367     ldaxr(tmp, addr);
2368     cmp(tmp, oldv);
2369     br(Assembler::NE, nope);
2370     // if we store+flush with no intervening write tmp will be zero
2371     stlxr(tmp, newv, addr);
2372     cbzw(tmp, succeed);
2373     // retry so we only ever return after a load fails to compare
2374     // ensures we don't return a stale value after a failed write.
2375     b(retry_load);
2376     // if the memory word differs we return it in oldv and signal a fail
2377     bind(nope);
2378     membar(AnyAny);
2379     mov(oldv, tmp);
2380   }
2381   if (fail)
2382     b(*fail);
2383 }
2384 
2385 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2386                                         Label &succeed, Label *fail) {
2387   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2388   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2389 }
2390 
2391 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2392                                 Label &succeed, Label *fail) {
2393   // oldv holds comparison value
2394   // newv holds value to write in exchange
2395   // addr identifies memory word to compare against/update
2396   // tmp returns 0/1 for success/failure
2397   if (UseLSE) {
2398     mov(tmp, oldv);
2399     casal(Assembler::word, oldv, newv, addr);
2400     cmp(tmp, oldv);
2401     br(Assembler::EQ, succeed);
2402     membar(AnyAny);
2403   } else {
2404     Label retry_load, nope;
2405     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2406       prfm(Address(addr), PSTL1STRM);
2407     bind(retry_load);
2408     // flush and load exclusive from the memory location
2409     // and fail if it is not what we expect
2410     ldaxrw(tmp, addr);
2411     cmp(tmp, oldv);
2412     br(Assembler::NE, nope);
2413     // if we store+flush with no intervening write tmp will be zero
2414     stlxrw(tmp, newv, addr);
2415     cbzw(tmp, succeed);
2416     // retry so we only ever return after a load fails to compare
2417     // ensures we don't return a stale value after a failed write.
2418     b(retry_load);
2419     // if the memory word differs we return it in oldv and signal a fail
2420     bind(nope);
2421     membar(AnyAny);
2422     mov(oldv, tmp);
2423   }
2424   if (fail)
2425     b(*fail);
2426 }
2427 
2428 // A generic CAS; success or failure is in the EQ flag.  A weak CAS
2429 // doesn't retry and may fail spuriously.  If the old value is wanted,
2430 // pass a register for the result; otherwise pass noreg.
2431 
2432 // Clobbers rscratch1
2433 void MacroAssembler::cmpxchg(Register addr, Register expected,
2434                              Register new_val,
2435                              enum operand_size size,
2436                              bool acquire, bool release,
2437                              bool weak,
2438                              Register result) {
2439   if (result == noreg)  result = rscratch1;
2440   BLOCK_COMMENT("cmpxchg {");
2441   if (UseLSE) {
2442     mov(result, expected);
2443     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2444     compare_eq(result, expected, size);
2445   } else {
2446     Label retry_load, done;
2447     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2448       prfm(Address(addr), PSTL1STRM);
2449     bind(retry_load);
2450     load_exclusive(result, addr, size, acquire);
2451     compare_eq(result, expected, size);
2452     br(Assembler::NE, done);
2453     store_exclusive(rscratch1, new_val, addr, size, release);
2454     if (weak) {
2455       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2456     } else {
2457       cbnzw(rscratch1, retry_load);
2458     }
2459     bind(done);
2460   }
2461   BLOCK_COMMENT("} cmpxchg");
2462 }
2463 
2464 // A generic comparison. Only compares for equality, clobbers rscratch1.
2465 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2466   if (size == xword) {
2467     cmp(rm, rn);
2468   } else if (size == word) {
2469     cmpw(rm, rn);
2470   } else if (size == halfword) {
2471     eorw(rscratch1, rm, rn);
2472     ands(zr, rscratch1, 0xffff);
2473   } else if (size == byte) {
2474     eorw(rscratch1, rm, rn);
2475     ands(zr, rscratch1, 0xff);
2476   } else {
2477     ShouldNotReachHere();
2478   }
2479 }
2480 
2481 
2482 static bool different(Register a, RegisterOrConstant b, Register c) {
2483   if (b.is_constant())
2484     return a != c;
2485   else
2486     return a != b.as_register() && a != c && b.as_register() != c;
2487 }
2488 
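     // atomic_<NAME>(prev, incr, addr) atomically adds incr to the word at [addr];
     // if prev is a valid register it receives the value the word held before the
     // update.  rscratch1/rscratch2 may be clobbered.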
2489 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2490 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2491   if (UseLSE) {                                                         \
2492     prev = prev->is_valid() ? prev : zr;                                \
2493     if (incr.is_register()) {                                           \
2494       AOP(sz, incr.as_register(), prev, addr);                          \
2495     } else {                                                            \
2496       mov(rscratch2, incr.as_constant());                               \
2497       AOP(sz, rscratch2, prev, addr);                                   \
2498     }                                                                   \
2499     return;                                                             \
2500   }                                                                     \
2501   Register result = rscratch2;                                          \
2502   if (prev->is_valid())                                                 \
2503     result = different(prev, incr, addr) ? prev : rscratch2;            \
2504                                                                         \
2505   Label retry_load;                                                     \
2506   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2507     prfm(Address(addr), PSTL1STRM);                                     \
2508   bind(retry_load);                                                     \
2509   LDXR(result, addr);                                                   \
2510   OP(rscratch1, result, incr);                                          \
2511   STXR(rscratch2, rscratch1, addr);                                     \
2512   cbnzw(rscratch2, retry_load);                                         \
2513   if (prev->is_valid() && prev != result) {                             \
2514     IOP(prev, rscratch1, incr);                                         \
2515   }                                                                     \
2516 }
2517 
2518 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2519 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2520 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2521 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2522 
2523 #undef ATOMIC_OP
2524 
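     // atomic_<OP>(prev, newv, addr) atomically stores newv to the word at [addr];
     // if prev is a valid register it receives the word's previous contents.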
2525 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2526 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2527   if (UseLSE) {                                                         \
2528     prev = prev->is_valid() ? prev : zr;                                \
2529     AOP(sz, newv, prev, addr);                                          \
2530     return;                                                             \
2531   }                                                                     \
2532   Register result = rscratch2;                                          \
2533   if (prev->is_valid())                                                 \
2534     result = different(prev, newv, addr) ? prev : rscratch2;            \
2535                                                                         \
2536   Label retry_load;                                                     \
2537   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2538     prfm(Address(addr), PSTL1STRM);                                     \
2539   bind(retry_load);                                                     \
2540   LDXR(result, addr);                                                   \
2541   STXR(rscratch1, newv, addr);                                          \
2542   cbnzw(rscratch1, retry_load);                                         \
2543   if (prev->is_valid() && prev != result)                               \
2544     mov(prev, result);                                                  \
2545 }
2546 
2547 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2548 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2549 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2550 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2551 
2552 #undef ATOMIC_XCHG
2553 
2554 #ifndef PRODUCT
2555 extern "C" void findpc(intptr_t x);
2556 #endif
2557 
2558 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2559 {
2560   // In order to get locks to work, we need to fake an in_VM state
2561   if (ShowMessageBoxOnError ) {
2562     JavaThread* thread = JavaThread::current();
2563     JavaThreadState saved_state = thread->thread_state();
2564     thread->set_thread_state(_thread_in_vm);
2565 #ifndef PRODUCT
2566     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2567       ttyLocker ttyl;
2568       BytecodeCounter::print();
2569     }
2570 #endif
2571     if (os::message_box(msg, "Execution stopped, print registers?")) {
2572       ttyLocker ttyl;
2573       tty->print_cr(" pc = 0x%016lx", pc);
2574 #ifndef PRODUCT
2575       tty->cr();
2576       findpc(pc);
2577       tty->cr();
2578 #endif
2579       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2580       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2581       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2582       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2583       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2584       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2585       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2586       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2587       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2588       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2589       tty->print_cr("r10 = 0x%016lx", regs[10]);
2590       tty->print_cr("r11 = 0x%016lx", regs[11]);
2591       tty->print_cr("r12 = 0x%016lx", regs[12]);
2592       tty->print_cr("r13 = 0x%016lx", regs[13]);
2593       tty->print_cr("r14 = 0x%016lx", regs[14]);
2594       tty->print_cr("r15 = 0x%016lx", regs[15]);
2595       tty->print_cr("r16 = 0x%016lx", regs[16]);
2596       tty->print_cr("r17 = 0x%016lx", regs[17]);
2597       tty->print_cr("r18 = 0x%016lx", regs[18]);
2598       tty->print_cr("r19 = 0x%016lx", regs[19]);
2599       tty->print_cr("r20 = 0x%016lx", regs[20]);
2600       tty->print_cr("r21 = 0x%016lx", regs[21]);
2601       tty->print_cr("r22 = 0x%016lx", regs[22]);
2602       tty->print_cr("r23 = 0x%016lx", regs[23]);
2603       tty->print_cr("r24 = 0x%016lx", regs[24]);
2604       tty->print_cr("r25 = 0x%016lx", regs[25]);
2605       tty->print_cr("r26 = 0x%016lx", regs[26]);
2606       tty->print_cr("r27 = 0x%016lx", regs[27]);
2607       tty->print_cr("r28 = 0x%016lx", regs[28]);
2608       tty->print_cr("r30 = 0x%016lx", regs[30]);
2609       tty->print_cr("r31 = 0x%016lx", regs[31]);
2610       BREAKPOINT;
2611     }
2612   }
2613   fatal("DEBUG MESSAGE: %s", msg);
2614 }
2615 
2616 void MacroAssembler::push_call_clobbered_registers() {
2617   int step = 4 * wordSize;
2618   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2619   sub(sp, sp, step);
2620   mov(rscratch1, -step);
2621   // Push v0-v7, v16-v31.
2622   for (int i = 31; i>= 4; i -= 4) {
2623     if (i <= v7->encoding() || i >= v16->encoding())
2624       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2625           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2626   }
2627   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2628       as_FloatRegister(3), T1D, Address(sp));
2629 }
2630 
2631 void MacroAssembler::pop_call_clobbered_registers() {
2632   for (int i = 0; i < 32; i += 4) {
2633     if (i <= v7->encoding() || i >= v16->encoding())
2634       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2635           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2636   }
2637 
2638   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2639 }
2640 
2641 void MacroAssembler::push_CPU_state(bool save_vectors) {
2642   int step = (save_vectors ? 8 : 4) * wordSize;
2643   push(0x3fffffff, sp);         // integer registers except lr & sp
2644   mov(rscratch1, -step);
2645   sub(sp, sp, step);
2646   for (int i = 28; i >= 4; i -= 4) {
2647     st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2648         as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2649   }
2650   st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2651 }
2652 
2653 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2654   int step = (restore_vectors ? 8 : 4) * wordSize;
2655   for (int i = 0; i <= 28; i += 4)
2656     ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2657         as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2658   pop(0x3fffffff, sp);         // integer registers except lr & sp
2659 }
2660 
2661 /**
2662  * Helpers for multiply_to_len().
2663  */
2664 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2665                                      Register src1, Register src2) {
2666   adds(dest_lo, dest_lo, src1);
2667   adc(dest_hi, dest_hi, zr);
2668   adds(dest_lo, dest_lo, src2);
2669   adc(final_dest_hi, dest_hi, zr);
2670 }
2671 
2672 // Generate an address from (r + r1 extend offset).  "size" is the
2673 // size of the operand.  The result may be in rscratch2.
2674 Address MacroAssembler::offsetted_address(Register r, Register r1,
2675                                           Address::extend ext, int offset, int size) {
2676   if (offset || (ext.shift() % size != 0)) {
2677     lea(rscratch2, Address(r, r1, ext));
2678     return Address(rscratch2, offset);
2679   } else {
2680     return Address(r, r1, ext);
2681   }
2682 }
2683 
2684 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2685 {
2686   assert(offset >= 0, "spill to negative address?");
2687   // Offset reachable ?
2688   //   Not aligned - 9 bits signed offset
2689   //   Aligned - 12 bits unsigned offset shifted
2690   Register base = sp;
2691   if ((offset & (size-1)) && offset >= (1<<8)) {
2692     add(tmp, base, offset & ((1<<12)-1));
2693     base = tmp;
2694     offset &= -1u<<12;
2695   }
2696 
2697   if (offset >= (1<<12) * size) {
2698     add(tmp, base, offset & (((1<<12)-1)<<12));
2699     base = tmp;
2700     offset &= ~(((1<<12)-1)<<12);
2701   }
2702 
2703   return Address(base, offset);
2704 }
2705 
2706 // Checks whether offset is aligned.
2707 // Returns true if it is, else false.
2708 bool MacroAssembler::merge_alignment_check(Register base,
2709                                            size_t size,
2710                                            long cur_offset,
2711                                            long prev_offset) const {
2712   if (AvoidUnalignedAccesses) {
2713     if (base == sp) {
2714       // Checks whether the lower offset is aligned for a register pair.
2715       long pair_mask = size * 2 - 1;
2716       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2717       return (offset & pair_mask) == 0;
2718     } else { // If base is not sp, we can't guarantee the access is aligned.
2719       return false;
2720     }
2721   } else {
2722     long mask = size - 1;
2723     // Load/store pair instruction only supports element size aligned offset.
2724     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2725   }
2726 }
2727 
2728 // Checks whether current and previous loads/stores can be merged.
2729 // Returns true if it can be merged, else false.
2730 bool MacroAssembler::ldst_can_merge(Register rt,
2731                                     const Address &adr,
2732                                     size_t cur_size_in_bytes,
2733                                     bool is_store) const {
2734   address prev = pc() - NativeInstruction::instruction_size;
2735   address last = code()->last_insn();
2736 
2737   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2738     return false;
2739   }
2740 
2741   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2742     return false;
2743   }
2744 
2745   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2746   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2747 
2748   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2749   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2750 
2751   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2752     return false;
2753   }
2754 
2755   long max_offset = 63 * prev_size_in_bytes;
2756   long min_offset = -64 * prev_size_in_bytes;
2757 
2758   assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged.");
2759 
2760   // Only same base can be merged.
2761   if (adr.base() != prev_ldst->base()) {
2762     return false;
2763   }
2764 
2765   long cur_offset = adr.offset();
2766   long prev_offset = prev_ldst->offset();
2767   size_t diff = abs(cur_offset - prev_offset);
2768   if (diff != prev_size_in_bytes) {
2769     return false;
2770   }
2771 
2772   // The following cases cannot be merged:
2773   // ldr x2, [x2, #8]
2774   // ldr x3, [x2, #16]
2775   // or:
2776   // ldr x2, [x3, #8]
2777   // ldr x2, [x3, #16]
2778   // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2779   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2780     return false;
2781   }
2782 
2783   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2784   // Offset range must be in ldp/stp instruction's range.
2785   if (low_offset > max_offset || low_offset < min_offset) {
2786     return false;
2787   }
2788 
2789   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2790     return true;
2791   }
2792 
2793   return false;
2794 }
2795 
2796 // Merge current load/store with previous load/store into ldp/stp.
2797 void MacroAssembler::merge_ldst(Register rt,
2798                                 const Address &adr,
2799                                 size_t cur_size_in_bytes,
2800                                 bool is_store) {
2801 
2802   assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "cur and prev must be mergeable.");
2803 
2804   Register rt_low, rt_high;
2805   address prev = pc() - NativeInstruction::instruction_size;
2806   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2807 
2808   long offset;
2809 
2810   if (adr.offset() < prev_ldst->offset()) {
2811     offset = adr.offset();
2812     rt_low = rt;
2813     rt_high = prev_ldst->target();
2814   } else {
2815     offset = prev_ldst->offset();
2816     rt_low = prev_ldst->target();
2817     rt_high = rt;
2818   }
2819 
2820   Address adr_p = Address(prev_ldst->base(), offset);
2821   // Overwrite the previously generated instruction.
2822   code_section()->set_end(prev);
2823 
2824   const int sz = prev_ldst->size_in_bytes();
2825   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
2826   if (!is_store) {
2827     BLOCK_COMMENT("merged ldr pair");
2828     if (sz == 8) {
2829       ldp(rt_low, rt_high, adr_p);
2830     } else {
2831       ldpw(rt_low, rt_high, adr_p);
2832     }
2833   } else {
2834     BLOCK_COMMENT("merged str pair");
2835     if (sz == 8) {
2836       stp(rt_low, rt_high, adr_p);
2837     } else {
2838       stpw(rt_low, rt_high, adr_p);
2839     }
2840   }
2841 }
2842 
2843 /**
2844  * Multiply 64 bit by 64 bit first loop.
2845  */
2846 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2847                                            Register y, Register y_idx, Register z,
2848                                            Register carry, Register product,
2849                                            Register idx, Register kdx) {
2850   //
2851   //  jlong carry, x[], y[], z[];
2852   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2853   //    huge_128 product = y[idx] * x[xstart] + carry;
2854   //    z[kdx] = (jlong)product;
2855   //    carry  = (jlong)(product >>> 64);
2856   //  }
2857   //  z[xstart] = carry;
2858   //
2859 
2860   Label L_first_loop, L_first_loop_exit;
2861   Label L_one_x, L_one_y, L_multiply;
2862 
2863   subsw(xstart, xstart, 1);
2864   br(Assembler::MI, L_one_x);
2865 
2866   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2867   ldr(x_xstart, Address(rscratch1));
2868   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2869 
2870   bind(L_first_loop);
2871   subsw(idx, idx, 1);
2872   br(Assembler::MI, L_first_loop_exit);
2873   subsw(idx, idx, 1);
2874   br(Assembler::MI, L_one_y);
2875   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2876   ldr(y_idx, Address(rscratch1));
2877   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2878   bind(L_multiply);
2879 
2880   // AArch64 has a multiply-accumulate instruction that we can't use
2881   // here because it has no way to process carries, so we have to use
2882   // separate add and adc instructions.  Bah.
2883   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2884   mul(product, x_xstart, y_idx);
2885   adds(product, product, carry);
2886   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2887 
2888   subw(kdx, kdx, 2);
2889   ror(product, product, 32); // back to big-endian
2890   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2891 
2892   b(L_first_loop);
2893 
2894   bind(L_one_y);
2895   ldrw(y_idx, Address(y,  0));
2896   b(L_multiply);
2897 
2898   bind(L_one_x);
2899   ldrw(x_xstart, Address(x,  0));
2900   b(L_first_loop);
2901 
2902   bind(L_first_loop_exit);
2903 }
2904 
2905 /**
2906  * Multiply 128 bit by 128 bit. Unrolled inner loop.
2907  *
2908  */
2909 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2910                                              Register carry, Register carry2,
2911                                              Register idx, Register jdx,
2912                                              Register yz_idx1, Register yz_idx2,
2913                                              Register tmp, Register tmp3, Register tmp4,
2914                                              Register tmp6, Register product_hi) {
2915 
2916   //   jlong carry, x[], y[], z[];
2917   //   int kdx = ystart+1;
2918   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2919   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2920   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2921   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2922   //     carry  = (jlong)(tmp4 >>> 64);
2923   //     z[kdx+idx+1] = (jlong)tmp3;
2924   //     z[kdx+idx] = (jlong)tmp4;
2925   //   }
2926   //   idx += 2;
2927   //   if (idx > 0) {
2928   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2929   //     z[kdx+idx] = (jlong)yz_idx1;
2930   //     carry  = (jlong)(yz_idx1 >>> 64);
2931   //   }
2932   //
2933 
2934   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2935 
2936   lsrw(jdx, idx, 2);
2937 
2938   bind(L_third_loop);
2939 
2940   subsw(jdx, jdx, 1);
2941   br(Assembler::MI, L_third_loop_exit);
2942   subw(idx, idx, 4);
2943 
2944   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2945 
2946   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2947 
2948   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2949 
2950   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2951   ror(yz_idx2, yz_idx2, 32);
2952 
2953   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2954 
2955   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2956   umulh(tmp4, product_hi, yz_idx1);
2957 
2958   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2959   ror(rscratch2, rscratch2, 32);
2960 
2961   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2962   umulh(carry2, product_hi, yz_idx2);
2963 
2964   // propagate sum of both multiplications into carry:tmp4:tmp3
2965   adds(tmp3, tmp3, carry);
2966   adc(tmp4, tmp4, zr);
2967   adds(tmp3, tmp3, rscratch1);
2968   adcs(tmp4, tmp4, tmp);
2969   adc(carry, carry2, zr);
2970   adds(tmp4, tmp4, rscratch2);
2971   adc(carry, carry, zr);
2972 
2973   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2974   ror(tmp4, tmp4, 32);
2975   stp(tmp4, tmp3, Address(tmp6, 0));
2976 
2977   b(L_third_loop);
2978   bind (L_third_loop_exit);
2979 
2980   andw (idx, idx, 0x3);
2981   cbz(idx, L_post_third_loop_done);
2982 
2983   Label L_check_1;
2984   subsw(idx, idx, 2);
2985   br(Assembler::MI, L_check_1);
2986 
2987   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2988   ldr(yz_idx1, Address(rscratch1, 0));
2989   ror(yz_idx1, yz_idx1, 32);
2990   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2991   umulh(tmp4, product_hi, yz_idx1);
2992   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2993   ldr(yz_idx2, Address(rscratch1, 0));
2994   ror(yz_idx2, yz_idx2, 32);
2995 
2996   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2997 
2998   ror(tmp3, tmp3, 32);
2999   str(tmp3, Address(rscratch1, 0));
3000 
3001   bind (L_check_1);
3002 
3003   andw (idx, idx, 0x1);
3004   subsw(idx, idx, 1);
3005   br(Assembler::MI, L_post_third_loop_done);
3006   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3007   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
3008   umulh(carry2, tmp4, product_hi);
3009   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3010 
3011   add2_with_carry(carry2, tmp3, tmp4, carry);
3012 
3013   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3014   extr(carry, carry2, tmp3, 32);
3015 
3016   bind(L_post_third_loop_done);
3017 }
3018 
3019 /**
3020  * Code for BigInteger::multiplyToLen() intrinsic.
3021  *
3022  * r0: x
3023  * r1: xlen
3024  * r2: y
3025  * r3: ylen
3026  * r4: z
3027  * r5: zlen
3028  * r10: tmp1
3029  * r11: tmp2
3030  * r12: tmp3
3031  * r13: tmp4
3032  * r14: tmp5
3033  * r15: tmp6
3034  * r16: tmp7
3035  *
3036  */
3037 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3038                                      Register z, Register zlen,
3039                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3040                                      Register tmp5, Register tmp6, Register product_hi) {
3041 
3042   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3043 
3044   const Register idx = tmp1;
3045   const Register kdx = tmp2;
3046   const Register xstart = tmp3;
3047 
3048   const Register y_idx = tmp4;
3049   const Register carry = tmp5;
3050   const Register product  = xlen;
3051   const Register x_xstart = zlen;  // reuse register
3052 
3053   // First Loop.
3054   //
3055   //  final static long LONG_MASK = 0xffffffffL;
3056   //  int xstart = xlen - 1;
3057   //  int ystart = ylen - 1;
3058   //  long carry = 0;
3059   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3060   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3061   //    z[kdx] = (int)product;
3062   //    carry = product >>> 32;
3063   //  }
3064   //  z[xstart] = (int)carry;
3065   //
3066 
3067   movw(idx, ylen);      // idx = ylen;
3068   movw(kdx, zlen);      // kdx = xlen+ylen;
3069   mov(carry, zr);       // carry = 0;
3070 
3071   Label L_done;
3072 
3073   movw(xstart, xlen);
3074   subsw(xstart, xstart, 1);
3075   br(Assembler::MI, L_done);
3076 
3077   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3078 
3079   Label L_second_loop;
3080   cbzw(kdx, L_second_loop);
3081 
3082   Label L_carry;
3083   subw(kdx, kdx, 1);
3084   cbzw(kdx, L_carry);
3085 
3086   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3087   lsr(carry, carry, 32);
3088   subw(kdx, kdx, 1);
3089 
3090   bind(L_carry);
3091   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3092 
3093   // Second and third (nested) loops.
3094   //
3095   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3096   //   carry = 0;
3097   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3098   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3099   //                    (z[k] & LONG_MASK) + carry;
3100   //     z[k] = (int)product;
3101   //     carry = product >>> 32;
3102   //   }
3103   //   z[i] = (int)carry;
3104   // }
3105   //
3106   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3107 
3108   const Register jdx = tmp1;
3109 
3110   bind(L_second_loop);
3111   mov(carry, zr);                // carry = 0;
3112   movw(jdx, ylen);               // j = ystart+1
3113 
3114   subsw(xstart, xstart, 1);      // i = xstart-1;
3115   br(Assembler::MI, L_done);
3116 
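       // Spill z, ylen, x and xstart into a 4-word stack area around the call
       // to multiply_128_x_128_loop: z at [sp, #0], ylen at [sp, #8], x at
       // [sp, #16] and xstart at [sp, #24]; they are reloaded by the two
       // post-indexed ldp instructions below (old xstart ends up in xlen).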
3117   str(z, Address(pre(sp, -4 * wordSize)));
3118 
3119   Label L_last_x;
3120   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3121   subsw(xstart, xstart, 1);       // i = xstart-1;
3122   br(Assembler::MI, L_last_x);
3123 
3124   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3125   ldr(product_hi, Address(rscratch1));
3126   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3127 
3128   Label L_third_loop_prologue;
3129   bind(L_third_loop_prologue);
3130 
3131   str(ylen, Address(sp, wordSize));
3132   stp(x, xstart, Address(sp, 2 * wordSize));
3133   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3134                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3135   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3136   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3137 
3138   addw(tmp3, xlen, 1);
3139   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3140   subsw(tmp3, tmp3, 1);
3141   br(Assembler::MI, L_done);
3142 
3143   lsr(carry, carry, 32);
3144   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3145   b(L_second_loop);
3146 
3147   // The following infrequent code is moved out of the loops.
3148   bind(L_last_x);
3149   ldrw(product_hi, Address(x,  0));
3150   b(L_third_loop_prologue);
3151 
3152   bind(L_done);
3153 }
3154 
3155 // Code for BigInteger::mulAdd intrinsic
3156 // out     = r0
3157 // in      = r1
3158 // offset  = r2  (already out.length-offset)
3159 // len     = r3
3160 // k       = r4
3161 //
3162 // pseudo code from java implementation:
3163 // carry = 0;
3164 // offset = out.length-offset - 1;
3165 // for (int j=len-1; j >= 0; j--) {
3166 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3167 //     out[offset--] = (int)product;
3168 //     carry = product >>> 32;
3169 // }
3170 // return (int)carry;
3171 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3172       Register len, Register k) {
3173     Label LOOP, END;
3174     // pre-loop
3175     cmp(len, zr); // cmp, not cbz/cbnz: to use the condition twice => fewer branches
3176     csel(out, zr, out, Assembler::EQ);
3177     br(Assembler::EQ, END);
3178     add(in, in, len, LSL, 2); // in[j+1] address
3179     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3180     mov(out, zr); // used to keep carry now
3181     BIND(LOOP);
3182     ldrw(rscratch1, Address(pre(in, -4)));
3183     madd(rscratch1, rscratch1, k, out);
3184     ldrw(rscratch2, Address(pre(offset, -4)));
3185     add(rscratch1, rscratch1, rscratch2);
3186     strw(rscratch1, Address(offset));
3187     lsr(out, rscratch1, 32);
3188     subs(len, len, 1);
3189     br(Assembler::NE, LOOP);
3190     BIND(END);
3191 }
3192 
3193 /**
3194  * Emits code to update CRC-32 with a byte value according to constants in table
3195  *
3196  * @param [in,out]crc   Register containing the crc.
3197  * @param [in]val       Register containing the byte to fold into the CRC.
3198  * @param [in]table     Register containing the table of crc constants.
3199  *
3200  * uint32_t crc;
3201  * val = crc_table[(val ^ crc) & 0xFF];
3202  * crc = val ^ (crc >> 8);
3203  *
3204  */
3205 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3206   eor(val, val, crc);
3207   andr(val, val, 0xff);
3208   ldrw(val, Address(table, val, Address::lsl(2)));
3209   eor(crc, val, crc, Assembler::LSR, 8);
3210 }
3211 
3212 /**
3213  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3214  *
3215  * @param [in,out]crc   Register containing the crc.
3216  * @param [in]v         Register containing the 32-bit value to fold into the CRC.
3217  * @param [in]table0    Register containing table 0 of crc constants.
3218  * @param [in]table1    Register containing table 1 of crc constants.
3219  * @param [in]table2    Register containing table 2 of crc constants.
3220  * @param [in]table3    Register containing table 3 of crc constants.
3221  *
3222  * uint32_t crc;
3223  *   v = crc ^ v
3224  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3225  *
3226  */
3227 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3228         Register table0, Register table1, Register table2, Register table3,
3229         bool upper) {
3230   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3231   uxtb(tmp, v);
3232   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3233   ubfx(tmp, v, 8, 8);
3234   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3235   eor(crc, crc, tmp);
3236   ubfx(tmp, v, 16, 8);
3237   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3238   eor(crc, crc, tmp);
3239   ubfx(tmp, v, 24, 8);
3240   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3241   eor(crc, crc, tmp);
3242 }
3243 
3244 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3245         Register len, Register tmp0, Register tmp1, Register tmp2,
3246         Register tmp3) {
3247     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3248     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
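         // The buffer is consumed using the hardware CRC32 instructions: a
         // 64-byte unrolled main loop (entered only while at least 128 bytes
         // remain, so its loads can run one iteration ahead of the CRC
         // computation), followed by 32-byte, 4-byte and 1-byte tail loops.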
3249 
3250     mvnw(crc, crc);
3251 
3252     subs(len, len, 128);
3253     br(Assembler::GE, CRC_by64_pre);
3254   BIND(CRC_less64);
3255     adds(len, len, 128-32);
3256     br(Assembler::GE, CRC_by32_loop);
3257   BIND(CRC_less32);
3258     adds(len, len, 32-4);
3259     br(Assembler::GE, CRC_by4_loop);
3260     adds(len, len, 4);
3261     br(Assembler::GT, CRC_by1_loop);
3262     b(L_exit);
3263 
3264   BIND(CRC_by32_loop);
3265     ldp(tmp0, tmp1, Address(post(buf, 16)));
3266     subs(len, len, 32);
3267     crc32x(crc, crc, tmp0);
3268     ldr(tmp2, Address(post(buf, 8)));
3269     crc32x(crc, crc, tmp1);
3270     ldr(tmp3, Address(post(buf, 8)));
3271     crc32x(crc, crc, tmp2);
3272     crc32x(crc, crc, tmp3);
3273     br(Assembler::GE, CRC_by32_loop);
3274     cmn(len, 32);
3275     br(Assembler::NE, CRC_less32);
3276     b(L_exit);
3277 
3278   BIND(CRC_by4_loop);
3279     ldrw(tmp0, Address(post(buf, 4)));
3280     subs(len, len, 4);
3281     crc32w(crc, crc, tmp0);
3282     br(Assembler::GE, CRC_by4_loop);
3283     adds(len, len, 4);
3284     br(Assembler::LE, L_exit);
3285   BIND(CRC_by1_loop);
3286     ldrb(tmp0, Address(post(buf, 1)));
3287     subs(len, len, 1);
3288     crc32b(crc, crc, tmp0);
3289     br(Assembler::GT, CRC_by1_loop);
3290     b(L_exit);
3291 
3292   BIND(CRC_by64_pre);
3293     sub(buf, buf, 8);
3294     ldp(tmp0, tmp1, Address(buf, 8));
3295     crc32x(crc, crc, tmp0);
3296     ldr(tmp2, Address(buf, 24));
3297     crc32x(crc, crc, tmp1);
3298     ldr(tmp3, Address(buf, 32));
3299     crc32x(crc, crc, tmp2);
3300     ldr(tmp0, Address(buf, 40));
3301     crc32x(crc, crc, tmp3);
3302     ldr(tmp1, Address(buf, 48));
3303     crc32x(crc, crc, tmp0);
3304     ldr(tmp2, Address(buf, 56));
3305     crc32x(crc, crc, tmp1);
3306     ldr(tmp3, Address(pre(buf, 64)));
3307 
3308     b(CRC_by64_loop);
3309 
3310     align(CodeEntryAlignment);
3311   BIND(CRC_by64_loop);
3312     subs(len, len, 64);
3313     crc32x(crc, crc, tmp2);
3314     ldr(tmp0, Address(buf, 8));
3315     crc32x(crc, crc, tmp3);
3316     ldr(tmp1, Address(buf, 16));
3317     crc32x(crc, crc, tmp0);
3318     ldr(tmp2, Address(buf, 24));
3319     crc32x(crc, crc, tmp1);
3320     ldr(tmp3, Address(buf, 32));
3321     crc32x(crc, crc, tmp2);
3322     ldr(tmp0, Address(buf, 40));
3323     crc32x(crc, crc, tmp3);
3324     ldr(tmp1, Address(buf, 48));
3325     crc32x(crc, crc, tmp0);
3326     ldr(tmp2, Address(buf, 56));
3327     crc32x(crc, crc, tmp1);
3328     ldr(tmp3, Address(pre(buf, 64)));
3329     br(Assembler::GE, CRC_by64_loop);
3330 
3331     // post-loop
3332     crc32x(crc, crc, tmp2);
3333     crc32x(crc, crc, tmp3);
3334 
3335     sub(len, len, 64);
3336     add(buf, buf, 8);
3337     cmn(len, 128);
3338     br(Assembler::NE, CRC_less64);
3339   BIND(L_exit);
3340     mvnw(crc, crc);
3341 }
3342 
3343 /**
3344  * @param crc   register containing existing CRC (32-bit)
3345  * @param buf   register pointing to input byte buffer (byte*)
3346  * @param len   register containing number of bytes
3347  * @param table0..table3 registers that will contain the addresses of the CRC tables
3348  * @param tmp, tmp2, tmp3  scratch registers
3349  */
3350 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3351         Register table0, Register table1, Register table2, Register table3,
3352         Register tmp, Register tmp2, Register tmp3) {
3353   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3354   unsigned long offset;
3355 
3356   if (UseCRC32) {
3357       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3358       return;
3359   }
3360 
3361     mvnw(crc, crc);
3362 
3363     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3364     if (offset) add(table0, table0, offset);
3365     add(table1, table0, 1*256*sizeof(juint));
3366     add(table2, table0, 2*256*sizeof(juint));
3367     add(table3, table0, 3*256*sizeof(juint));
3368 
3369   if (UseNeon) {
3370       cmp(len, (u1)64);
3371       br(Assembler::LT, L_by16);
3372       eor(v16, T16B, v16, v16);
3373 
3374     Label L_fold;
3375 
3376       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3377 
3378       ld1(v0, v1, T2D, post(buf, 32));
3379       ld1r(v4, T2D, post(tmp, 8));
3380       ld1r(v5, T2D, post(tmp, 8));
3381       ld1r(v6, T2D, post(tmp, 8));
3382       ld1r(v7, T2D, post(tmp, 8));
3383       mov(v16, T4S, 0, crc);
3384 
3385       eor(v0, T16B, v0, v16);
3386       sub(len, len, 64);
3387 
3388     BIND(L_fold);
3389       pmull(v22, T8H, v0, v5, T8B);
3390       pmull(v20, T8H, v0, v7, T8B);
3391       pmull(v23, T8H, v0, v4, T8B);
3392       pmull(v21, T8H, v0, v6, T8B);
3393 
3394       pmull2(v18, T8H, v0, v5, T16B);
3395       pmull2(v16, T8H, v0, v7, T16B);
3396       pmull2(v19, T8H, v0, v4, T16B);
3397       pmull2(v17, T8H, v0, v6, T16B);
3398 
3399       uzp1(v24, T8H, v20, v22);
3400       uzp2(v25, T8H, v20, v22);
3401       eor(v20, T16B, v24, v25);
3402 
3403       uzp1(v26, T8H, v16, v18);
3404       uzp2(v27, T8H, v16, v18);
3405       eor(v16, T16B, v26, v27);
3406 
3407       ushll2(v22, T4S, v20, T8H, 8);
3408       ushll(v20, T4S, v20, T4H, 8);
3409 
3410       ushll2(v18, T4S, v16, T8H, 8);
3411       ushll(v16, T4S, v16, T4H, 8);
3412 
3413       eor(v22, T16B, v23, v22);
3414       eor(v18, T16B, v19, v18);
3415       eor(v20, T16B, v21, v20);
3416       eor(v16, T16B, v17, v16);
3417 
3418       uzp1(v17, T2D, v16, v20);
3419       uzp2(v21, T2D, v16, v20);
3420       eor(v17, T16B, v17, v21);
3421 
3422       ushll2(v20, T2D, v17, T4S, 16);
3423       ushll(v16, T2D, v17, T2S, 16);
3424 
3425       eor(v20, T16B, v20, v22);
3426       eor(v16, T16B, v16, v18);
3427 
3428       uzp1(v17, T2D, v20, v16);
3429       uzp2(v21, T2D, v20, v16);
3430       eor(v28, T16B, v17, v21);
3431 
3432       pmull(v22, T8H, v1, v5, T8B);
3433       pmull(v20, T8H, v1, v7, T8B);
3434       pmull(v23, T8H, v1, v4, T8B);
3435       pmull(v21, T8H, v1, v6, T8B);
3436 
3437       pmull2(v18, T8H, v1, v5, T16B);
3438       pmull2(v16, T8H, v1, v7, T16B);
3439       pmull2(v19, T8H, v1, v4, T16B);
3440       pmull2(v17, T8H, v1, v6, T16B);
3441 
3442       ld1(v0, v1, T2D, post(buf, 32));
3443 
3444       uzp1(v24, T8H, v20, v22);
3445       uzp2(v25, T8H, v20, v22);
3446       eor(v20, T16B, v24, v25);
3447 
3448       uzp1(v26, T8H, v16, v18);
3449       uzp2(v27, T8H, v16, v18);
3450       eor(v16, T16B, v26, v27);
3451 
3452       ushll2(v22, T4S, v20, T8H, 8);
3453       ushll(v20, T4S, v20, T4H, 8);
3454 
3455       ushll2(v18, T4S, v16, T8H, 8);
3456       ushll(v16, T4S, v16, T4H, 8);
3457 
3458       eor(v22, T16B, v23, v22);
3459       eor(v18, T16B, v19, v18);
3460       eor(v20, T16B, v21, v20);
3461       eor(v16, T16B, v17, v16);
3462 
3463       uzp1(v17, T2D, v16, v20);
3464       uzp2(v21, T2D, v16, v20);
3465       eor(v16, T16B, v17, v21);
3466 
3467       ushll2(v20, T2D, v16, T4S, 16);
3468       ushll(v16, T2D, v16, T2S, 16);
3469 
3470       eor(v20, T16B, v22, v20);
3471       eor(v16, T16B, v16, v18);
3472 
3473       uzp1(v17, T2D, v20, v16);
3474       uzp2(v21, T2D, v20, v16);
3475       eor(v20, T16B, v17, v21);
3476 
3477       shl(v16, T2D, v28, 1);
3478       shl(v17, T2D, v20, 1);
3479 
3480       eor(v0, T16B, v0, v16);
3481       eor(v1, T16B, v1, v17);
3482 
3483       subs(len, len, 32);
3484       br(Assembler::GE, L_fold);
3485 
3486       mov(crc, 0);
3487       mov(tmp, v0, T1D, 0);
3488       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3489       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3490       mov(tmp, v0, T1D, 1);
3491       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3492       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3493       mov(tmp, v1, T1D, 0);
3494       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3495       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3496       mov(tmp, v1, T1D, 1);
3497       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3498       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3499 
3500       add(len, len, 32);
3501   }
3502 
3503   BIND(L_by16);
3504     subs(len, len, 16);
3505     br(Assembler::GE, L_by16_loop);
3506     adds(len, len, 16-4);
3507     br(Assembler::GE, L_by4_loop);
3508     adds(len, len, 4);
3509     br(Assembler::GT, L_by1_loop);
3510     b(L_exit);
3511 
3512   BIND(L_by4_loop);
3513     ldrw(tmp, Address(post(buf, 4)));
3514     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3515     subs(len, len, 4);
3516     br(Assembler::GE, L_by4_loop);
3517     adds(len, len, 4);
3518     br(Assembler::LE, L_exit);
3519   BIND(L_by1_loop);
3520     subs(len, len, 1);
3521     ldrb(tmp, Address(post(buf, 1)));
3522     update_byte_crc32(crc, tmp, table0);
3523     br(Assembler::GT, L_by1_loop);
3524     b(L_exit);
3525 
3526     align(CodeEntryAlignment);
3527   BIND(L_by16_loop);
3528     subs(len, len, 16);
3529     ldp(tmp, tmp3, Address(post(buf, 16)));
3530     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3531     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3532     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3533     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3534     br(Assembler::GE, L_by16_loop);
3535     adds(len, len, 16-4);
3536     br(Assembler::GE, L_by4_loop);
3537     adds(len, len, 4);
3538     br(Assembler::GT, L_by1_loop);
3539   BIND(L_exit);
3540     mvnw(crc, crc);
3541 }
3542 
3543 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3544         Register len, Register tmp0, Register tmp1, Register tmp2,
3545         Register tmp3) {
3546     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3547     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3548 
3549     subs(len, len, 128);
3550     br(Assembler::GE, CRC_by64_pre);
3551   BIND(CRC_less64);
3552     adds(len, len, 128-32);
3553     br(Assembler::GE, CRC_by32_loop);
3554   BIND(CRC_less32);
3555     adds(len, len, 32-4);
3556     br(Assembler::GE, CRC_by4_loop);
3557     adds(len, len, 4);
3558     br(Assembler::GT, CRC_by1_loop);
3559     b(L_exit);
3560 
3561   BIND(CRC_by32_loop);
3562     ldp(tmp0, tmp1, Address(post(buf, 16)));
3563     subs(len, len, 32);
3564     crc32cx(crc, crc, tmp0);
3565     ldr(tmp2, Address(post(buf, 8)));
3566     crc32cx(crc, crc, tmp1);
3567     ldr(tmp3, Address(post(buf, 8)));
3568     crc32cx(crc, crc, tmp2);
3569     crc32cx(crc, crc, tmp3);
3570     br(Assembler::GE, CRC_by32_loop);
3571     cmn(len, 32);
3572     br(Assembler::NE, CRC_less32);
3573     b(L_exit);
3574 
3575   BIND(CRC_by4_loop);
3576     ldrw(tmp0, Address(post(buf, 4)));
3577     subs(len, len, 4);
3578     crc32cw(crc, crc, tmp0);
3579     br(Assembler::GE, CRC_by4_loop);
3580     adds(len, len, 4);
3581     br(Assembler::LE, L_exit);
3582   BIND(CRC_by1_loop);
3583     ldrb(tmp0, Address(post(buf, 1)));
3584     subs(len, len, 1);
3585     crc32cb(crc, crc, tmp0);
3586     br(Assembler::GT, CRC_by1_loop);
3587     b(L_exit);
3588 
3589   BIND(CRC_by64_pre);
3590     sub(buf, buf, 8);
3591     ldp(tmp0, tmp1, Address(buf, 8));
3592     crc32cx(crc, crc, tmp0);
3593     ldr(tmp2, Address(buf, 24));
3594     crc32cx(crc, crc, tmp1);
3595     ldr(tmp3, Address(buf, 32));
3596     crc32cx(crc, crc, tmp2);
3597     ldr(tmp0, Address(buf, 40));
3598     crc32cx(crc, crc, tmp3);
3599     ldr(tmp1, Address(buf, 48));
3600     crc32cx(crc, crc, tmp0);
3601     ldr(tmp2, Address(buf, 56));
3602     crc32cx(crc, crc, tmp1);
3603     ldr(tmp3, Address(pre(buf, 64)));
3604 
3605     b(CRC_by64_loop);
3606 
3607     align(CodeEntryAlignment);
3608   BIND(CRC_by64_loop);
3609     subs(len, len, 64);
3610     crc32cx(crc, crc, tmp2);
3611     ldr(tmp0, Address(buf, 8));
3612     crc32cx(crc, crc, tmp3);
3613     ldr(tmp1, Address(buf, 16));
3614     crc32cx(crc, crc, tmp0);
3615     ldr(tmp2, Address(buf, 24));
3616     crc32cx(crc, crc, tmp1);
3617     ldr(tmp3, Address(buf, 32));
3618     crc32cx(crc, crc, tmp2);
3619     ldr(tmp0, Address(buf, 40));
3620     crc32cx(crc, crc, tmp3);
3621     ldr(tmp1, Address(buf, 48));
3622     crc32cx(crc, crc, tmp0);
3623     ldr(tmp2, Address(buf, 56));
3624     crc32cx(crc, crc, tmp1);
3625     ldr(tmp3, Address(pre(buf, 64)));
3626     br(Assembler::GE, CRC_by64_loop);
3627 
3628     // post-loop
3629     crc32cx(crc, crc, tmp2);
3630     crc32cx(crc, crc, tmp3);
3631 
3632     sub(len, len, 64);
3633     add(buf, buf, 8);
3634     cmn(len, 128);
3635     br(Assembler::NE, CRC_less64);
3636   BIND(L_exit);
3637 }
3638 
3639 /**
3640  * @param crc   register containing existing CRC (32-bit)
3641  * @param buf   register pointing to input byte buffer (byte*)
3642  * @param len   register containing number of bytes
3643  * @param table0..table3 registers that will contain the addresses of the CRC tables
3644  * @param tmp, tmp2, tmp3  scratch registers
3645  */
3646 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3647         Register table0, Register table1, Register table2, Register table3,
3648         Register tmp, Register tmp2, Register tmp3) {
3649   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3650 }
3651 
3652 
3653 SkipIfEqual::SkipIfEqual(
3654     MacroAssembler* masm, const bool* flag_addr, bool value) {
3655   _masm = masm;
3656   unsigned long offset;
3657   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3658   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3659   _masm->cbzw(rscratch1, _label);
3660 }
3661 
3662 SkipIfEqual::~SkipIfEqual() {
3663   _masm->bind(_label);
3664 }
3665 
3666 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3667   Address adr;
3668   switch(dst.getMode()) {
3669   case Address::base_plus_offset:
3670     // This is the expected mode, although we allow all the other
3671     // forms below.
3672     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3673     break;
3674   default:
3675     lea(rscratch2, dst);
3676     adr = Address(rscratch2);
3677     break;
3678   }
3679   ldr(rscratch1, adr);
3680   add(rscratch1, rscratch1, src);
3681   str(rscratch1, adr);
3682 }
3683 
3684 void MacroAssembler::cmpptr(Register src1, Address src2) {
3685   unsigned long offset;
3686   adrp(rscratch1, src2, offset);
3687   ldr(rscratch1, Address(rscratch1, offset));
3688   cmp(src1, rscratch1);
3689 }
3690 
3691 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3692   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3693   bs->obj_equals(this, obj1, obj2);
3694 }
3695 
3696 void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
3697   load_method_holder(rresult, rmethod);
3698   ldr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
3699 }
3700 
3701 void MacroAssembler::load_method_holder(Register holder, Register method) {
3702   ldr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
3703   ldr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
3704   ldr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
3705 }
3706 
3707 void MacroAssembler::load_klass(Register dst, Register src) {
3708   if (UseCompressedClassPointers) {
3709     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3710     decode_klass_not_null(dst);
3711   } else {
3712     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3713   }
3714 }
3715 
3716 // ((OopHandle)result).resolve();
3717 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3718   // OopHandle::resolve is an indirection.
3719   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3720 }
3721 
3722 // ((WeakHandle)result).resolve();
3723 void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
3724   assert_different_registers(rresult, rtmp);
3725   Label resolved;
3726 
3727   // A null weak handle resolves to null.
3728   cbz(rresult, resolved);
3729 
3730   // Only 64 bit platforms support GCs that require a tmp register
3731   // Only IN_HEAP loads require a thread_tmp register
3732   // WeakHandle::resolve is an indirection like jweak.
3733   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
3734                  rresult, Address(rresult), rtmp, /*tmp_thread*/noreg);
3735   bind(resolved);
3736 }
3737 
3738 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3739   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3740   ldr(dst, Address(rmethod, Method::const_offset()));
3741   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3742   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3743   ldr(dst, Address(dst, mirror_offset));
3744   resolve_oop_handle(dst, tmp);
3745 }
3746 
3747 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3748   if (UseCompressedClassPointers) {
3749     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3750     if (CompressedKlassPointers::base() == NULL) {
3751       cmp(trial_klass, tmp, LSL, CompressedKlassPointers::shift());
3752       return;
3753     } else if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
3754                && CompressedKlassPointers::shift() == 0) {
3755       // Only the bottom 32 bits matter
3756       cmpw(trial_klass, tmp);
3757       return;
3758     }
3759     decode_klass_not_null(tmp);
3760   } else {
3761     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3762   }
3763   cmp(trial_klass, tmp);
3764 }
3765 
3766 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3767   load_klass(dst, src);
3768   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3769 }
3770 
3771 void MacroAssembler::store_klass(Register dst, Register src) {
3772   // FIXME: Should this be a store release?  Concurrent GCs assume the
3773   // klass length is valid if the klass field is not null.
3774   if (UseCompressedClassPointers) {
3775     encode_klass_not_null(src);
3776     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3777   } else {
3778     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3779   }
3780 }
3781 
3782 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3783   if (UseCompressedClassPointers) {
3784     // Store to klass gap in destination
3785     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3786   }
3787 }
3788 
3789 // Algorithm must match CompressedOops::encode.
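     // With a non-NULL base this computes, in C terms (illustrative only),
     //   d = (s == NULL) ? 0 : (uint64_t)(s - base) >> shift;
     // the csel below keeps the result zero when the input oop is NULL.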
3790 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3791 #ifdef ASSERT
3792   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3793 #endif
3794   verify_oop(s, "broken oop in encode_heap_oop");
3795   if (CompressedOops::base() == NULL) {
3796     if (CompressedOops::shift() != 0) {
3797       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3798       lsr(d, s, LogMinObjAlignmentInBytes);
3799     } else {
3800       mov(d, s);
3801     }
3802   } else {
3803     subs(d, s, rheapbase);
3804     csel(d, d, zr, Assembler::HS);
3805     lsr(d, d, LogMinObjAlignmentInBytes);
3806 
3807     /*  Old algorithm: is this any worse?
3808     Label nonnull;
3809     cbnz(r, nonnull);
3810     sub(r, r, rheapbase);
3811     bind(nonnull);
3812     lsr(r, r, LogMinObjAlignmentInBytes);
3813     */
3814   }
3815 }
3816 
3817 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3818 #ifdef ASSERT
3819   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3820   if (CheckCompressedOops) {
3821     Label ok;
3822     cbnz(r, ok);
3823     stop("null oop passed to encode_heap_oop_not_null");
3824     bind(ok);
3825   }
3826 #endif
3827   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3828   if (CompressedOops::base() != NULL) {
3829     sub(r, r, rheapbase);
3830   }
3831   if (CompressedOops::shift() != 0) {
3832     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3833     lsr(r, r, LogMinObjAlignmentInBytes);
3834   }
3835 }
3836 
3837 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3838 #ifdef ASSERT
3839   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3840   if (CheckCompressedOops) {
3841     Label ok;
3842     cbnz(src, ok);
3843     stop("null oop passed to encode_heap_oop_not_null2");
3844     bind(ok);
3845   }
3846 #endif
3847   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3848 
3849   Register data = src;
3850   if (CompressedOops::base() != NULL) {
3851     sub(dst, src, rheapbase);
3852     data = dst;
3853   }
3854   if (CompressedOops::shift() != 0) {
3855     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3856     lsr(dst, data, LogMinObjAlignmentInBytes);
3857     data = dst;
3858   }
3859   if (data == src)
3860     mov(dst, src);
3861 }
3862 
3863 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3864 #ifdef ASSERT
3865   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3866 #endif
3867   if (CompressedOops::base() == NULL) {
3868     if (CompressedOops::shift() != 0 || d != s) {
3869       lsl(d, s, CompressedOops::shift());
3870     }
3871   } else {
3872     Label done;
3873     if (d != s)
3874       mov(d, s);
3875     cbz(s, done);
3876     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3877     bind(done);
3878   }
3879   verify_oop(d, "broken oop in decode_heap_oop");
3880 }
3881 
3882 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3883   assert (UseCompressedOops, "should only be used for compressed headers");
3884   assert (Universe::heap() != NULL, "java heap should be initialized");
3885   // Cannot assert, unverified entry point counts instructions (see .ad file)
3886   // vtableStubs also counts instructions in pd_code_size_limit.
3887   // Also do not verify_oop as this is called by verify_oop.
3888   if (CompressedOops::shift() != 0) {
3889     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3890     if (CompressedOops::base() != NULL) {
3891       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3892     } else {
3893       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3894     }
3895   } else {
3896     assert (CompressedOops::base() == NULL, "sanity");
3897   }
3898 }
3899 
3900 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3901   assert (UseCompressedOops, "should only be used for compressed headers");
3902   assert (Universe::heap() != NULL, "java heap should be initialized");
3903   // Cannot assert, unverified entry point counts instructions (see .ad file)
3904   // vtableStubs also counts instructions in pd_code_size_limit.
3905   // Also do not verify_oop as this is called by verify_oop.
3906   if (CompressedOops::shift() != 0) {
3907     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3908     if (CompressedOops::base() != NULL) {
3909       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3910     } else {
3911       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3912     }
3913   } else {
3914     assert (CompressedOops::base() == NULL, "sanity");
3915     if (dst != src) {
3916       mov(dst, src);
3917     }
3918   }
3919 }
3920 
3921 MacroAssembler::KlassDecodeMode MacroAssembler::_klass_decode_mode(KlassDecodeNone);
3922 
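     // Chooses how compressed class pointers are encoded/decoded:
     //  - KlassDecodeZero: the encoding base is NULL, so only a shift (if any)
     //    is needed.
     //  - KlassDecodeXor:  the base is a valid logical immediate with no bits
     //    inside the compressed-class range, so an eor with the base can add
     //    or remove it.
     //  - KlassDecodeMovk: otherwise; the shifted base may only have bits
     //    [32..47] set, so a single movk can reinsert it on decode.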
3923 MacroAssembler::KlassDecodeMode MacroAssembler::klass_decode_mode() {
3924   assert(UseCompressedClassPointers, "not using compressed class pointers");
3925   assert(Metaspace::initialized(), "metaspace not initialized yet");
3926 
3927   if (_klass_decode_mode != KlassDecodeNone) {
3928     return _klass_decode_mode;
3929   }
3930 
3931   assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift()
3932          || 0 == CompressedKlassPointers::shift(), "decode alg wrong");
3933 
3934   if (CompressedKlassPointers::base() == NULL) {
3935     return (_klass_decode_mode = KlassDecodeZero);
3936   }
3937 
3938   if (operand_valid_for_logical_immediate(
3939         /*is32*/false, (uint64_t)CompressedKlassPointers::base())) {
3940     const uint64_t range_mask =
3941       (1UL << log2_intptr(CompressedKlassPointers::range())) - 1;
3942     if (((uint64_t)CompressedKlassPointers::base() & range_mask) == 0) {
3943       return (_klass_decode_mode = KlassDecodeXor);
3944     }
3945   }
3946 
3947   const uint64_t shifted_base =
3948     (uint64_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift();
3949   guarantee((shifted_base & 0xffff0000ffffffff) == 0,
3950             "compressed class base bad alignment");
3951 
3952   return (_klass_decode_mode = KlassDecodeMovk);
3953 }
3954 
3955 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3956   switch (klass_decode_mode()) {
3957   case KlassDecodeZero:
3958     if (CompressedKlassPointers::shift() != 0) {
3959       lsr(dst, src, LogKlassAlignmentInBytes);
3960     } else {
3961       if (dst != src) mov(dst, src);
3962     }
3963     break;
3964 
3965   case KlassDecodeXor:
3966     if (CompressedKlassPointers::shift() != 0) {
3967       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
3968       lsr(dst, dst, LogKlassAlignmentInBytes);
3969     } else {
3970       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
3971     }
3972     break;
3973 
3974   case KlassDecodeMovk:
3975     if (CompressedKlassPointers::shift() != 0) {
3976       ubfx(dst, src, LogKlassAlignmentInBytes, 32);
3977     } else {
3978       movw(dst, src);
3979     }
3980     break;
3981 
3982   case KlassDecodeNone:
3983     ShouldNotReachHere();
3984     break;
3985   }
3986 }
3987 
3988 void MacroAssembler::encode_klass_not_null(Register r) {
3989   encode_klass_not_null(r, r);
3990 }
3991 
3992 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3993   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3994 
3995   switch (klass_decode_mode()) {
3996   case KlassDecodeZero:
3997     if (CompressedKlassPointers::shift() != 0) {
3998       lsl(dst, src, LogKlassAlignmentInBytes);
3999     } else {
4000       if (dst != src) mov(dst, src);
4001     }
4002     break;
4003 
4004   case KlassDecodeXor:
4005     if (CompressedKlassPointers::shift() != 0) {
4006       lsl(dst, src, LogKlassAlignmentInBytes);
4007       eor(dst, dst, (uint64_t)CompressedKlassPointers::base());
4008     } else {
4009       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
4010     }
4011     break;
4012 
4013   case KlassDecodeMovk: {
4014     const uint64_t shifted_base =
4015       (uint64_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift();
4016 
4017     if (dst != src) movw(dst, src);
4018     movk(dst, shifted_base >> 32, 32);
4019 
4020     if (CompressedKlassPointers::shift() != 0) {
4021       lsl(dst, dst, LogKlassAlignmentInBytes);
4022     }
4023 
4024     break;
4025   }
4026 
4027   case KlassDecodeNone:
4028     ShouldNotReachHere();
4029     break;
4030   }
4031 }
4032 
4033 void  MacroAssembler::decode_klass_not_null(Register r) {
4034   decode_klass_not_null(r, r);
4035 }
4036 
4037 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4038 #ifdef ASSERT
4039   {
4040     ThreadInVMfromUnknown tiv;
4041     assert (UseCompressedOops, "should only be used for compressed oops");
4042     assert (Universe::heap() != NULL, "java heap should be initialized");
4043     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4044     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4045   }
4046 #endif
4047   int oop_index = oop_recorder()->find_index(obj);
4048   InstructionMark im(this);
4049   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4050   code_section()->relocate(inst_mark(), rspec);
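       // 0xDEAD/0xBEEF below are placeholder immediates; the oop relocation
       // recorded above patches this movz/movk pair with the real narrow oop.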
4051   movz(dst, 0xDEAD, 16);
4052   movk(dst, 0xBEEF);
4053 }
4054 
4055 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4056   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4057   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4058   int index = oop_recorder()->find_index(k);
4059   assert(! Universe::heap()->is_in(k), "should not be an oop");
4060 
4061   InstructionMark im(this);
4062   RelocationHolder rspec = metadata_Relocation::spec(index);
4063   code_section()->relocate(inst_mark(), rspec);
4064   narrowKlass nk = CompressedKlassPointers::encode(k);
4065   movz(dst, (nk >> 16), 16);
4066   movk(dst, nk & 0xffff);
4067 }
4068 
4069 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4070                                     Register dst, Address src,
4071                                     Register tmp1, Register thread_tmp) {
4072   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4073   decorators = AccessInternal::decorator_fixup(decorators);
4074   bool as_raw = (decorators & AS_RAW) != 0;
4075   if (as_raw) {
4076     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4077   } else {
4078     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4079   }
4080 }
4081 
4082 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4083                                      Address dst, Register src,
4084                                      Register tmp1, Register thread_tmp) {
4085   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4086   decorators = AccessInternal::decorator_fixup(decorators);
4087   bool as_raw = (decorators & AS_RAW) != 0;
4088   if (as_raw) {
4089     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4090   } else {
4091     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4092   }
4093 }
4094 
4095 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
4096   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
4097   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
4098     decorators |= ACCESS_READ | ACCESS_WRITE;
4099   }
4100   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4101   return bs->resolve(this, decorators, obj);
4102 }
4103 
4104 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4105                                    Register thread_tmp, DecoratorSet decorators) {
4106   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4107 }
4108 
4109 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4110                                             Register thread_tmp, DecoratorSet decorators) {
4111   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4112 }
4113 
4114 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4115                                     Register thread_tmp, DecoratorSet decorators) {
4116   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4117 }
4118 
4119 // Used for storing NULLs.
4120 void MacroAssembler::store_heap_oop_null(Address dst) {
4121   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4122 }
4123 
4124 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4125   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4126   int index = oop_recorder()->allocate_metadata_index(obj);
4127   RelocationHolder rspec = metadata_Relocation::spec(index);
4128   return Address((address)obj, rspec);
4129 }
4130 
4131 // Move an oop into a register.  immediate is true if we want immediate
4132 // instructions rather than a constant-pool load; it is honoured only when
4133 // nmethod entry barriers are not enabled, i.e. when we are not going to patch
4134 // this instruction while the code is being executed by another thread.
4135 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4136   int oop_index;
4137   if (obj == NULL) {
4138     oop_index = oop_recorder()->allocate_oop_index(obj);
4139   } else {
4140 #ifdef ASSERT
4141     {
4142       ThreadInVMfromUnknown tiv;
4143       assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4144     }
4145 #endif
4146     oop_index = oop_recorder()->find_index(obj);
4147   }
4148   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4149 
4150   // nmethod entry barriers necessitate using the constant pool. They have to be
4151   // ordered with respect to oop accesses.
4152   // Using immediate literals would necessitate ISBs.
4153   if (BarrierSet::barrier_set()->barrier_set_nmethod() != NULL || !immediate) {
4154     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4155     ldr_constant(dst, Address(dummy, rspec));
4156   } else
4157     mov(dst, Address((address)obj, rspec));
4158 
4159 }
4160 
4161 // Move a metadata address into a register.
4162 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4163   int oop_index;
4164   if (obj == NULL) {
4165     oop_index = oop_recorder()->allocate_metadata_index(obj);
4166   } else {
4167     oop_index = oop_recorder()->find_index(obj);
4168   }
4169   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4170   mov(dst, Address((address)obj, rspec));
4171 }
4172 
4173 Address MacroAssembler::constant_oop_address(jobject obj) {
4174 #ifdef ASSERT
4175   {
4176     ThreadInVMfromUnknown tiv;
4177     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4178     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "not an oop");
4179   }
4180 #endif
4181   int oop_index = oop_recorder()->find_index(obj);
4182   return Address((address)obj, oop_Relocation::spec(oop_index));
4183 }
4184 
4185 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4186 void MacroAssembler::tlab_allocate(Register obj,
4187                                    Register var_size_in_bytes,
4188                                    int con_size_in_bytes,
4189                                    Register t1,
4190                                    Register t2,
4191                                    Label& slow_case) {
4192   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4193   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4194 }
4195 
4196 // Defines obj, preserves var_size_in_bytes
4197 void MacroAssembler::eden_allocate(Register obj,
4198                                    Register var_size_in_bytes,
4199                                    int con_size_in_bytes,
4200                                    Register t1,
4201                                    Label& slow_case) {
4202   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4203   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4204 }
4205 
4206 // Zero words; len is in bytes
4207 // Destroys all registers except addr
4208 // len must be a nonzero multiple of wordSize
4209 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4210   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4211 
4212 #ifdef ASSERT
4213   { Label L;
4214     tst(len, BytesPerWord - 1);
4215     br(Assembler::EQ, L);
4216     stop("len is not a multiple of BytesPerWord");
4217     bind(L);
4218   }
4219 #endif
4220 
4221 #ifndef PRODUCT
4222   block_comment("zero memory");
4223 #endif
4224 
4225   Label loop;
4226   Label entry;
4227 
4228 //  Algorithm:
4229 //
4230 //    scratch1 = cnt & 7;
4231 //    cnt -= scratch1;
4232 //    p += scratch1;
4233 //    switch (scratch1) {
4234 //      do {
4235 //        cnt -= 8;
4236 //          p[-8] = 0;
4237 //        case 7:
4238 //          p[-7] = 0;
4239 //        case 6:
4240 //          p[-6] = 0;
4241 //          // ...
4242 //        case 1:
4243 //          p[-1] = 0;
4244 //        case 0:
4245 //          p += 8;
4246 //      } while (cnt);
4247 //    }
4248 
4249   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4250 
4251   lsr(len, len, LogBytesPerWord);
4252   andr(rscratch1, len, unroll - 1);  // tmp1 = cnt % unroll
4253   sub(len, len, rscratch1);      // cnt -= unroll
4254   // t1 always points to the end of the region we're about to zero
4255   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
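       // Branch into the middle of the unrolled loop below so that only the
       // last (len % unroll) stores of the first pass execute, zeroing the
       // leading remainder words; each str is 4 bytes long, hence the scaled
       // subtraction from the address of 'entry'.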
4256   adr(rscratch2, entry);
4257   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4258   br(rscratch2);
4259   bind(loop);
4260   sub(len, len, unroll);
4261   for (int i = -unroll; i < 0; i++)
4262     Assembler::str(zr, Address(t1, i * wordSize));
4263   bind(entry);
4264   add(t1, t1, unroll * wordSize);
4265   cbnz(len, loop);
4266 }
4267 
4268 void MacroAssembler::verify_tlab() {
4269 #ifdef ASSERT
4270   if (UseTLAB && VerifyOops) {
4271     Label next, ok;
4272 
4273     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4274 
4275     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4276     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4277     cmp(rscratch2, rscratch1);
4278     br(Assembler::HS, next);
4279     STOP("assert(top >= start)");
4280     should_not_reach_here();
4281 
4282     bind(next);
4283     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4284     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4285     cmp(rscratch2, rscratch1);
4286     br(Assembler::HS, ok);
4287     STOP("assert(top <= end)");
4288     should_not_reach_here();
4289 
4290     bind(ok);
4291     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4292   }
4293 #endif
4294 }
4295 
4296 // Writes to successive stack pages until the given size is reached, to check
4297 // for stack overflow + shadow pages.  This clobbers tmp.
4298 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4299   assert_different_registers(tmp, size, rscratch1);
4300   mov(tmp, sp);
4301   // Bang stack for total size given plus shadow page size.
4302   // Bang one page at a time because large size can bang beyond yellow and
4303   // red zones.
4304   Label loop;
4305   mov(rscratch1, os::vm_page_size());
4306   bind(loop);
4307   lea(tmp, Address(tmp, -os::vm_page_size()));
4308   subsw(size, size, rscratch1);
4309   str(size, Address(tmp));
4310   br(Assembler::GT, loop);
4311 
4312   // Bang down shadow pages too.
4313   // At this point, (tmp-0) is the last address touched, so don't
4314   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4315   // was post-decremented.)  Skip this address by starting at i=1, and
4316   // touch a few more pages below.  N.B.  It is important to touch all
4317   // the way down to and including i=StackShadowPages.
4318   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
4319     // This could be any sized move, but it can serve as a debugging crumb,
4320     // so the bigger the better.
4321     lea(tmp, Address(tmp, -os::vm_page_size()));
4322     str(size, Address(tmp));
4323   }
4324 }
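
// In effect (an illustrative sketch, not generated code): for a requested
// size S and page size P, the loop above touches
//     sp - P, sp - 2P, ... until S bytes have been covered,
// and then (stack_shadow_zone_size / P) - 1 further pages below the last
// banged address, so every page a frame of size S plus the shadow zone could
// reach has been written at least once.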
4325 
4326 // Move the address of the polling page into dest.
4327 void MacroAssembler::get_polling_page(Register dest, relocInfo::relocType rtype) {
4328   ldr(dest, Address(rthread, Thread::polling_page_offset()));
4329 }
4330 
4331 // Move the address of the polling page into r, then read the polling
4332 // page.
4333 address MacroAssembler::fetch_and_read_polling_page(Register r, relocInfo::relocType rtype) {
4334   get_polling_page(r, rtype);
4335   return read_polling_page(r, rtype);
4336 }
4337 
4338 // Read the polling page.  The address of the polling page must
4339 // already be in r.
4340 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4341   InstructionMark im(this);
4342   code_section()->relocate(inst_mark(), rtype);
4343   ldrw(zr, Address(r, 0));
4344   return inst_mark();
4345 }
4346 
4347 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4348   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4349   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4350   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4351   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4352   long offset_low = dest_page - low_page;
4353   long offset_high = dest_page - high_page;
4354 
4355   assert(is_valid_AArch64_address(dest.target()), "bad address");
4356   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4357 
4358   InstructionMark im(this);
4359   code_section()->relocate(inst_mark(), dest.rspec());
4360   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4361   // the code cache, so that if it is relocated we know it will still reach the target.
4362   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4363     _adrp(reg1, dest.target());
4364   } else {
4365     unsigned long target = (unsigned long)dest.target();
4366     unsigned long adrp_target
4367       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4368 
4369     _adrp(reg1, (address)adrp_target);
4370     movk(reg1, target >> 32, 32);
4371   }
4372   byte_offset = (unsigned long)dest.target() & 0xfff;
4373 }
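
// Sketch of the out-of-range case above (illustrative): when the target page
// cannot be guaranteed reachable from every possible location in the code
// cache, we synthesize an address whose low 32 bits match the real target and
// whose bits 32-47 are copied from the current pc.  That synthetic address is
// always within adrp's +/-4GB range, and the following movk rewrites bits
// 32-47 to the target's real value; the low 12 bits are returned in
// byte_offset for the caller to add.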
4374 
4375 void MacroAssembler::load_byte_map_base(Register reg) {
4376   CardTable::CardValue* byte_map_base =
4377     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4378 
4379   if (is_valid_AArch64_address((address)byte_map_base)) {
4380     // Strictly speaking the byte_map_base isn't an address at all,
4381     // and it might even be negative.
4382     unsigned long offset;
4383     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4384     // We expect offset to be zero with most collectors.
4385     if (offset != 0) {
4386       add(reg, reg, offset);
4387     }
4388   } else {
4389     mov(reg, (uint64_t)byte_map_base);
4390   }
4391 }
4392 
4393 void MacroAssembler::build_frame(int framesize) {
4394   assert(framesize > 0, "framesize must be > 0");
4395   if (framesize < ((1 << 9) + 2 * wordSize)) {
4396     sub(sp, sp, framesize);
4397     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4398     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4399   } else {
4400     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4401     if (PreserveFramePointer) mov(rfp, sp);
4402     if (framesize < ((1 << 12) + 2 * wordSize))
4403       sub(sp, sp, framesize - 2 * wordSize);
4404     else {
4405       mov(rscratch1, framesize - 2 * wordSize);
4406       sub(sp, sp, rscratch1);
4407     }
4408   }
4409 }
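
// Frame layout produced above (sketch): sp .. sp+framesize is the new frame,
// with the saved rfp/lr pair at sp+framesize - 2*wordSize.  The
// (1 << 9) + 2*wordSize threshold keeps the stp's scaled immediate offset
// (framesize - 16) within the signed range allowed for a 64-bit register
// pair; larger frames save rfp/lr first and then drop sp, using rscratch1
// once the subtraction no longer fits in an immediate.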
4410 
4411 void MacroAssembler::remove_frame(int framesize) {
4412   assert(framesize > 0, "framesize must be > 0");
4413   if (framesize < ((1 << 9) + 2 * wordSize)) {
4414     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4415     add(sp, sp, framesize);
4416   } else {
4417     if (framesize < ((1 << 12) + 2 * wordSize))
4418       add(sp, sp, framesize - 2 * wordSize);
4419     else {
4420       mov(rscratch1, framesize - 2 * wordSize);
4421       add(sp, sp, rscratch1);
4422     }
4423     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4424   }
4425 }
4426 
4427 
4428 // This method checks whether the provided byte array contains a byte with the highest bit set.
4429 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
4430     // The simple and most common case, a small aligned array that does not end
4431     // near a memory page boundary, is handled here.  All other cases go to the stub.
4432     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
4433     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
4434     assert_different_registers(ary1, len, result);
4435 
4436     cmpw(len, 0);
4437     br(LE, SET_RESULT);
4438     cmpw(len, 4 * wordSize);
4439     br(GE, STUB_LONG); // if size >= 32 go to the long stub
4440 
4441     int shift = 64 - exact_log2(os::vm_page_size());
4442     lsl(rscratch1, ary1, shift);
4443     mov(rscratch2, (size_t)(4 * wordSize) << shift);
4444     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
4445     br(CS, STUB); // if within 32 bytes of the end of the page, go to the stub
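    // How the page check above works (a sketch): shifting ary1 left by
    // 64 - log2(page_size) leaves only the in-page offset, now in the top
    // bits of rscratch1.  Adding (32 << shift) then carries out of bit 63
    // exactly when offset + 32 >= page_size.  For example, with 4K pages
    // (shift == 52) and an in-page offset of 0xff8, the sum is 0x1018 << 52,
    // which overflows 64 bits and sets the carry flag.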
4446     subs(len, len, wordSize);
4447     br(LT, END);
4448 
4449   BIND(LOOP);
4450     ldr(rscratch1, Address(post(ary1, wordSize)));
4451     tst(rscratch1, UPPER_BIT_MASK);
4452     br(NE, SET_RESULT);
4453     subs(len, len, wordSize);
4454     br(GE, LOOP);
4455     cmpw(len, -wordSize);
4456     br(EQ, SET_RESULT);
4457 
4458   BIND(END);
4459     ldr(result, Address(ary1));
4460     sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
4461     lslv(result, result, len);
4462     tst(result, UPPER_BIT_MASK);
4463     b(SET_RESULT);
4464 
4465   BIND(STUB);
4466     RuntimeAddress has_neg =  RuntimeAddress(StubRoutines::aarch64::has_negatives());
4467     assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
4468     trampoline_call(has_neg);
4469     b(DONE);
4470 
4471   BIND(STUB_LONG);
4472     RuntimeAddress has_neg_long =  RuntimeAddress(
4473             StubRoutines::aarch64::has_negatives_long());
4474     assert(has_neg_long.target() != NULL, "has_negatives_long stub has not been generated");
4475     trampoline_call(has_neg_long);
4476     b(DONE);
4477 
4478   BIND(SET_RESULT);
4479     cset(result, NE); // set true or false
4480 
4481   BIND(DONE);
4482 }
4483 
4484 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
4485                                    Register tmp4, Register tmp5, Register result,
4486                                    Register cnt1, int elem_size) {
4487   Label DONE, SAME;
4488   Register tmp1 = rscratch1;
4489   Register tmp2 = rscratch2;
4490   Register cnt2 = tmp2;  // cnt2 only used in array length compare
4491   int elem_per_word = wordSize/elem_size;
4492   int log_elem_size = exact_log2(elem_size);
4493   int length_offset = arrayOopDesc::length_offset_in_bytes();
4494   int base_offset
4495     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
4496   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
4497 
4498   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
4499   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
4500 
4501 #ifndef PRODUCT
4502   {
4503     const char kind = (elem_size == 2) ? 'U' : 'L';
4504     char comment[64];
4505     snprintf(comment, sizeof comment, "array_equals%c{", kind);
4506     BLOCK_COMMENT(comment);
4507   }
4508 #endif
4509 
4510   // if (a1 == a2)
4511   //     return true;
4512   cmpoop(a1, a2); // May have read barriers for a1 and a2.
4513   br(EQ, SAME);
4514 
4515   if (UseSimpleArrayEquals) {
4516     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
4517     // if (a1 == null || a2 == null)
4518     //     return false;
4519     // (a1 & a2) == 0 means at least one pointer is null, or (very rarely,
4520     // perhaps never in practice) two non-null pointers with no bits in common,
4521     // so we can save one branch in most cases
4522     tst(a1, a2);
4523     mov(result, false);
4524     br(EQ, A_MIGHT_BE_NULL);
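    // Sketch of the null check above: tst sets the flags on a1 & a2, so a
    // single EQ branch covers "a1 == null or a2 == null", since either null
    // makes the AND zero.  The rare false positive, two non-null pointers
    // whose set bits do not overlap, is handled on the A_MIGHT_BE_NULL path,
    // which re-checks each pointer individually before jumping back.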
4525     // if (a1.length != a2.length)
4526     //      return false;
4527     bind(A_IS_NOT_NULL);
4528     ldrw(cnt1, Address(a1, length_offset));
4529     ldrw(cnt2, Address(a2, length_offset));
4530     eorw(tmp5, cnt1, cnt2);
4531     cbnzw(tmp5, DONE);
4532     lea(a1, Address(a1, base_offset));
4533     lea(a2, Address(a2, base_offset));
4534     // Check for short arrays, i.e. smaller than wordSize.
4535     subs(cnt1, cnt1, elem_per_word);
4536     br(Assembler::LT, SHORT);
4537     // Main 8 byte comparison loop.
4538     bind(NEXT_WORD); {
4539       ldr(tmp1, Address(post(a1, wordSize)));
4540       ldr(tmp2, Address(post(a2, wordSize)));
4541       subs(cnt1, cnt1, elem_per_word);
4542       eor(tmp5, tmp1, tmp2);
4543       cbnz(tmp5, DONE);
4544     } br(GT, NEXT_WORD);
4545     // Last longword.  In the case where length == 4 we compare the
4546     // same longword twice, but that's still faster than another
4547     // conditional branch.
4548     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
4549     // length == 4.
4550     if (log_elem_size > 0)
4551       lsl(cnt1, cnt1, log_elem_size);
4552     ldr(tmp3, Address(a1, cnt1));
4553     ldr(tmp4, Address(a2, cnt1));
4554     eor(tmp5, tmp3, tmp4);
4555     cbnz(tmp5, DONE);
4556     b(SAME);
4557     bind(A_MIGHT_BE_NULL);
4558     // if both a1 and a2 turn out to be non-null, proceed with the loads
4559     cbz(a1, DONE);
4560     cbz(a2, DONE);
4561     b(A_IS_NOT_NULL);
4562     bind(SHORT);
4563 
4564     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
4565     {
4566       ldrw(tmp1, Address(post(a1, 4)));
4567       ldrw(tmp2, Address(post(a2, 4)));
4568       eorw(tmp5, tmp1, tmp2);
4569       cbnzw(tmp5, DONE);
4570     }
4571     bind(TAIL03);
4572     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
4573     {
4574       ldrh(tmp3, Address(post(a1, 2)));
4575       ldrh(tmp4, Address(post(a2, 2)));
4576       eorw(tmp5, tmp3, tmp4);
4577       cbnzw(tmp5, DONE);
4578     }
4579     bind(TAIL01);
4580     if (elem_size == 1) { // Only needed when comparing byte arrays.
4581       tbz(cnt1, 0, SAME); // 0-1 bytes left.
4582       {
4583         ldrb(tmp1, a1);
4584         ldrb(tmp2, a2);
4585         eorw(tmp5, tmp1, tmp2);
4586         cbnzw(tmp5, DONE);
4587       }
4588     }
4589   } else {
4590     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
4591         CSET_EQ, LAST_CHECK;
4592     mov(result, false);
4593     cbz(a1, DONE);
4594     ldrw(cnt1, Address(a1, length_offset));
4595     cbz(a2, DONE);
4596     ldrw(cnt2, Address(a2, length_offset));
4597     // on most CPUs a2 is still "locked" (surprisingly) by the ldrw, and it's
4598     // faster to perform another branch before comparing a1 and a2
4599     cmp(cnt1, (u1)elem_per_word);
4600     br(LE, SHORT); // short or same
4601     ldr(tmp3, Address(pre(a1, base_offset)));
4602     subs(zr, cnt1, stubBytesThreshold);
4603     br(GE, STUB);
4604     ldr(tmp4, Address(pre(a2, base_offset)));
4605     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
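    // tmp5 = -(array length in bits).  Its low six bits give 64 minus the
    // number of significant bits in the final, possibly partial, longword, so
    // a later lslv by tmp5 shifts any bytes beyond the array off the top of
    // the register before the comparison.  For example (illustrative), if only
    // 3 chars of the final longword are significant, the effective shift is
    // 16 and the unused fourth char slot is discarded.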
4606     cmp(cnt2, cnt1);
4607     br(NE, DONE);
4608 
4609     // Main 16 byte comparison loop with 2 exits
4610     bind(NEXT_DWORD); {
4611       ldr(tmp1, Address(pre(a1, wordSize)));
4612       ldr(tmp2, Address(pre(a2, wordSize)));
4613       subs(cnt1, cnt1, 2 * elem_per_word);
4614       br(LE, TAIL);
4615       eor(tmp4, tmp3, tmp4);
4616       cbnz(tmp4, DONE);
4617       ldr(tmp3, Address(pre(a1, wordSize)));
4618       ldr(tmp4, Address(pre(a2, wordSize)));
4619       cmp(cnt1, (u1)elem_per_word);
4620       br(LE, TAIL2);
4621       cmp(tmp1, tmp2);
4622     } br(EQ, NEXT_DWORD);
4623     b(DONE);
4624 
4625     bind(TAIL);
4626     eor(tmp4, tmp3, tmp4);
4627     eor(tmp2, tmp1, tmp2);
4628     lslv(tmp2, tmp2, tmp5);
4629     orr(tmp5, tmp4, tmp2);
4630     cmp(tmp5, zr);
4631     b(CSET_EQ);
4632 
4633     bind(TAIL2);
4634     eor(tmp2, tmp1, tmp2);
4635     cbnz(tmp2, DONE);
4636     b(LAST_CHECK);
4637 
4638     bind(STUB);
4639     ldr(tmp4, Address(pre(a2, base_offset)));
4640     cmp(cnt2, cnt1);
4641     br(NE, DONE);
4642     if (elem_size == 2) { // convert to byte counter
4643       lsl(cnt1, cnt1, 1);
4644     }
4645     eor(tmp5, tmp3, tmp4);
4646     cbnz(tmp5, DONE);
4647     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
4648     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
4649     trampoline_call(stub);
4650     b(DONE);
4651 
4652     bind(EARLY_OUT);
4653     // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
4654     // so if a2 == null we return false (0), otherwise true; either way we can just return a2
4655     mov(result, a2);
4656     b(DONE);
4657     bind(SHORT);
4658     cmp(cnt2, cnt1);
4659     br(NE, DONE);
4660     cbz(cnt1, SAME);
4661     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
4662     ldr(tmp3, Address(a1, base_offset));
4663     ldr(tmp4, Address(a2, base_offset));
4664     bind(LAST_CHECK);
4665     eor(tmp4, tmp3, tmp4);
4666     lslv(tmp5, tmp4, tmp5);
4667     cmp(tmp5, zr);
4668     bind(CSET_EQ);
4669     cset(result, EQ);
4670     b(DONE);
4671   }
4672 
4673   bind(SAME);
4674   mov(result, true);
4675   // That's it.
4676   bind(DONE);
4677 
4678   BLOCK_COMMENT("} array_equals");
4679 }
4680 
4681 // Compare Strings
4682 
4683 // For Strings we're passed the address of the first characters in a1
4684 // and a2 and the length in cnt1.
4685 // elem_size is the element size in bytes: either 1 or 2.
4686 // There are two implementations.  For arrays >= 8 bytes, all
4687 // comparisons (including the final one, which may overlap) are
4688 // performed 8 bytes at a time.  For strings < 8 bytes, we compare a
4689 // word, then a halfword, and then a byte.
4690 
4691 void MacroAssembler::string_equals(Register a1, Register a2,
4692                                    Register result, Register cnt1, int elem_size)
4693 {
4694   Label SAME, DONE, SHORT, NEXT_WORD;
4695   Register tmp1 = rscratch1;
4696   Register tmp2 = rscratch2;
4697   Register cnt2 = tmp2;  // cnt2 only used in array length compare
4698 
4699   assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
4700   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
4701 
4702 #ifndef PRODUCT
4703   {
4704     const char kind = (elem_size == 2) ? 'U' : 'L';
4705     char comment[64];
4706     snprintf(comment, sizeof comment, "{string_equals%c", kind);
4707     BLOCK_COMMENT(comment);
4708   }
4709 #endif
4710 
4711   mov(result, false);
4712 
4713   // Check for short strings, i.e. smaller than wordSize.
4714   subs(cnt1, cnt1, wordSize);
4715   br(Assembler::LT, SHORT);
4716   // Main 8 byte comparison loop.
4717   bind(NEXT_WORD); {
4718     ldr(tmp1, Address(post(a1, wordSize)));
4719     ldr(tmp2, Address(post(a2, wordSize)));
4720     subs(cnt1, cnt1, wordSize);
4721     eor(tmp1, tmp1, tmp2);
4722     cbnz(tmp1, DONE);
4723   } br(GT, NEXT_WORD);
4724   // Last longword.  In the case where length == 8 we compare the
4725   // same longword twice, but that's still faster than another
4726   // conditional branch.
4727   // cnt1 could be 0, -1, ..., -8 here; -8 only happens when
4728   // length == 8.
4729   ldr(tmp1, Address(a1, cnt1));
4730   ldr(tmp2, Address(a2, cnt1));
4731   eor(tmp2, tmp1, tmp2);
4732   cbnz(tmp2, DONE);
4733   b(SAME);
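  // Worked example of the overlapping tail above (illustrative): with
  // length == 10, the loop compares bytes 0-7 and leaves cnt1 == -6 with
  // a1/a2 advanced by 8, so the final ldr pair at offset cnt1 re-reads
  // bytes 2-9; bytes 2-7 are compared twice, which is still cheaper than
  // branching on the exact remainder.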
4734 
4735   bind(SHORT);
4736   Label TAIL03, TAIL01;
4737 
4738   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
4739   {
4740     ldrw(tmp1, Address(post(a1, 4)));
4741     ldrw(tmp2, Address(post(a2, 4)));
4742     eorw(tmp1, tmp1, tmp2);
4743     cbnzw(tmp1, DONE);
4744   }
4745   bind(TAIL03);
4746   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
4747   {
4748     ldrh(tmp1, Address(post(a1, 2)));
4749     ldrh(tmp2, Address(post(a2, 2)));
4750     eorw(tmp1, tmp1, tmp2);
4751     cbnzw(tmp1, DONE);
4752   }
4753   bind(TAIL01);
4754   if (elem_size == 1) { // Only needed when comparing 1-byte elements
4755     tbz(cnt1, 0, SAME); // 0-1 bytes left.
4756     {
4757       ldrb(tmp1, a1);
4758       ldrb(tmp2, a2);
4759       eorw(tmp1, tmp1, tmp2);
4760       cbnzw(tmp1, DONE);
4761     }
4762   }
4763   // Arrays are equal.
4764   bind(SAME);
4765   mov(result, true);
4766 
4767   // That's it.
4768   bind(DONE);
4769   BLOCK_COMMENT("} string_equals");
4770 }
4771 
4772 
4773 // The size of the blocks zeroed by the zero_blocks stub.  We must
4774 // handle anything smaller than this ourselves in zero_words().
4775 const int MacroAssembler::zero_words_block_size = 8;
4776 
4777 // zero_words() is used by C2 ClearArray patterns.  It is as small as
4778 // possible, handling small word counts locally and delegating
4779 // anything larger to the zero_blocks stub.  It is expanded many times
4780 // in compiled code, so it is important to keep it short.
4781 
4782 // ptr:   Address of a buffer to be zeroed.
4783 // cnt:   Count in HeapWords.
4784 //
4785 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
4786 void MacroAssembler::zero_words(Register ptr, Register cnt)
4787 {
4788   assert(is_power_of_2(zero_words_block_size), "adjust this");
4789   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
4790 
4791   BLOCK_COMMENT("zero_words {");
4792   cmp(cnt, (u1)zero_words_block_size);
4793   Label around;
4794   br(LO, around);
4795   {
4796     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
4797     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
4798     if (StubRoutines::aarch64::complete()) {
4799       trampoline_call(zero_blocks);
4800     } else {
4801       bl(zero_blocks);
4802     }
4803   }
4804   bind(around);
4805   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
4806     Label l;
4807     tbz(cnt, exact_log2(i), l);
4808     for (int j = 0; j < i; j += 2) {
4809       stp(zr, zr, post(ptr, 16));
4810     }
4811     bind(l);
4812   }
4813   {
4814     Label l;
4815     tbz(cnt, 0, l);
4816     str(zr, Address(ptr));
4817     bind(l);
4818   }
4819   BLOCK_COMMENT("} zero_words");
4820 }
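
// Sketch of the tail handling above (assuming, as the stub path is
// documented, that whole blocks are zeroed elsewhere and cnt's low bits still
// hold the remaining 0..7 words): bit 2 of cnt selects two stp's (4 words),
// bit 1 one stp (2 words), and bit 0 a single str.  For example, cnt == 5
// skips the stub entirely and is handled as 4 + 0 + 1 words.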
4821 
4822 // base:         Address of a buffer to be zeroed, 8-byte aligned.
4823 // cnt:          Immediate count in HeapWords.
4824 #define SmallArraySize (18 * BytesPerLong)
4825 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
4826 {
4827   BLOCK_COMMENT("zero_words {");
4828   int i = cnt & 1;  // store any odd word to start
4829   if (i) str(zr, Address(base));
4830 
4831   if (cnt <= SmallArraySize / BytesPerLong) {
4832     for (; i < (int)cnt; i += 2)
4833       stp(zr, zr, Address(base, i * wordSize));
4834   } else {
4835     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
4836     int remainder = cnt % (2 * unroll);
4837     for (; i < remainder; i += 2)
4838       stp(zr, zr, Address(base, i * wordSize));
4839 
4840     Label loop;
4841     Register cnt_reg = rscratch1;
4842     Register loop_base = rscratch2;
4843     cnt = cnt - remainder;
4844     mov(cnt_reg, cnt);
4845     // adjust base and prebias by -2 * wordSize so we can pre-increment
4846     add(loop_base, base, (remainder - 2) * wordSize);
4847     bind(loop);
4848     sub(cnt_reg, cnt_reg, 2 * unroll);
4849     for (i = 1; i < unroll; i++)
4850       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
4851     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
4852     cbnz(cnt_reg, loop);
4853   }
4854   BLOCK_COMMENT("} zero_words");
4855 }
4856 
4857 // Zero blocks of memory by using DC ZVA.
4858 //
4859 // First aligns the base address sufficiently for DC ZVA, then uses
4860 // DC ZVA repeatedly for every full block.  cnt is the size to be
4861 // zeroed in HeapWords.  Returns the count of words left to be zeroed
4862 // in cnt.
4863 //
4864 // NOTE: This is intended to be used in the zero_blocks() stub.  If
4865 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
4866 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
4867   Register tmp = rscratch1;
4868   Register tmp2 = rscratch2;
4869   int zva_length = VM_Version::zva_length();
4870   Label initial_table_end, loop_zva;
4871   Label fini;
4872 
4873   // Base must be 16-byte aligned. If not, just return and let the caller handle it.
4874   tst(base, 0x0f);
4875   br(Assembler::NE, fini);
4876   // Align base with ZVA length.
4877   neg(tmp, base);
4878   andr(tmp, tmp, zva_length - 1);
4879 
4880   // tmp: the number of bytes to be filled to align the base with ZVA length.
4881   add(base, base, tmp);
4882   sub(cnt, cnt, tmp, Assembler::ASR, 3);
4883   adr(tmp2, initial_table_end);
4884   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
4885   br(tmp2);
4886 
4887   for (int i = -zva_length + 16; i < 0; i += 16)
4888     stp(zr, zr, Address(base, i));
4889   bind(initial_table_end);
4890 
4891   sub(cnt, cnt, zva_length >> 3);
4892   bind(loop_zva);
4893   dc(Assembler::ZVA, base);
4894   subs(cnt, cnt, zva_length >> 3);
4895   add(base, base, zva_length);
4896   br(Assembler::GE, loop_zva);
4897   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
4898   bind(fini);
4899 }
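
// Sketch of the alignment step above (illustrative): tmp holds the number of
// bytes needed to bring base up to a ZVA-length boundary.  Each stp in the
// table before initial_table_end zeroes 16 bytes and is 4 bytes of code, so
// branching to initial_table_end - (tmp >> 2) executes exactly tmp/16 of
// them.  For example, with zva_length == 64 and tmp == 48, the last three
// stp's run, zeroing the 48 bytes just below the (already advanced) base.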
4900 
4901 // base:   Address of a buffer to be filled, 8-byte aligned.
4902 // cnt:    Count in 8-byte units.
4903 // value:  Value to fill the buffer with.
4904 // base will point to the end of the buffer after filling.
4905 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
4906 {
4907 //  Algorithm:
4908 //
4909 //    scratch1 = cnt & 7;
4910 //    cnt -= scratch1;
4911 //    p += scratch1;
4912 //    switch (scratch1) {
4913 //      do {
4914 //        cnt -= 8;
4915 //          p[-8] = v;
4916 //        case 7:
4917 //          p[-7] = v;
4918 //        case 6:
4919 //          p[-6] = v;
4920 //          // ...
4921 //        case 1:
4922 //          p[-1] = v;
4923 //        case 0:
4924 //          p += 8;
4925 //      } while (cnt);
4926 //    }
4927 
4928   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
4929 
4930   Label fini, skip, entry, loop;
4931   const int unroll = 8; // Number of stp instructions we'll unroll
4932 
4933   cbz(cnt, fini);
4934   tbz(base, 3, skip);
4935   str(value, Address(post(base, 8)));
4936   sub(cnt, cnt, 1);
4937   bind(skip);
4938 
4939   andr(rscratch1, cnt, (unroll-1) * 2);
4940   sub(cnt, cnt, rscratch1);
4941   add(base, base, rscratch1, Assembler::LSL, 3);
4942   adr(rscratch2, entry);
4943   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
4944   br(rscratch2);
4945 
4946   bind(loop);
4947   add(base, base, unroll * 16);
4948   for (int i = -unroll; i < 0; i++)
4949     stp(value, value, Address(base, i * 16));
4950   bind(entry);
4951   subs(cnt, cnt, unroll * 2);
4952   br(Assembler::GE, loop);
4953 
4954   tbz(cnt, 0, fini);
4955   str(value, Address(post(base, 8)));
4956   bind(fini);
4957 }
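
// Note on the head and tail stores above: if base is not 16-byte aligned
// (bit 3 set), one word is stored first so the stp pairs that follow stay
// 16-byte aligned, and if cnt ends up odd a final single str handles the
// last word.  The computed branch into the unrolled stp table works like the
// one in the zero-memory routine earlier in this file, with each stp
// covering two words.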
4958 
4959 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
4960 // java/lang/StringUTF16.compress.
4961 void MacroAssembler::encode_iso_array(Register src, Register dst,
4962                       Register len, Register result,
4963                       FloatRegister Vtmp1, FloatRegister Vtmp2,
4964                       FloatRegister Vtmp3, FloatRegister Vtmp4)
4965 {
4966     Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
4967         NEXT_32_START, NEXT_32_PRFM_START;
4968     Register tmp1 = rscratch1, tmp2 = rscratch2;
4969 
4970       mov(result, len); // Save initial len
4971 
4972       cmp(len, (u1)8); // handle shortest strings first
4973       br(LT, LOOP_1);
4974       cmp(len, (u1)32);
4975       br(LT, NEXT_8);
4976       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
4977       // to convert chars to bytes
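      // Roughly: each char is two bytes, low byte first in memory.  uzp1
      // gathers the even-numbered (low) bytes of a register pair, which are
      // the Latin-1 values to store, while uzp2 gathers the odd-numbered
      // (high) bytes.  If any gathered high byte is non-zero the input is not
      // Latin-1; this is detected by or-ing the high-byte vector into a pair
      // of general registers and testing for zero.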
4978       if (SoftwarePrefetchHintDistance >= 0) {
4979         ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
4980         subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
4981         br(LE, NEXT_32_START);
4982         b(NEXT_32_PRFM_START);
4983         BIND(NEXT_32_PRFM);
4984           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
4985         BIND(NEXT_32_PRFM_START);
4986           prfm(Address(src, SoftwarePrefetchHintDistance));
4987           orr(v4, T16B, Vtmp1, Vtmp2);
4988           orr(v5, T16B, Vtmp3, Vtmp4);
4989           uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
4990           uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
4991           uzp2(v5, T16B, v4, v5); // high bytes
4992           umov(tmp2, v5, D, 1);
4993           fmovd(tmp1, v5);
4994           orr(tmp1, tmp1, tmp2);
4995           cbnz(tmp1, LOOP_8);
4996           stpq(Vtmp1, Vtmp3, dst);
4997           sub(len, len, 32);
4998           add(dst, dst, 32);
4999           add(src, src, 64);
5000           subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5001           br(GE, NEXT_32_PRFM);
5002           cmp(len, (u1)32);
5003           br(LT, LOOP_8);
5004         BIND(NEXT_32);
5005           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5006         BIND(NEXT_32_START);
5007       } else {
5008         BIND(NEXT_32);
5009           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5010       }
5011       prfm(Address(src, SoftwarePrefetchHintDistance));
5012       uzp1(v4, T16B, Vtmp1, Vtmp2);
5013       uzp1(v5, T16B, Vtmp3, Vtmp4);
5014       orr(Vtmp1, T16B, Vtmp1, Vtmp2);
5015       orr(Vtmp3, T16B, Vtmp3, Vtmp4);
5016       uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
5017       umov(tmp2, Vtmp1, D, 1);
5018       fmovd(tmp1, Vtmp1);
5019       orr(tmp1, tmp1, tmp2);
5020       cbnz(tmp1, LOOP_8);
5021       stpq(v4, v5, dst);
5022       sub(len, len, 32);
5023       add(dst, dst, 32);
5024       add(src, src, 64);
5025       cmp(len, (u1)32);
5026       br(GE, NEXT_32);
5027       cbz(len, DONE);
5028 
5029     BIND(LOOP_8);
5030       cmp(len, (u1)8);
5031       br(LT, LOOP_1);
5032     BIND(NEXT_8);
5033       ld1(Vtmp1, T8H, src);
5034       uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
5035       uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
5036       fmovd(tmp1, Vtmp3);
5037       cbnz(tmp1, NEXT_1);
5038       strd(Vtmp2, dst);
5039 
5040       sub(len, len, 8);
5041       add(dst, dst, 8);
5042       add(src, src, 16);
5043       cmp(len, (u1)8);
5044       br(GE, NEXT_8);
5045 
5046     BIND(LOOP_1);
5047 
5048     cbz(len, DONE);
5049     BIND(NEXT_1);
5050       ldrh(tmp1, Address(post(src, 2)));
5051       tst(tmp1, 0xff00);
5052       br(NE, SET_RESULT);
5053       strb(tmp1, Address(post(dst, 1)));
5054       subs(len, len, 1);
5055       br(GT, NEXT_1);
5056 
5057     BIND(SET_RESULT);
5058       sub(result, result, len); // Return the index where we stopped;
5059                                 // this equals the initial len when all
5060                                 // characters were processed
5061     BIND(DONE);
5062 }
5063 
5064 
5065 // Inflate byte[] array to char[].
5066 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5067                                         FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
5068                                         Register tmp4) {
5069   Label big, done, after_init, to_stub;
5070 
5071   assert_different_registers(src, dst, len, tmp4, rscratch1);
5072 
5073   fmovd(vtmp1, zr);
5074   lsrw(tmp4, len, 3);
5075   bind(after_init);
5076   cbnzw(tmp4, big);
5077   // Short string: less than 8 bytes.
5078   {
5079     Label loop, tiny;
5080 
5081     cmpw(len, 4);
5082     br(LT, tiny);
5083     // Use SIMD to do 4 bytes.
5084     ldrs(vtmp2, post(src, 4));
5085     zip1(vtmp3, T8B, vtmp2, vtmp1);
5086     subw(len, len, 4);
5087     strd(vtmp3, post(dst, 8));
5088 
5089     cbzw(len, done);
5090 
5091     // Do the remaining bytes one at a time.
5092     bind(loop);
5093     ldrb(tmp4, post(src, 1));
5094     strh(tmp4, post(dst, 2));
5095     subw(len, len, 1);
5096 
5097     bind(tiny);
5098     cbnz(len, loop);
5099 
5100     b(done);
5101   }
5102 
5103   if (SoftwarePrefetchHintDistance >= 0) {
5104     bind(to_stub);
5105       RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5106       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5107       trampoline_call(stub);
5108       b(after_init);
5109   }
5110 
5111   // Unpack the bytes 8 at a time.
5112   bind(big);
5113   {
5114     Label loop, around, loop_last, loop_start;
5115 
5116     if (SoftwarePrefetchHintDistance >= 0) {
5117       const int large_loop_threshold = (64 + 16)/8;
5118       ldrd(vtmp2, post(src, 8));
5119       andw(len, len, 7);
5120       cmp(tmp4, (u1)large_loop_threshold);
5121       br(GE, to_stub);
5122       b(loop_start);
5123 
5124       bind(loop);
5125       ldrd(vtmp2, post(src, 8));
5126       bind(loop_start);
5127       subs(tmp4, tmp4, 1);
5128       br(EQ, loop_last);
5129       zip1(vtmp2, T16B, vtmp2, vtmp1);
5130       ldrd(vtmp3, post(src, 8));
5131       st1(vtmp2, T8H, post(dst, 16));
5132       subs(tmp4, tmp4, 1);
5133       zip1(vtmp3, T16B, vtmp3, vtmp1);
5134       st1(vtmp3, T8H, post(dst, 16));
5135       br(NE, loop);
5136       b(around);
5137       bind(loop_last);
5138       zip1(vtmp2, T16B, vtmp2, vtmp1);
5139       st1(vtmp2, T8H, post(dst, 16));
5140       bind(around);
5141       cbz(len, done);
5142     } else {
5143       andw(len, len, 7);
5144       bind(loop);
5145       ldrd(vtmp2, post(src, 8));
5146       sub(tmp4, tmp4, 1);
5147       zip1(vtmp3, T16B, vtmp2, vtmp1);
5148       st1(vtmp3, T8H, post(dst, 16));
5149       cbnz(tmp4, loop);
5150     }
5151   }
5152 
5153   // Do the tail of up to 8 bytes.
5154   add(src, src, len);
5155   ldrd(vtmp3, Address(src, -8));
5156   add(dst, dst, len, ext::uxtw, 1);
5157   zip1(vtmp3, T16B, vtmp3, vtmp1);
5158   strq(vtmp3, Address(dst, -16));
5159 
5160   bind(done);
5161 }
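
// Sketch of the inflation above: zip1 interleaves the source bytes with the
// zeroed vtmp1, so each byte b becomes the little-endian 16-bit value with
// low byte b and high byte 0, i.e. the corresponding char.  The tail re-reads
// the last 8 source bytes and stores the last 16 destination bytes, which may
// overlap work already done but avoids a byte-by-byte cleanup loop.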
5162 
5163 // Compress char[] array to byte[].
5164 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5165                                          FloatRegister tmp1Reg, FloatRegister tmp2Reg,
5166                                          FloatRegister tmp3Reg, FloatRegister tmp4Reg,
5167                                          Register result) {
5168   encode_iso_array(src, dst, len, result,
5169                    tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
5170   cmp(len, zr);
5171   csel(result, result, zr, EQ);
5172 }
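
// Note: encode_iso_array leaves the number of unprocessed chars in len and
// the number it did process in result.  The csel above therefore returns the
// full length on success (len == 0) and 0 if a non-Latin-1 char stopped the
// compression early.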
5173 
5174 // get_thread() can be called anywhere inside generated code so we
5175 // need to save whatever non-callee save context might get clobbered
5176 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5177 // the call setup code.
5178 //
5179 // aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5180 //
5181 void MacroAssembler::get_thread(Register dst) {
5182   RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
5183   push(saved_regs, sp);
5184 
5185   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5186   blr(lr);
5187   if (dst != c_rarg0) {
5188     mov(dst, c_rarg0);
5189   }
5190 
5191   pop(saved_regs, sp);
5192 }
5193 
5194 void MacroAssembler::cache_wb(Address line) {
5195   assert(line.getMode() == Address::base_plus_offset, "mode should be base_plus_offset");
5196   assert(line.index() == noreg, "index should be noreg");
5197   assert(line.offset() == 0, "offset should be 0");
5198   // would like to assert this
5199   // assert(line._ext.shift == 0, "shift should be zero");
5200   if (VM_Version::supports_dcpop()) {
5201     // writeback using clean virtual address to point of persistence
5202     dc(Assembler::CVAP, line.base());
5203   } else {
5204     // no need to generate anything as Unsafe.writebackMemory should
5205     // never invoke this stub
5206   }
5207 }
5208 
5209 void MacroAssembler::cache_wbsync(bool is_pre) {
5210   // we only need a barrier post sync
5211   if (!is_pre) {
5212     membar(Assembler::AnyAny);
5213   }
5214 }