1 /*
   2  * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include <sys/types.h>
  27 
  28 #include "precompiled.hpp"
  29 #include "jvm.h"
  30 #include "asm/assembler.hpp"
  31 #include "asm/assembler.inline.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/cardTable.hpp"
  34 #include "gc/shared/barrierSetAssembler.hpp"
  35 #include "gc/shared/cardTableBarrierSet.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "compiler/disassembler.hpp"
  38 #include "memory/resourceArea.hpp"
  39 #include "nativeInst_aarch64.hpp"
  40 #include "oops/accessDecorators.hpp"
  41 #include "oops/compressedOops.inline.hpp"
  42 #include "oops/klass.inline.hpp"
  43 #include "runtime/biasedLocking.hpp"
  44 #include "runtime/icache.hpp"
  45 #include "runtime/interfaceSupport.inline.hpp"
  46 #include "runtime/jniHandles.inline.hpp"
  47 #include "runtime/sharedRuntime.hpp"
  48 #include "runtime/thread.hpp"
  49 #ifdef COMPILER1
  50 #include "c1/c1_LIRAssembler.hpp"
  51 #endif
  52 #ifdef COMPILER2
  53 #include "oops/oop.hpp"
  54 #include "opto/compile.hpp"
  55 #include "opto/intrinsicnode.hpp"
  56 #include "opto/node.hpp"
  57 #endif
  58 
  59 #ifdef PRODUCT
  60 #define BLOCK_COMMENT(str) /* nothing */
  61 #define STOP(error) stop(error)
  62 #else
  63 #define BLOCK_COMMENT(str) block_comment(str)
  64 #define STOP(error) block_comment(error); stop(error)
  65 #endif
  66 
  67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  68 
  69 // Patch any kind of instruction; there may be several instructions.
  70 // Return the total length (in bytes) of the instructions.
  71 int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  72   int instructions = 1;
  73   assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  74   long offset = (target - branch) >> 2;
  75   unsigned insn = *(unsigned*)branch;
  76   if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
  77     // Load register (literal)
  78     Instruction_aarch64::spatch(branch, 23, 5, offset);
  79   } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
  80     // Unconditional branch (immediate)
  81     Instruction_aarch64::spatch(branch, 25, 0, offset);
  82   } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
  83     // Conditional branch (immediate)
  84     Instruction_aarch64::spatch(branch, 23, 5, offset);
  85   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
  86     // Compare & branch (immediate)
  87     Instruction_aarch64::spatch(branch, 23, 5, offset);
  88   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
  89     // Test & branch (immediate)
  90     Instruction_aarch64::spatch(branch, 18, 5, offset);
  91   } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
  92     // PC-rel. addressing
  93     offset = target-branch;
  94     int shift = Instruction_aarch64::extract(insn, 31, 31);
  95     if (shift) {
  96       u_int64_t dest = (u_int64_t)target;
  97       uint64_t pc_page = (uint64_t)branch >> 12;
  98       uint64_t adr_page = (uint64_t)target >> 12;
  99       unsigned offset_lo = dest & 0xfff;
 100       offset = adr_page - pc_page;
 101 
 102       // We handle 4 types of PC relative addressing
 103       //   1 - adrp    Rx, target_page
 104       //       ldr/str Ry, [Rx, #offset_in_page]
 105       //   2 - adrp    Rx, target_page
 106       //       add     Ry, Rx, #offset_in_page
 107       //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 108       //       movk    Rx, #imm16<<32
 109       //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 110       // In the first 3 cases we must check that Rx is the same in the adrp and the
 111       // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
 112       // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
 113       // to be followed by a random unrelated ldr/str, add or movk instruction.
 114       //
 115       unsigned insn2 = ((unsigned*)branch)[1];
 116       if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
 117                 Instruction_aarch64::extract(insn, 4, 0) ==
 118                         Instruction_aarch64::extract(insn2, 9, 5)) {
 119         // Load/store register (unsigned immediate)
 120         unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
 121         Instruction_aarch64::patch(branch + sizeof (unsigned),
 122                                     21, 10, offset_lo >> size);
 123         guarantee(((dest >> size) << size) == dest, "misaligned target");
 124         instructions = 2;
 125       } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
 126                 Instruction_aarch64::extract(insn, 4, 0) ==
 127                         Instruction_aarch64::extract(insn2, 4, 0)) {
 128         // add (immediate)
 129         Instruction_aarch64::patch(branch + sizeof (unsigned),
 130                                    21, 10, offset_lo);
 131         instructions = 2;
 132       } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
 133                    Instruction_aarch64::extract(insn, 4, 0) ==
 134                      Instruction_aarch64::extract(insn2, 4, 0)) {
 135         // movk #imm16<<32
 136         Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
 137         long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
 138         long pc_page = (long)branch >> 12;
 139         long adr_page = (long)dest >> 12;
 140         offset = adr_page - pc_page;
 141         instructions = 2;
 142       }
 143     }
 144     int offset_lo = offset & 3;
 145     offset >>= 2;
 146     Instruction_aarch64::spatch(branch, 23, 5, offset);
 147     Instruction_aarch64::patch(branch, 30, 29, offset_lo);
 148   } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
 149     u_int64_t dest = (u_int64_t)target;
 150     // Move wide constant
 151     assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
 152     assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
 153     Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
 154     Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
 155     Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
 156     assert(target_addr_for_insn(branch) == target, "should be");
 157     instructions = 3;
 158   } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
 159              Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
 160     // nothing to do
 161     assert(target == 0, "did not expect to relocate target for polling page load");
 162   } else {
 163     ShouldNotReachHere();
 164   }
 165   return instructions * NativeInstruction::instruction_size;
 166 }
 167 
 168 int MacroAssembler::patch_oop(address insn_addr, address o) {
 169   int instructions;
 170   unsigned insn = *(unsigned*)insn_addr;
 171   assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
 172 
 173   // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
 174   // narrow OOPs by setting the upper 16 bits in the first
 175   // instruction.
 176   if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
 177     // Move narrow OOP
 178     narrowOop n = CompressedOops::encode((oop)o);
 179     Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
 180     Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
 181     instructions = 2;
 182   } else {
 183     // Move wide OOP
 184     assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
 185     uintptr_t dest = (uintptr_t)o;
 186     Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
 187     Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
 188     Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
 189     instructions = 3;
 190   }
 191   return instructions * NativeInstruction::instruction_size;
 192 }
 193 
 194 int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
 195   // Metatdata pointers are either narrow (32 bits) or wide (48 bits).
 196   // We encode narrow ones by setting the upper 16 bits in the first
 197   // instruction.
 198   NativeInstruction *insn = nativeInstruction_at(insn_addr);
 199   assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
 200          nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
 201 
 202   Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
 203   Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
 204   return 2 * NativeInstruction::instruction_size;
 205 }
 206 
 207 address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
 208   long offset = 0;
 209   if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
 210     // Load register (literal)
 211     offset = Instruction_aarch64::sextract(insn, 23, 5);
 212     return address(((uint64_t)insn_addr + (offset << 2)));
 213   } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
 214     // Unconditional branch (immediate)
 215     offset = Instruction_aarch64::sextract(insn, 25, 0);
 216   } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
 217     // Conditional branch (immediate)
 218     offset = Instruction_aarch64::sextract(insn, 23, 5);
 219   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
 220     // Compare & branch (immediate)
 221     offset = Instruction_aarch64::sextract(insn, 23, 5);
 222    } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
 223     // Test & branch (immediate)
 224     offset = Instruction_aarch64::sextract(insn, 18, 5);
 225   } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
 226     // PC-rel. addressing
 227     offset = Instruction_aarch64::extract(insn, 30, 29);
 228     offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
 229     int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
 230     if (shift) {
 231       offset <<= shift;
 232       uint64_t target_page = ((uint64_t)insn_addr) + offset;
 233       target_page &= ((uint64_t)-1) << shift;
 234       // Return the target address for the following sequences
 235       //   1 - adrp    Rx, target_page
 236       //       ldr/str Ry, [Rx, #offset_in_page]
 237       //   2 - adrp    Rx, target_page
 238       //       add     Ry, Rx, #offset_in_page
 239       //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 240       //       movk    Rx, #imm12<<32
 241       //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 242       //
 243       // In the first two cases  we check that the register is the same and
 244       // return the target_page + the offset within the page.
 245       // Otherwise we assume it is a page aligned relocation and return
 246       // the target page only.
 247       //
 248       unsigned insn2 = ((unsigned*)insn_addr)[1];
 249       if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
 250                 Instruction_aarch64::extract(insn, 4, 0) ==
 251                         Instruction_aarch64::extract(insn2, 9, 5)) {
 252         // Load/store register (unsigned immediate)
 253         unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
 254         unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
 255         return address(target_page + (byte_offset << size));
 256       } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
 257                 Instruction_aarch64::extract(insn, 4, 0) ==
 258                         Instruction_aarch64::extract(insn2, 4, 0)) {
 259         // add (immediate)
 260         unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
 261         return address(target_page + byte_offset);
 262       } else {
 263         if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110  &&
 264                Instruction_aarch64::extract(insn, 4, 0) ==
 265                  Instruction_aarch64::extract(insn2, 4, 0)) {
 266           target_page = (target_page & 0xffffffff) |
 267                          ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
 268         }
 269         return (address)target_page;
 270       }
 271     } else {
 272       ShouldNotReachHere();
 273     }
 274   } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
 275     u_int32_t *insns = (u_int32_t *)insn_addr;
 276     // Move wide constant: movz, movk, movk.  See movptr().
 277     assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
 278     assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
 279     return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
 280                    + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
 281                    + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
 282   } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
 283              Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
 284     return 0;
 285   } else {
 286     ShouldNotReachHere();
 287   }
 288   return address(((uint64_t)insn_addr + (offset << 2)));
 289 }
 290 
 291 void MacroAssembler::safepoint_poll(Label& slow_path) {
 292   if (SafepointMechanism::uses_thread_local_poll()) {
 293     ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
 294     tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
 295   } else {
 296     unsigned long offset;
 297     adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
 298     ldrw(rscratch1, Address(rscratch1, offset));
 299     assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
 300     cbnz(rscratch1, slow_path);
 301   }
 302 }
 303 
 304 // Just like safepoint_poll, but use an acquiring load for thread-
 305 // local polling.
 306 //
 307 // We need an acquire here to ensure that any subsequent load of the
 308 // global SafepointSynchronize::_state flag is ordered after this load
 309 // of the local Thread::_polling page.  We don't want this poll to
 310 // return false (i.e. not safepointing) and a later poll of the global
 311 // SafepointSynchronize::_state spuriously to return true.
 312 //
 313 // This is to avoid a race when we're in a native->Java transition
 314 // racing the code which wakes up from a safepoint.
 315 //
 316 void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
 317   if (SafepointMechanism::uses_thread_local_poll()) {
 318     lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
 319     ldar(rscratch1, rscratch1);
 320     tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
 321   } else {
 322     safepoint_poll(slow_path);
 323   }
 324 }
 325 
 326 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 327   // we must set sp to zero to clear frame
 328   str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));
 329 
 330   // must clear fp, so that compiled frames are not confused; it is
 331   // possible that we need it only for debugging
 332   if (clear_fp) {
 333     str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
 334   }
 335 
 336   // Always clear the pc because it could have been set by make_walkable()
 337   str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
 338 }
 339 
 340 // Calls to C land
 341 //
 342 // When entering C land, the rfp, & resp of the last Java frame have to be recorded
 343 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
 344 // has to be reset to 0. This is required to allow proper stack traversal.
 345 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 346                                          Register last_java_fp,
 347                                          Register last_java_pc,
 348                                          Register scratch) {
 349 
 350   if (last_java_pc->is_valid()) {
 351       str(last_java_pc, Address(rthread,
 352                                 JavaThread::frame_anchor_offset()
 353                                 + JavaFrameAnchor::last_Java_pc_offset()));
 354     }
 355 
 356   // determine last_java_sp register
 357   if (last_java_sp == sp) {
 358     mov(scratch, sp);
 359     last_java_sp = scratch;
 360   } else if (!last_java_sp->is_valid()) {
 361     last_java_sp = esp;
 362   }
 363 
 364   str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));
 365 
 366   // last_java_fp is optional
 367   if (last_java_fp->is_valid()) {
 368     str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
 369   }
 370 }
 371 
 372 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 373                                          Register last_java_fp,
 374                                          address  last_java_pc,
 375                                          Register scratch) {
 376   assert(last_java_pc != NULL, "must provide a valid PC");
 377 
 378   adr(scratch, last_java_pc);
 379   str(scratch, Address(rthread,
 380                        JavaThread::frame_anchor_offset()
 381                        + JavaFrameAnchor::last_Java_pc_offset()));
 382 
 383   set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
 384 }
 385 
 386 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 387                                          Register last_java_fp,
 388                                          Label &L,
 389                                          Register scratch) {
 390   if (L.is_bound()) {
 391     set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
 392   } else {
 393     InstructionMark im(this);
 394     L.add_patch_at(code(), locator());
 395     set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
 396   }
 397 }
 398 
 399 void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
 400   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
 401   assert(CodeCache::find_blob(entry.target()) != NULL,
 402          "destination of far call not found in code cache");
 403   if (far_branches()) {
 404     unsigned long offset;
 405     // We can use ADRP here because we know that the total size of
 406     // the code cache cannot exceed 2Gb.
 407     adrp(tmp, entry, offset);
 408     add(tmp, tmp, offset);
 409     if (cbuf) cbuf->set_insts_mark();
 410     blr(tmp);
 411   } else {
 412     if (cbuf) cbuf->set_insts_mark();
 413     bl(entry);
 414   }
 415 }
 416 
 417 void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
 418   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
 419   assert(CodeCache::find_blob(entry.target()) != NULL,
 420          "destination of far call not found in code cache");
 421   if (far_branches()) {
 422     unsigned long offset;
 423     // We can use ADRP here because we know that the total size of
 424     // the code cache cannot exceed 2Gb.
 425     adrp(tmp, entry, offset);
 426     add(tmp, tmp, offset);
 427     if (cbuf) cbuf->set_insts_mark();
 428     br(tmp);
 429   } else {
 430     if (cbuf) cbuf->set_insts_mark();
 431     b(entry);
 432   }
 433 }
 434 
 435 void MacroAssembler::reserved_stack_check() {
 436     // testing if reserved zone needs to be enabled
 437     Label no_reserved_zone_enabling;
 438 
 439     ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
 440     cmp(sp, rscratch1);
 441     br(Assembler::LO, no_reserved_zone_enabling);
 442 
 443     enter();   // LR and FP are live.
 444     lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
 445     mov(c_rarg0, rthread);
 446     blr(rscratch1);
 447     leave();
 448 
 449     // We have already removed our own frame.
 450     // throw_delayed_StackOverflowError will think that it's been
 451     // called by our caller.
 452     lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
 453     br(rscratch1);
 454     should_not_reach_here();
 455 
 456     bind(no_reserved_zone_enabling);
 457 }
 458 
 459 int MacroAssembler::biased_locking_enter(Register lock_reg,
 460                                          Register obj_reg,
 461                                          Register swap_reg,
 462                                          Register tmp_reg,
 463                                          bool swap_reg_contains_mark,
 464                                          Label& done,
 465                                          Label* slow_case,
 466                                          BiasedLockingCounters* counters) {
 467   assert(UseBiasedLocking, "why call this otherwise?");
 468   assert_different_registers(lock_reg, obj_reg, swap_reg);
 469 
 470   if (PrintBiasedLockingStatistics && counters == NULL)
 471     counters = BiasedLocking::counters();
 472 
 473   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
 474   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
 475   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
 476   Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
 477   Address saved_mark_addr(lock_reg, 0);
 478 
 479   // Biased locking
 480   // See whether the lock is currently biased toward our thread and
 481   // whether the epoch is still valid
 482   // Note that the runtime guarantees sufficient alignment of JavaThread
 483   // pointers to allow age to be placed into low bits
 484   // First check to see whether biasing is even enabled for this object
 485   Label cas_label;
 486   int null_check_offset = -1;
 487   if (!swap_reg_contains_mark) {
 488     null_check_offset = offset();
 489     ldr(swap_reg, mark_addr);
 490   }
 491   andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
 492   cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern);
 493   br(Assembler::NE, cas_label);
 494   // The bias pattern is present in the object's header. Need to check
 495   // whether the bias owner and the epoch are both still current.
 496   load_prototype_header(tmp_reg, obj_reg);
 497   orr(tmp_reg, tmp_reg, rthread);
 498   eor(tmp_reg, swap_reg, tmp_reg);
 499   andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
 500   if (counters != NULL) {
 501     Label around;
 502     cbnz(tmp_reg, around);
 503     atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
 504     b(done);
 505     bind(around);
 506   } else {
 507     cbz(tmp_reg, done);
 508   }
 509 
 510   Label try_revoke_bias;
 511   Label try_rebias;
 512 
 513   // At this point we know that the header has the bias pattern and
 514   // that we are not the bias owner in the current epoch. We need to
 515   // figure out more details about the state of the header in order to
 516   // know what operations can be legally performed on the object's
 517   // header.
 518 
 519   // If the low three bits in the xor result aren't clear, that means
 520   // the prototype header is no longer biased and we have to revoke
 521   // the bias on this object.
 522   andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
 523   cbnz(rscratch1, try_revoke_bias);
 524 
 525   // Biasing is still enabled for this data type. See whether the
 526   // epoch of the current bias is still valid, meaning that the epoch
 527   // bits of the mark word are equal to the epoch bits of the
 528   // prototype header. (Note that the prototype header's epoch bits
 529   // only change at a safepoint.) If not, attempt to rebias the object
 530   // toward the current thread. Note that we must be absolutely sure
 531   // that the current epoch is invalid in order to do this because
 532   // otherwise the manipulations it performs on the mark word are
 533   // illegal.
 534   andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
 535   cbnz(rscratch1, try_rebias);
 536 
 537   // The epoch of the current bias is still valid but we know nothing
 538   // about the owner; it might be set or it might be clear. Try to
 539   // acquire the bias of the object using an atomic operation. If this
 540   // fails we will go in to the runtime to revoke the object's bias.
 541   // Note that we first construct the presumed unbiased header so we
 542   // don't accidentally blow away another thread's valid bias.
 543   {
 544     Label here;
 545     mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
 546     andr(swap_reg, swap_reg, rscratch1);
 547     orr(tmp_reg, swap_reg, rthread);
 548     cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
 549     // If the biasing toward our thread failed, this means that
 550     // another thread succeeded in biasing it toward itself and we
 551     // need to revoke that bias. The revocation will occur in the
 552     // interpreter runtime in the slow case.
 553     bind(here);
 554     if (counters != NULL) {
 555       atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
 556                   tmp_reg, rscratch1, rscratch2);
 557     }
 558   }
 559   b(done);
 560 
 561   bind(try_rebias);
 562   // At this point we know the epoch has expired, meaning that the
 563   // current "bias owner", if any, is actually invalid. Under these
 564   // circumstances _only_, we are allowed to use the current header's
 565   // value as the comparison value when doing the cas to acquire the
 566   // bias in the current epoch. In other words, we allow transfer of
 567   // the bias from one thread to another directly in this situation.
 568   //
 569   // FIXME: due to a lack of registers we currently blow away the age
 570   // bits in this situation. Should attempt to preserve them.
 571   {
 572     Label here;
 573     load_prototype_header(tmp_reg, obj_reg);
 574     orr(tmp_reg, rthread, tmp_reg);
 575     cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
 576     // If the biasing toward our thread failed, then another thread
 577     // succeeded in biasing it toward itself and we need to revoke that
 578     // bias. The revocation will occur in the runtime in the slow case.
 579     bind(here);
 580     if (counters != NULL) {
 581       atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
 582                   tmp_reg, rscratch1, rscratch2);
 583     }
 584   }
 585   b(done);
 586 
 587   bind(try_revoke_bias);
 588   // The prototype mark in the klass doesn't have the bias bit set any
 589   // more, indicating that objects of this data type are not supposed
 590   // to be biased any more. We are going to try to reset the mark of
 591   // this object to the prototype value and fall through to the
 592   // CAS-based locking scheme. Note that if our CAS fails, it means
 593   // that another thread raced us for the privilege of revoking the
 594   // bias of this particular object, so it's okay to continue in the
 595   // normal locking code.
 596   //
 597   // FIXME: due to a lack of registers we currently blow away the age
 598   // bits in this situation. Should attempt to preserve them.
 599   {
 600     Label here, nope;
 601     load_prototype_header(tmp_reg, obj_reg);
 602     cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
 603     bind(here);
 604 
 605     // Fall through to the normal CAS-based lock, because no matter what
 606     // the result of the above CAS, some thread must have succeeded in
 607     // removing the bias bit from the object's header.
 608     if (counters != NULL) {
 609       atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
 610                   rscratch1, rscratch2);
 611     }
 612     bind(nope);
 613   }
 614 
 615   bind(cas_label);
 616 
 617   return null_check_offset;
 618 }
 619 
 620 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
 621   assert(UseBiasedLocking, "why call this otherwise?");
 622 
 623   // Check for biased locking unlock case, which is a no-op
 624   // Note: we do not have to check the thread ID for two reasons.
 625   // First, the interpreter checks for IllegalMonitorStateException at
 626   // a higher level. Second, if the bias was revoked while we held the
 627   // lock, the object could not be rebiased toward another thread, so
 628   // the bias bit would be clear.
 629   ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
 630   andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
 631   cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern);
 632   br(Assembler::EQ, done);
 633 }
 634 
 635 static void pass_arg0(MacroAssembler* masm, Register arg) {
 636   if (c_rarg0 != arg ) {
 637     masm->mov(c_rarg0, arg);
 638   }
 639 }
 640 
 641 static void pass_arg1(MacroAssembler* masm, Register arg) {
 642   if (c_rarg1 != arg ) {
 643     masm->mov(c_rarg1, arg);
 644   }
 645 }
 646 
 647 static void pass_arg2(MacroAssembler* masm, Register arg) {
 648   if (c_rarg2 != arg ) {
 649     masm->mov(c_rarg2, arg);
 650   }
 651 }
 652 
 653 static void pass_arg3(MacroAssembler* masm, Register arg) {
 654   if (c_rarg3 != arg ) {
 655     masm->mov(c_rarg3, arg);
 656   }
 657 }
 658 
 659 void MacroAssembler::call_VM_base(Register oop_result,
 660                                   Register java_thread,
 661                                   Register last_java_sp,
 662                                   address  entry_point,
 663                                   int      number_of_arguments,
 664                                   bool     check_exceptions) {
 665    // determine java_thread register
 666   if (!java_thread->is_valid()) {
 667     java_thread = rthread;
 668   }
 669 
 670   // determine last_java_sp register
 671   if (!last_java_sp->is_valid()) {
 672     last_java_sp = esp;
 673   }
 674 
 675   // debugging support
 676   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
 677   assert(java_thread == rthread, "unexpected register");
 678 #ifdef ASSERT
 679   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
 680   // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
 681 #endif // ASSERT
 682 
 683   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
 684   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
 685 
 686   // push java thread (becomes first argument of C function)
 687 
 688   mov(c_rarg0, java_thread);
 689 
 690   // set last Java frame before call
 691   assert(last_java_sp != rfp, "can't use rfp");
 692 
 693   Label l;
 694   set_last_Java_frame(last_java_sp, rfp, l, rscratch1);
 695 
 696   // do the call, remove parameters
 697   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
 698 
 699   // reset last Java frame
 700   // Only interpreter should have to clear fp
 701   reset_last_Java_frame(true);
 702 
 703    // C++ interp handles this in the interpreter
 704   check_and_handle_popframe(java_thread);
 705   check_and_handle_earlyret(java_thread);
 706 
 707   if (check_exceptions) {
 708     // check for pending exceptions (java_thread is set upon return)
 709     ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
 710     Label ok;
 711     cbz(rscratch1, ok);
 712     lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
 713     br(rscratch1);
 714     bind(ok);
 715   }
 716 
 717   // get oop result if there is one and reset the value in the thread
 718   if (oop_result->is_valid()) {
 719     get_vm_result(oop_result, java_thread);
 720   }
 721 }
 722 
 723 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
 724   call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
 725 }
 726 
 727 // Maybe emit a call via a trampoline.  If the code cache is small
 728 // trampolines won't be emitted.
 729 
 730 address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
 731   assert(JavaThread::current()->is_Compiler_thread(), "just checking");
 732   assert(entry.rspec().type() == relocInfo::runtime_call_type
 733          || entry.rspec().type() == relocInfo::opt_virtual_call_type
 734          || entry.rspec().type() == relocInfo::static_call_type
 735          || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
 736 
 737   // We need a trampoline if branches are far.
 738   if (far_branches()) {
 739     bool in_scratch_emit_size = false;
 740 #ifdef COMPILER2
 741     // We don't want to emit a trampoline if C2 is generating dummy
 742     // code during its branch shortening phase.
 743     CompileTask* task = ciEnv::current()->task();
 744     in_scratch_emit_size =
 745       (task != NULL && is_c2_compile(task->comp_level()) &&
 746        Compile::current()->in_scratch_emit_size());
 747 #endif
 748     if (!in_scratch_emit_size) {
 749       address stub = emit_trampoline_stub(offset(), entry.target());
 750       if (stub == NULL) {
 751         return NULL; // CodeCache is full
 752       }
 753     }
 754   }
 755 
 756   if (cbuf) cbuf->set_insts_mark();
 757   relocate(entry.rspec());
 758   if (!far_branches()) {
 759     bl(entry.target());
 760   } else {
 761     bl(pc());
 762   }
 763   // just need to return a non-null address
 764   return pc();
 765 }
 766 
 767 
 768 // Emit a trampoline stub for a call to a target which is too far away.
 769 //
 770 // code sequences:
 771 //
 772 // call-site:
 773 //   branch-and-link to <destination> or <trampoline stub>
 774 //
 775 // Related trampoline stub for this call site in the stub section:
 776 //   load the call target from the constant pool
 777 //   branch (LR still points to the call site above)
 778 
 779 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
 780                                              address dest) {
 781   // Max stub size: alignment nop, TrampolineStub.
 782   address stub = start_a_stub(NativeInstruction::instruction_size
 783                    + NativeCallTrampolineStub::instruction_size);
 784   if (stub == NULL) {
 785     return NULL;  // CodeBuffer::expand failed
 786   }
 787 
 788   // Create a trampoline stub relocation which relates this trampoline stub
 789   // with the call instruction at insts_call_instruction_offset in the
 790   // instructions code-section.
 791   align(wordSize);
 792   relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
 793                                             + insts_call_instruction_offset));
 794   const int stub_start_offset = offset();
 795 
 796   // Now, create the trampoline stub's code:
 797   // - load the call
 798   // - call
 799   Label target;
 800   ldr(rscratch1, target);
 801   br(rscratch1);
 802   bind(target);
 803   assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
 804          "should be");
 805   emit_int64((int64_t)dest);
 806 
 807   const address stub_start_addr = addr_at(stub_start_offset);
 808 
 809   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
 810 
 811   end_a_stub();
 812   return stub_start_addr;
 813 }
 814 
 815 void MacroAssembler::c2bool(Register x) {
 816   // implements x == 0 ? 0 : 1
 817   // note: must only look at least-significant byte of x
 818   //       since C-style booleans are stored in one byte
 819   //       only! (was bug)
 820   tst(x, 0xff);
 821   cset(x, Assembler::NE);
 822 }
 823 
 824 address MacroAssembler::ic_call(address entry, jint method_index) {
 825   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
 826   // address const_ptr = long_constant((jlong)Universe::non_oop_word());
 827   // unsigned long offset;
 828   // ldr_constant(rscratch2, const_ptr);
 829   movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
 830   return trampoline_call(Address(entry, rh));
 831 }
 832 
 833 // Implementation of call_VM versions
 834 
 835 void MacroAssembler::call_VM(Register oop_result,
 836                              address entry_point,
 837                              bool check_exceptions) {
 838   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
 839 }
 840 
 841 void MacroAssembler::call_VM(Register oop_result,
 842                              address entry_point,
 843                              Register arg_1,
 844                              bool check_exceptions) {
 845   pass_arg1(this, arg_1);
 846   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
 847 }
 848 
 849 void MacroAssembler::call_VM(Register oop_result,
 850                              address entry_point,
 851                              Register arg_1,
 852                              Register arg_2,
 853                              bool check_exceptions) {
 854   assert(arg_1 != c_rarg2, "smashed arg");
 855   pass_arg2(this, arg_2);
 856   pass_arg1(this, arg_1);
 857   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
 858 }
 859 
 860 void MacroAssembler::call_VM(Register oop_result,
 861                              address entry_point,
 862                              Register arg_1,
 863                              Register arg_2,
 864                              Register arg_3,
 865                              bool check_exceptions) {
 866   assert(arg_1 != c_rarg3, "smashed arg");
 867   assert(arg_2 != c_rarg3, "smashed arg");
 868   pass_arg3(this, arg_3);
 869 
 870   assert(arg_1 != c_rarg2, "smashed arg");
 871   pass_arg2(this, arg_2);
 872 
 873   pass_arg1(this, arg_1);
 874   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
 875 }
 876 
 877 void MacroAssembler::call_VM(Register oop_result,
 878                              Register last_java_sp,
 879                              address entry_point,
 880                              int number_of_arguments,
 881                              bool check_exceptions) {
 882   call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
 883 }
 884 
 885 void MacroAssembler::call_VM(Register oop_result,
 886                              Register last_java_sp,
 887                              address entry_point,
 888                              Register arg_1,
 889                              bool check_exceptions) {
 890   pass_arg1(this, arg_1);
 891   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
 892 }
 893 
 894 void MacroAssembler::call_VM(Register oop_result,
 895                              Register last_java_sp,
 896                              address entry_point,
 897                              Register arg_1,
 898                              Register arg_2,
 899                              bool check_exceptions) {
 900 
 901   assert(arg_1 != c_rarg2, "smashed arg");
 902   pass_arg2(this, arg_2);
 903   pass_arg1(this, arg_1);
 904   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
 905 }
 906 
 907 void MacroAssembler::call_VM(Register oop_result,
 908                              Register last_java_sp,
 909                              address entry_point,
 910                              Register arg_1,
 911                              Register arg_2,
 912                              Register arg_3,
 913                              bool check_exceptions) {
 914   assert(arg_1 != c_rarg3, "smashed arg");
 915   assert(arg_2 != c_rarg3, "smashed arg");
 916   pass_arg3(this, arg_3);
 917   assert(arg_1 != c_rarg2, "smashed arg");
 918   pass_arg2(this, arg_2);
 919   pass_arg1(this, arg_1);
 920   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
 921 }
 922 
 923 
 924 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
 925   ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
 926   str(zr, Address(java_thread, JavaThread::vm_result_offset()));
 927   verify_oop(oop_result, "broken oop in call_VM_base");
 928 }
 929 
 930 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
 931   ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
 932   str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
 933 }
 934 
 935 void MacroAssembler::align(int modulus) {
 936   while (offset() % modulus != 0) nop();
 937 }
 938 
 939 // these are no-ops overridden by InterpreterMacroAssembler
 940 
 941 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }
 942 
 943 void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
 944 
 945 
 946 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 947                                                       Register tmp,
 948                                                       int offset) {
 949   intptr_t value = *delayed_value_addr;
 950   if (value != 0)
 951     return RegisterOrConstant(value + offset);
 952 
 953   // load indirectly to solve generation ordering problem
 954   ldr(tmp, ExternalAddress((address) delayed_value_addr));
 955 
 956   if (offset != 0)
 957     add(tmp, tmp, offset);
 958 
 959   return RegisterOrConstant(tmp);
 960 }
 961 
 962 
 963 void MacroAssembler:: notify(int type) {
 964   if (type == bytecode_start) {
 965     // set_last_Java_frame(esp, rfp, (address)NULL);
 966     Assembler:: notify(type);
 967     // reset_last_Java_frame(true);
 968   }
 969   else
 970     Assembler:: notify(type);
 971 }
 972 
 973 // Look up the method for a megamorphic invokeinterface call.
 974 // The target method is determined by <intf_klass, itable_index>.
 975 // The receiver klass is in recv_klass.
 976 // On success, the result will be in method_result, and execution falls through.
 977 // On failure, execution transfers to the given label.
 978 void MacroAssembler::lookup_interface_method(Register recv_klass,
 979                                              Register intf_klass,
 980                                              RegisterOrConstant itable_index,
 981                                              Register method_result,
 982                                              Register scan_temp,
 983                                              Label& L_no_such_interface,
 984                          bool return_method) {
 985   assert_different_registers(recv_klass, intf_klass, scan_temp);
 986   assert_different_registers(method_result, intf_klass, scan_temp);
 987   assert(recv_klass != method_result || !return_method,
 988      "recv_klass can be destroyed when method isn't needed");
 989   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
 990          "caller must use same register for non-constant itable index as for method");
 991 
 992   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
 993   int vtable_base = in_bytes(Klass::vtable_start_offset());
 994   int itentry_off = itableMethodEntry::method_offset_in_bytes();
 995   int scan_step   = itableOffsetEntry::size() * wordSize;
 996   int vte_size    = vtableEntry::size_in_bytes();
 997   assert(vte_size == wordSize, "else adjust times_vte_scale");
 998 
 999   ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
1000 
1001   // %%% Could store the aligned, prescaled offset in the klassoop.
1002   // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
1003   lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
1004   add(scan_temp, scan_temp, vtable_base);
1005 
1006   if (return_method) {
1007     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1008     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1009     // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
1010     lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
1011     if (itentry_off)
1012       add(recv_klass, recv_klass, itentry_off);
1013   }
1014 
1015   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1016   //   if (scan->interface() == intf) {
1017   //     result = (klass + scan->offset() + itable_index);
1018   //   }
1019   // }
1020   Label search, found_method;
1021 
1022   for (int peel = 1; peel >= 0; peel--) {
1023     ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
1024     cmp(intf_klass, method_result);
1025 
1026     if (peel) {
1027       br(Assembler::EQ, found_method);
1028     } else {
1029       br(Assembler::NE, search);
1030       // (invert the test to fall through to found_method...)
1031     }
1032 
1033     if (!peel)  break;
1034 
1035     bind(search);
1036 
1037     // Check that the previous entry is non-null.  A null entry means that
1038     // the receiver class doesn't implement the interface, and wasn't the
1039     // same as when the caller was compiled.
1040     cbz(method_result, L_no_such_interface);
1041     add(scan_temp, scan_temp, scan_step);
1042   }
1043 
1044   bind(found_method);
1045 
1046   // Got a hit.
1047   if (return_method) {
1048     ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
1049     ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
1050   }
1051 }
1052 
1053 // virtual method calling
1054 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1055                                            RegisterOrConstant vtable_index,
1056                                            Register method_result) {
1057   const int base = in_bytes(Klass::vtable_start_offset());
1058   assert(vtableEntry::size() * wordSize == 8,
1059          "adjust the scaling in the code below");
1060   int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();
1061 
1062   if (vtable_index.is_register()) {
1063     lea(method_result, Address(recv_klass,
1064                                vtable_index.as_register(),
1065                                Address::lsl(LogBytesPerWord)));
1066     ldr(method_result, Address(method_result, vtable_offset_in_bytes));
1067   } else {
1068     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
1069     ldr(method_result,
1070         form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
1071   }
1072 }
1073 
1074 void MacroAssembler::check_klass_subtype(Register sub_klass,
1075                            Register super_klass,
1076                            Register temp_reg,
1077                            Label& L_success) {
1078   Label L_failure;
1079   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
1080   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
1081   bind(L_failure);
1082 }
1083 
1084 
1085 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1086                                                    Register super_klass,
1087                                                    Register temp_reg,
1088                                                    Label* L_success,
1089                                                    Label* L_failure,
1090                                                    Label* L_slow_path,
1091                                         RegisterOrConstant super_check_offset) {
1092   assert_different_registers(sub_klass, super_klass, temp_reg);
1093   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1094   if (super_check_offset.is_register()) {
1095     assert_different_registers(sub_klass, super_klass,
1096                                super_check_offset.as_register());
1097   } else if (must_load_sco) {
1098     assert(temp_reg != noreg, "supply either a temp or a register offset");
1099   }
1100 
1101   Label L_fallthrough;
1102   int label_nulls = 0;
1103   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1104   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1105   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1106   assert(label_nulls <= 1, "at most one NULL in the batch");
1107 
1108   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1109   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1110   Address super_check_offset_addr(super_klass, sco_offset);
1111 
1112   // Hacked jmp, which may only be used just before L_fallthrough.
1113 #define final_jmp(label)                                                \
1114   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
1115   else                            b(label)                /*omit semi*/
1116 
1117   // If the pointers are equal, we are done (e.g., String[] elements).
1118   // This self-check enables sharing of secondary supertype arrays among
1119   // non-primary types such as array-of-interface.  Otherwise, each such
1120   // type would need its own customized SSA.
1121   // We move this check to the front of the fast path because many
1122   // type checks are in fact trivially successful in this manner,
1123   // so we get a nicely predicted branch right at the start of the check.
1124   cmp(sub_klass, super_klass);
1125   br(Assembler::EQ, *L_success);
1126 
1127   // Check the supertype display:
1128   if (must_load_sco) {
1129     ldrw(temp_reg, super_check_offset_addr);
1130     super_check_offset = RegisterOrConstant(temp_reg);
1131   }
1132   Address super_check_addr(sub_klass, super_check_offset);
1133   ldr(rscratch1, super_check_addr);
1134   cmp(super_klass, rscratch1); // load displayed supertype
1135 
1136   // This check has worked decisively for primary supers.
1137   // Secondary supers are sought in the super_cache ('super_cache_addr').
1138   // (Secondary supers are interfaces and very deeply nested subtypes.)
1139   // This works in the same check above because of a tricky aliasing
1140   // between the super_cache and the primary super display elements.
1141   // (The 'super_check_addr' can address either, as the case requires.)
1142   // Note that the cache is updated below if it does not help us find
1143   // what we need immediately.
1144   // So if it was a primary super, we can just fail immediately.
1145   // Otherwise, it's the slow path for us (no success at this point).
1146 
1147   if (super_check_offset.is_register()) {
1148     br(Assembler::EQ, *L_success);
1149     subs(zr, super_check_offset.as_register(), sc_offset);
1150     if (L_failure == &L_fallthrough) {
1151       br(Assembler::EQ, *L_slow_path);
1152     } else {
1153       br(Assembler::NE, *L_failure);
1154       final_jmp(*L_slow_path);
1155     }
1156   } else if (super_check_offset.as_constant() == sc_offset) {
1157     // Need a slow path; fast failure is impossible.
1158     if (L_slow_path == &L_fallthrough) {
1159       br(Assembler::EQ, *L_success);
1160     } else {
1161       br(Assembler::NE, *L_slow_path);
1162       final_jmp(*L_success);
1163     }
1164   } else {
1165     // No slow path; it's a fast decision.
1166     if (L_failure == &L_fallthrough) {
1167       br(Assembler::EQ, *L_success);
1168     } else {
1169       br(Assembler::NE, *L_failure);
1170       final_jmp(*L_success);
1171     }
1172   }
1173 
1174   bind(L_fallthrough);
1175 
1176 #undef final_jmp
1177 }
1178 
1179 // These two are taken from x86, but they look generally useful
1180 
1181 // scans count pointer sized words at [addr] for occurence of value,
1182 // generic
1183 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
1184                                 Register scratch) {
1185   Label Lloop, Lexit;
1186   cbz(count, Lexit);
1187   bind(Lloop);
1188   ldr(scratch, post(addr, wordSize));
1189   cmp(value, scratch);
1190   br(EQ, Lexit);
1191   sub(count, count, 1);
1192   cbnz(count, Lloop);
1193   bind(Lexit);
1194 }
1195 
1196 // scans count 4 byte words at [addr] for occurence of value,
1197 // generic
1198 void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
1199                                 Register scratch) {
1200   Label Lloop, Lexit;
1201   cbz(count, Lexit);
1202   bind(Lloop);
1203   ldrw(scratch, post(addr, wordSize));
1204   cmpw(value, scratch);
1205   br(EQ, Lexit);
1206   sub(count, count, 1);
1207   cbnz(count, Lloop);
1208   bind(Lexit);
1209 }
1210 
1211 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1212                                                    Register super_klass,
1213                                                    Register temp_reg,
1214                                                    Register temp2_reg,
1215                                                    Label* L_success,
1216                                                    Label* L_failure,
1217                                                    bool set_cond_codes) {
1218   assert_different_registers(sub_klass, super_klass, temp_reg);
1219   if (temp2_reg != noreg)
1220     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1221 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1222 
1223   Label L_fallthrough;
1224   int label_nulls = 0;
1225   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1226   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1227   assert(label_nulls <= 1, "at most one NULL in the batch");
1228 
1229   // a couple of useful fields in sub_klass:
1230   int ss_offset = in_bytes(Klass::secondary_supers_offset());
1231   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1232   Address secondary_supers_addr(sub_klass, ss_offset);
1233   Address super_cache_addr(     sub_klass, sc_offset);
1234 
1235   BLOCK_COMMENT("check_klass_subtype_slow_path");
1236 
1237   // Do a linear scan of the secondary super-klass chain.
1238   // This code is rarely used, so simplicity is a virtue here.
1239   // The repne_scan instruction uses fixed registers, which we must spill.
1240   // Don't worry too much about pre-existing connections with the input regs.
1241 
1242   assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
1243   assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)
1244 
1245   RegSet pushed_registers;
1246   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1247   if (!IS_A_TEMP(r5))    pushed_registers += r5;
1248 
1249   if (super_klass != r0 || UseCompressedOops) {
1250     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1251   }
1252 
1253   push(pushed_registers, sp);
1254 
1255   // Get super_klass value into r0 (even if it was in r5 or r2).
1256   if (super_klass != r0) {
1257     mov(r0, super_klass);
1258   }
1259 
1260 #ifndef PRODUCT
1261   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1262   Address pst_counter_addr(rscratch2);
1263   ldr(rscratch1, pst_counter_addr);
1264   add(rscratch1, rscratch1, 1);
1265   str(rscratch1, pst_counter_addr);
1266 #endif //PRODUCT
1267 
1268   // We will consult the secondary-super array.
1269   ldr(r5, secondary_supers_addr);
1270   // Load the array length.
1271   ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1272   // Skip to start of data.
1273   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1274 
1275   cmp(sp, zr); // Clear Z flag; SP is never zero
1276   // Scan R2 words at [R5] for an occurrence of R0.
1277   // Set NZ/Z based on last compare.
1278   repne_scan(r5, r0, r2, rscratch1);
1279 
1280   // Unspill the temp. registers:
1281   pop(pushed_registers, sp);
1282 
1283   br(Assembler::NE, *L_failure);
1284 
1285   // Success.  Cache the super we found and proceed in triumph.
1286   str(super_klass, super_cache_addr);
1287 
1288   if (L_success != &L_fallthrough) {
1289     b(*L_success);
1290   }
1291 
1292 #undef IS_A_TEMP
1293 
1294   bind(L_fallthrough);
1295 }
1296 
1297 
1298 void MacroAssembler::verify_oop(Register reg, const char* s) {
1299   if (!VerifyOops || VerifyAdapterSharing) {
    // The address of the code string (below) confuses VerifyAdapterSharing
    // because it may differ between otherwise equivalent adapters.
1302     return;
1303   }
1304 
1305   // Pass register number to verify_oop_subroutine
1306   const char* b = NULL;
1307   {
1308     ResourceMark rm;
1309     stringStream ss;
1310     ss.print("verify_oop: %s: %s", reg->name(), s);
1311     b = code_string(ss.as_string());
1312   }
1313   BLOCK_COMMENT("verify_oop {");
1314 
1315   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1316   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1317 
1318   mov(r0, reg);
1319   mov(rscratch1, (address)b);
1320 
1321   // call indirectly to solve generation ordering problem
1322   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1323   ldr(rscratch2, Address(rscratch2));
1324   blr(rscratch2);
1325 
1326   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1327   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1328 
1329   BLOCK_COMMENT("} verify_oop");
1330 }
1331 
1332 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1333   if (!VerifyOops || VerifyAdapterSharing) {
    // The address of the code string (below) confuses VerifyAdapterSharing
    // because it may differ between otherwise equivalent adapters.
1336     return;
1337   }
1338 
1339   const char* b = NULL;
1340   {
1341     ResourceMark rm;
1342     stringStream ss;
1343     ss.print("verify_oop_addr: %s", s);
1344     b = code_string(ss.as_string());
1345   }
1346   BLOCK_COMMENT("verify_oop_addr {");
1347 
1348   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1349   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1350 
1351   // addr may contain sp so we will have to adjust it based on the
1352   // pushes that we just did.
1353   if (addr.uses(sp)) {
1354     lea(r0, addr);
1355     ldr(r0, Address(r0, 4 * wordSize));
1356   } else {
1357     ldr(r0, addr);
1358   }
1359   mov(rscratch1, (address)b);
1360 
1361   // call indirectly to solve generation ordering problem
1362   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1363   ldr(rscratch2, Address(rscratch2));
1364   blr(rscratch2);
1365 
1366   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1367   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1368 
1369   BLOCK_COMMENT("} verify_oop_addr");
1370 }
1371 
1372 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1373                                          int extra_slot_offset) {
1374   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1375   int stackElementSize = Interpreter::stackElementSize;
1376   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1377 #ifdef ASSERT
1378   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1379   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1380 #endif
1381   if (arg_slot.is_constant()) {
1382     return Address(esp, arg_slot.as_constant() * stackElementSize
1383                    + offset);
1384   } else {
1385     add(rscratch1, esp, arg_slot.as_register(),
1386         ext::uxtx, exact_log2(stackElementSize));
1387     return Address(rscratch1, offset);
1388   }
1389 }
1390 
1391 void MacroAssembler::call_VM_leaf_base(address entry_point,
1392                                        int number_of_arguments,
1393                                        Label *retaddr) {
1394   call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
1395 }
1396 
1397 void MacroAssembler::call_VM_leaf_base1(address entry_point,
1398                                         int number_of_gp_arguments,
1399                                         int number_of_fp_arguments,
1400                                         ret_type type,
1401                                         Label *retaddr) {
1402   Label E, L;
1403 
1404   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1405 
1406   // We add 1 to number_of_arguments because the thread in arg0 is
1407   // not counted
1408   mov(rscratch1, entry_point);
1409   blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1410   if (retaddr)
1411     bind(*retaddr);
1412 
1413   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1414   maybe_isb();
1415 }
1416 
1417 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1418   call_VM_leaf_base(entry_point, number_of_arguments);
1419 }
1420 
1421 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1422   pass_arg0(this, arg_0);
1423   call_VM_leaf_base(entry_point, 1);
1424 }
1425 
1426 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1427   pass_arg0(this, arg_0);
1428   pass_arg1(this, arg_1);
1429   call_VM_leaf_base(entry_point, 2);
1430 }
1431 
1432 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1433                                   Register arg_1, Register arg_2) {
1434   pass_arg0(this, arg_0);
1435   pass_arg1(this, arg_1);
1436   pass_arg2(this, arg_2);
1437   call_VM_leaf_base(entry_point, 3);
1438 }
1439 
1440 void MacroAssembler::super_call_VM_leaf(address entry_point) {
1441   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1442 }
1443 
1444 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1445   pass_arg0(this, arg_0);
1446   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1447 }
1448 
1449 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1450 
1451   assert(arg_0 != c_rarg1, "smashed arg");
1452   pass_arg1(this, arg_1);
1453   pass_arg0(this, arg_0);
1454   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1455 }
1456 
1457 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1458   assert(arg_0 != c_rarg2, "smashed arg");
1459   assert(arg_1 != c_rarg2, "smashed arg");
1460   pass_arg2(this, arg_2);
1461   assert(arg_0 != c_rarg1, "smashed arg");
1462   pass_arg1(this, arg_1);
1463   pass_arg0(this, arg_0);
1464   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1465 }
1466 
1467 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1468   assert(arg_0 != c_rarg3, "smashed arg");
1469   assert(arg_1 != c_rarg3, "smashed arg");
1470   assert(arg_2 != c_rarg3, "smashed arg");
1471   pass_arg3(this, arg_3);
1472   assert(arg_0 != c_rarg2, "smashed arg");
1473   assert(arg_1 != c_rarg2, "smashed arg");
1474   pass_arg2(this, arg_2);
1475   assert(arg_0 != c_rarg1, "smashed arg");
1476   pass_arg1(this, arg_1);
1477   pass_arg0(this, arg_0);
1478   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1479 }
1480 
1481 void MacroAssembler::null_check(Register reg, int offset) {
1482   if (needs_explicit_null_check(offset)) {
1483     // provoke OS NULL exception if reg = NULL by
1484     // accessing M[reg] w/o changing any registers
1485     // NOTE: this is plenty to provoke a segv
1486     ldr(zr, Address(reg));
1487   } else {
1488     // nothing to do, (later) access of M[reg + offset]
1489     // will provoke OS NULL exception if reg = NULL
1490   }
1491 }
1492 
1493 void MacroAssembler::test_klass_is_value(Register klass, Register temp_reg, Label& is_value) {
1494   ldrw(temp_reg, Address(klass, Klass::access_flags_offset()));
1495   andr(temp_reg, temp_reg, JVM_ACC_VALUE);
1496   cbnz(temp_reg, is_value); 
1497 }
1498 
1499 void MacroAssembler::test_field_is_flattenable(Register flags, Register temp_reg, Label& is_flattenable) {
1500   (void) temp_reg; // keep signature uniform with x86
1501   tbnz(flags, ConstantPoolCacheEntry::is_flattenable_field_shift, is_flattenable);
1502 }
1503 
1504 void MacroAssembler::test_field_is_not_flattenable(Register flags, Register temp_reg, Label& not_flattenable) {
1505   (void) temp_reg; // keep signature uniform with x86
1506   tbz(flags, ConstantPoolCacheEntry::is_flattenable_field_shift, not_flattenable);
1507 }
1508 
1509 void MacroAssembler::test_field_is_flattened(Register flags, Register temp_reg, Label& is_flattened) {
1510   (void) temp_reg; // keep signature uniform with x86
1511   tbnz(flags, ConstantPoolCacheEntry::is_flattened_field_shift, is_flattened);
1512 }
1513 
1514 void MacroAssembler::test_flat_array_klass(Register klass, Register temp_reg, Label& is_flattened) {
1515   ldrw(temp_reg, Address(klass, Klass::layout_helper_offset()));
1516   asrw(temp_reg, temp_reg, Klass::_lh_array_tag_shift);
1517   cmpw(temp_reg, Klass::_lh_array_tag_vt_value);
1518   br(Assembler::EQ, is_flattened);
1519 }
1520 
1521 void MacroAssembler::test_flat_array_oop(Register oop, Register temp_reg, Label& is_flattened) {
1522   load_klass(temp_reg, oop);
1523   test_flat_array_klass(temp_reg, temp_reg, is_flattened);
1524 }
1525 
1526 // MacroAssembler protected routines needed to implement
1527 // public methods
1528 
1529 void MacroAssembler::mov(Register r, Address dest) {
1530   code_section()->relocate(pc(), dest.rspec());
1531   u_int64_t imm64 = (u_int64_t)dest.target();
1532   movptr(r, imm64);
1533 }
1534 
1535 // Move a constant pointer into r.  In AArch64 mode the virtual
1536 // address space is 48 bits in size, so we only need three
1537 // instructions to create a patchable instruction sequence that can
1538 // reach anywhere.
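//
// For example (illustrative only): movptr(r, 0x123456789abc) emits
//   movz r, #0x9abc
//   movk r, #0x5678, lsl #16
//   movk r, #0x1234, lsl #32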
1539 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1540 #ifndef PRODUCT
1541   {
1542     char buffer[64];
1543     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1544     block_comment(buffer);
1545   }
1546 #endif
1547   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1548   movz(r, imm64 & 0xffff);
1549   imm64 >>= 16;
1550   movk(r, imm64 & 0xffff, 16);
1551   imm64 >>= 16;
1552   movk(r, imm64 & 0xffff, 32);
1553 }
1554 
1555 // Macro to mov replicated immediate to vector register.
1556 //  Vd will get the following values for different arrangements in T
1557 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1558 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1559 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1560 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1561 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1562 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1563 //   T1D/T2D: invalid
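//
// For example (illustrative only): mov(Vd, T4S, 0x00ff0000u) reduces to a
// single
//   movi Vd.4s, #0xff, lsl #16
// because only one byte of the immediate is non-zero.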
1564 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1565   assert(T != T1D && T != T2D, "invalid arrangement");
1566   if (T == T8B || T == T16B) {
1567     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1568     movi(Vd, T, imm32 & 0xff, 0);
1569     return;
1570   }
1571   u_int32_t nimm32 = ~imm32;
1572   if (T == T4H || T == T8H) {
1573     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1574     imm32 &= 0xffff;
1575     nimm32 &= 0xffff;
1576   }
1577   u_int32_t x = imm32;
1578   int movi_cnt = 0;
1579   int movn_cnt = 0;
1580   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1581   x = nimm32;
1582   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1583   if (movn_cnt < movi_cnt) imm32 = nimm32;
1584   unsigned lsl = 0;
1585   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1586   if (movn_cnt < movi_cnt)
1587     mvni(Vd, T, imm32 & 0xff, lsl);
1588   else
1589     movi(Vd, T, imm32 & 0xff, lsl);
1590   imm32 >>= 8; lsl += 8;
1591   while (imm32) {
1592     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1593     if (movn_cnt < movi_cnt)
1594       bici(Vd, T, imm32 & 0xff, lsl);
1595     else
1596       orri(Vd, T, imm32 & 0xff, lsl);
1597     lsl += 8; imm32 >>= 8;
1598   }
1599 }
1600 
1601 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1602 {
1603 #ifndef PRODUCT
1604   {
1605     char buffer[64];
1606     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1607     block_comment(buffer);
1608   }
1609 #endif
1610   if (operand_valid_for_logical_immediate(false, imm64)) {
1611     orr(dst, zr, imm64);
1612   } else {
1613     // we can use a combination of MOVZ or MOVN with
1614     // MOVK to build up the constant
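    // For illustration: 0x00000000cafe0000 has three zero halfwords, so a
    // single movz dst, #0xcafe, lsl #16 suffices; 0xffffffffffff1234 has
    // three 0xffff halfwords, so a single movn dst, #0xedcb does the job.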
1615     u_int64_t imm_h[4];
1616     int zero_count = 0;
1617     int neg_count = 0;
1618     int i;
1619     for (i = 0; i < 4; i++) {
1620       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1621       if (imm_h[i] == 0) {
1622         zero_count++;
1623       } else if (imm_h[i] == 0xffffL) {
1624         neg_count++;
1625       }
1626     }
1627     if (zero_count == 4) {
1628       // one MOVZ will do
1629       movz(dst, 0);
1630     } else if (neg_count == 4) {
1631       // one MOVN will do
1632       movn(dst, 0);
1633     } else if (zero_count == 3) {
1634       for (i = 0; i < 4; i++) {
1635         if (imm_h[i] != 0L) {
1636           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1637           break;
1638         }
1639       }
1640     } else if (neg_count == 3) {
1641       // one MOVN will do
1642       for (int i = 0; i < 4; i++) {
1643         if (imm_h[i] != 0xffffL) {
1644           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1645           break;
1646         }
1647       }
1648     } else if (zero_count == 2) {
1649       // one MOVZ and one MOVK will do
1650       for (i = 0; i < 3; i++) {
1651         if (imm_h[i] != 0L) {
1652           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1653           i++;
1654           break;
1655         }
1656       }
1657       for (;i < 4; i++) {
1658         if (imm_h[i] != 0L) {
1659           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1660         }
1661       }
1662     } else if (neg_count == 2) {
1663       // one MOVN and one MOVK will do
1664       for (i = 0; i < 4; i++) {
1665         if (imm_h[i] != 0xffffL) {
1666           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1667           i++;
1668           break;
1669         }
1670       }
1671       for (;i < 4; i++) {
1672         if (imm_h[i] != 0xffffL) {
1673           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1674         }
1675       }
1676     } else if (zero_count == 1) {
1677       // one MOVZ and two MOVKs will do
1678       for (i = 0; i < 4; i++) {
1679         if (imm_h[i] != 0L) {
1680           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1681           i++;
1682           break;
1683         }
1684       }
1685       for (;i < 4; i++) {
1686         if (imm_h[i] != 0x0L) {
1687           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1688         }
1689       }
1690     } else if (neg_count == 1) {
1691       // one MOVN and two MOVKs will do
1692       for (i = 0; i < 4; i++) {
1693         if (imm_h[i] != 0xffffL) {
1694           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1695           i++;
1696           break;
1697         }
1698       }
1699       for (;i < 4; i++) {
1700         if (imm_h[i] != 0xffffL) {
1701           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1702         }
1703       }
1704     } else {
1705       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1706       movz(dst, (u_int32_t)imm_h[0], 0);
1707       for (i = 1; i < 4; i++) {
1708         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1709       }
1710     }
1711   }
1712 }
1713 
1714 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1715 {
1716 #ifndef PRODUCT
1717     {
1718       char buffer[64];
1719       snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1720       block_comment(buffer);
1721     }
1722 #endif
1723   if (operand_valid_for_logical_immediate(true, imm32)) {
1724     orrw(dst, zr, imm32);
1725   } else {
    // we can use a MOVZ or MOVN, possibly followed by a MOVK, to
    // build up the constant
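    // For illustration: 0xdead0000 has a zero low halfword, so a single
    //   movzw dst, #0xdead, lsl #16
    // suffices, while 0x12345678 needs the MOVZ/MOVK pair
    //   movzw dst, #0x5678; movkw dst, #0x1234, lsl #16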
1728     u_int32_t imm_h[2];
1729     imm_h[0] = imm32 & 0xffff;
1730     imm_h[1] = ((imm32 >> 16) & 0xffff);
1731     if (imm_h[0] == 0) {
1732       movzw(dst, imm_h[1], 16);
1733     } else if (imm_h[0] == 0xffff) {
1734       movnw(dst, imm_h[1] ^ 0xffff, 16);
1735     } else if (imm_h[1] == 0) {
1736       movzw(dst, imm_h[0], 0);
1737     } else if (imm_h[1] == 0xffff) {
1738       movnw(dst, imm_h[0] ^ 0xffff, 0);
1739     } else {
1740       // use a MOVZ and MOVK (makes it easier to debug)
1741       movzw(dst, imm_h[0], 0);
1742       movkw(dst, imm_h[1], 16);
1743     }
1744   }
1745 }
1746 
1747 // Form an address from base + offset in Rd.  Rd may or may
1748 // not actually be used: you must use the Address that is returned.
1749 // It is up to you to ensure that the shift provided matches the size
1750 // of your data.
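//
// For illustration (assuming shift == 3 for 8-byte data): a byte_offset of
// 0x9008 does not fit the scaled 12-bit immediate, so this emits
//   add Rd, base, #0x8000
// and returns Address(Rd, 0x1008), which is reachable by a scaled load/store.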
1751 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1752   if (Address::offset_ok_for_immed(byte_offset, shift))
1753     // It fits; no need for any heroics
1754     return Address(base, byte_offset);
1755 
1756   // Don't do anything clever with negative or misaligned offsets
1757   unsigned mask = (1 << shift) - 1;
1758   if (byte_offset < 0 || byte_offset & mask) {
1759     mov(Rd, byte_offset);
1760     add(Rd, base, Rd);
1761     return Address(Rd);
1762   }
1763 
1764   // See if we can do this with two 12-bit offsets
1765   {
1766     unsigned long word_offset = byte_offset >> shift;
1767     unsigned long masked_offset = word_offset & 0xfff000;
1768     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1769         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1770       add(Rd, base, masked_offset << shift);
1771       word_offset -= masked_offset;
1772       return Address(Rd, word_offset << shift);
1773     }
1774   }
1775 
1776   // Do it the hard way
1777   mov(Rd, byte_offset);
1778   add(Rd, base, Rd);
1779   return Address(Rd);
1780 }
1781 
1782 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1783   if (UseLSE) {
1784     mov(tmp, 1);
1785     ldadd(Assembler::word, tmp, zr, counter_addr);
1786     return;
1787   }
1788   Label retry_load;
1789   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1790     prfm(Address(counter_addr), PSTL1STRM);
1791   bind(retry_load);
1792   // flush and load exclusive from the memory location
1793   ldxrw(tmp, counter_addr);
1794   addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp will be zero
1796   stxrw(tmp2, tmp, counter_addr);
1797   cbnzw(tmp2, retry_load);
1798 }
1799 
1800 
1801 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1802                                     bool want_remainder, Register scratch)
1803 {
1804   // Full implementation of Java idiv and irem.  The function
1805   // returns the (pc) offset of the div instruction - may be needed
1806   // for implicit exceptions.
1807   //
1808   // constraint : ra/rb =/= scratch
1809   //         normal case
1810   //
1811   // input : ra: dividend
1812   //         rb: divisor
1813   //
1814   // result: either
1815   //         quotient  (= ra idiv rb)
1816   //         remainder (= ra irem rb)
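  //
  // Illustrative use (not from the original sources):
  //   corrected_idivl(r0, r1, r2, /*want_remainder*/ false, rscratch1)
  // leaves r1 / r2 in r0; passing true leaves r1 % r2 in r0 instead.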
1817 
1818   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1819 
1820   int idivl_offset = offset();
1821   if (! want_remainder) {
1822     sdivw(result, ra, rb);
1823   } else {
1824     sdivw(scratch, ra, rb);
1825     Assembler::msubw(result, scratch, rb, ra);
1826   }
1827 
1828   return idivl_offset;
1829 }
1830 
1831 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1832                                     bool want_remainder, Register scratch)
1833 {
1834   // Full implementation of Java ldiv and lrem.  The function
1835   // returns the (pc) offset of the div instruction - may be needed
1836   // for implicit exceptions.
1837   //
1838   // constraint : ra/rb =/= scratch
1839   //         normal case
1840   //
1841   // input : ra: dividend
1842   //         rb: divisor
1843   //
1844   // result: either
1845   //         quotient  (= ra idiv rb)
1846   //         remainder (= ra irem rb)
1847 
1848   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1849 
1850   int idivq_offset = offset();
1851   if (! want_remainder) {
1852     sdiv(result, ra, rb);
1853   } else {
1854     sdiv(scratch, ra, rb);
1855     Assembler::msub(result, scratch, rb, ra);
1856   }
1857 
1858   return idivq_offset;
1859 }
1860 
1861 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1862   address prev = pc() - NativeMembar::instruction_size;
1863   address last = code()->last_insn();
1864   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1865     NativeMembar *bar = NativeMembar_at(prev);
1866     // We are merging two memory barrier instructions.  On AArch64 we
1867     // can do this simply by ORing them together.
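    // For example (illustrative): membar(LoadLoad) immediately followed by
    // membar(StoreStore) leaves a single dmb whose kind covers both
    // constraints.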
1868     bar->set_kind(bar->get_kind() | order_constraint);
1869     BLOCK_COMMENT("merged membar");
1870   } else {
1871     code()->set_last_insn(pc());
1872     dmb(Assembler::barrier(order_constraint));
1873   }
1874 }
1875 
1876 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1877   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1878     merge_ldst(rt, adr, size_in_bytes, is_store);
1879     code()->clear_last_insn();
1880     return true;
1881   } else {
1882     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
1883     const unsigned mask = size_in_bytes - 1;
1884     if (adr.getMode() == Address::base_plus_offset &&
1885         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1886       code()->set_last_insn(pc());
1887     }
1888     return false;
1889   }
1890 }
1891 
1892 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1893   // We always try to merge two adjacent loads into one ldp.
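  // For example (illustrative): ldr(r0, Address(sp, 16)) immediately followed
  // by ldr(r1, Address(sp, 24)) is emitted as a single ldp r0, r1, [sp, #16].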
1894   if (!try_merge_ldst(Rx, adr, 8, false)) {
1895     Assembler::ldr(Rx, adr);
1896   }
1897 }
1898 
1899 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1900   // We always try to merge two adjacent loads into one ldp.
1901   if (!try_merge_ldst(Rw, adr, 4, false)) {
1902     Assembler::ldrw(Rw, adr);
1903   }
1904 }
1905 
1906 void MacroAssembler::str(Register Rx, const Address &adr) {
1907   // We always try to merge two adjacent stores into one stp.
1908   if (!try_merge_ldst(Rx, adr, 8, true)) {
1909     Assembler::str(Rx, adr);
1910   }
1911 }
1912 
1913 void MacroAssembler::strw(Register Rw, const Address &adr) {
1914   // We always try to merge two adjacent stores into one stp.
1915   if (!try_merge_ldst(Rw, adr, 4, true)) {
1916     Assembler::strw(Rw, adr);
1917   }
1918 }
1919 
1920 // MacroAssembler routines found actually to be needed
1921 
1922 void MacroAssembler::push(Register src)
1923 {
1924   str(src, Address(pre(esp, -1 * wordSize)));
1925 }
1926 
1927 void MacroAssembler::pop(Register dst)
1928 {
1929   ldr(dst, Address(post(esp, 1 * wordSize)));
1930 }
1931 
1932 // Note: load_unsigned_short used to be called load_unsigned_word.
1933 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1934   int off = offset();
1935   ldrh(dst, src);
1936   return off;
1937 }
1938 
1939 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1940   int off = offset();
1941   ldrb(dst, src);
1942   return off;
1943 }
1944 
1945 int MacroAssembler::load_signed_short(Register dst, Address src) {
1946   int off = offset();
1947   ldrsh(dst, src);
1948   return off;
1949 }
1950 
1951 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1952   int off = offset();
1953   ldrsb(dst, src);
1954   return off;
1955 }
1956 
1957 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1958   int off = offset();
1959   ldrshw(dst, src);
1960   return off;
1961 }
1962 
1963 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1964   int off = offset();
1965   ldrsbw(dst, src);
1966   return off;
1967 }
1968 
1969 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1970   switch (size_in_bytes) {
1971   case  8:  ldr(dst, src); break;
1972   case  4:  ldrw(dst, src); break;
1973   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1974   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1975   default:  ShouldNotReachHere();
1976   }
1977 }
1978 
1979 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1980   switch (size_in_bytes) {
1981   case  8:  str(src, dst); break;
1982   case  4:  strw(src, dst); break;
1983   case  2:  strh(src, dst); break;
1984   case  1:  strb(src, dst); break;
1985   default:  ShouldNotReachHere();
1986   }
1987 }
1988 
1989 void MacroAssembler::decrementw(Register reg, int value)
1990 {
1991   if (value < 0)  { incrementw(reg, -value);      return; }
1992   if (value == 0) {                               return; }
1993   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1994   /* else */ {
1995     guarantee(reg != rscratch2, "invalid dst for register decrement");
1996     movw(rscratch2, (unsigned)value);
1997     subw(reg, reg, rscratch2);
1998   }
1999 }
2000 
2001 void MacroAssembler::decrement(Register reg, int value)
2002 {
2003   if (value < 0)  { increment(reg, -value);      return; }
2004   if (value == 0) {                              return; }
2005   if (value < (1 << 12)) { sub(reg, reg, value); return; }
2006   /* else */ {
2007     assert(reg != rscratch2, "invalid dst for register decrement");
2008     mov(rscratch2, (unsigned long)value);
2009     sub(reg, reg, rscratch2);
2010   }
2011 }
2012 
2013 void MacroAssembler::decrementw(Address dst, int value)
2014 {
2015   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
2016   if (dst.getMode() == Address::literal) {
2017     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2018     lea(rscratch2, dst);
2019     dst = Address(rscratch2);
2020   }
2021   ldrw(rscratch1, dst);
2022   decrementw(rscratch1, value);
2023   strw(rscratch1, dst);
2024 }
2025 
2026 void MacroAssembler::decrement(Address dst, int value)
2027 {
2028   assert(!dst.uses(rscratch1), "invalid address for decrement");
2029   if (dst.getMode() == Address::literal) {
2030     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2031     lea(rscratch2, dst);
2032     dst = Address(rscratch2);
2033   }
2034   ldr(rscratch1, dst);
2035   decrement(rscratch1, value);
2036   str(rscratch1, dst);
2037 }
2038 
2039 void MacroAssembler::incrementw(Register reg, int value)
2040 {
2041   if (value < 0)  { decrementw(reg, -value);      return; }
2042   if (value == 0) {                               return; }
2043   if (value < (1 << 12)) { addw(reg, reg, value); return; }
2044   /* else */ {
2045     assert(reg != rscratch2, "invalid dst for register increment");
2046     movw(rscratch2, (unsigned)value);
2047     addw(reg, reg, rscratch2);
2048   }
2049 }
2050 
2051 void MacroAssembler::increment(Register reg, int value)
2052 {
2053   if (value < 0)  { decrement(reg, -value);      return; }
2054   if (value == 0) {                              return; }
2055   if (value < (1 << 12)) { add(reg, reg, value); return; }
2056   /* else */ {
2057     assert(reg != rscratch2, "invalid dst for register increment");
2058     movw(rscratch2, (unsigned)value);
2059     add(reg, reg, rscratch2);
2060   }
2061 }
2062 
2063 void MacroAssembler::incrementw(Address dst, int value)
2064 {
2065   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2066   if (dst.getMode() == Address::literal) {
2067     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2068     lea(rscratch2, dst);
2069     dst = Address(rscratch2);
2070   }
2071   ldrw(rscratch1, dst);
2072   incrementw(rscratch1, value);
2073   strw(rscratch1, dst);
2074 }
2075 
2076 void MacroAssembler::increment(Address dst, int value)
2077 {
2078   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2079   if (dst.getMode() == Address::literal) {
2080     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2081     lea(rscratch2, dst);
2082     dst = Address(rscratch2);
2083   }
2084   ldr(rscratch1, dst);
2085   increment(rscratch1, value);
2086   str(rscratch1, dst);
2087 }
2088 
2089 
2090 void MacroAssembler::pusha() {
2091   push(0x7fffffff, sp);
2092 }
2093 
2094 void MacroAssembler::popa() {
2095   pop(0x7fffffff, sp);
2096 }
2097 
2098 // Push lots of registers in the bit set supplied.  Don't push sp.
2099 // Return the number of words pushed
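//
// For example (illustrative, assuming stack == sp): pushing {r0, r1, r2}
// appends zr to keep the count even, emitting
//   stp r0, r1, [sp, #-32]!
//   stp r2, zr, [sp, #16]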
2100 int MacroAssembler::push(unsigned int bitset, Register stack) {
2101   int words_pushed = 0;
2102 
2103   // Scan bitset to accumulate register pairs
2104   unsigned char regs[32];
2105   int count = 0;
2106   for (int reg = 0; reg <= 30; reg++) {
2107     if (1 & bitset)
2108       regs[count++] = reg;
2109     bitset >>= 1;
2110   }
2111   regs[count++] = zr->encoding_nocheck();
  count &= ~1;  // Only push an even number of regs
2113 
2114   if (count) {
2115     stp(as_Register(regs[0]), as_Register(regs[1]),
2116        Address(pre(stack, -count * wordSize)));
2117     words_pushed += 2;
2118   }
2119   for (int i = 2; i < count; i += 2) {
2120     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2121        Address(stack, i * wordSize));
2122     words_pushed += 2;
2123   }
2124 
2125   assert(words_pushed == count, "oops, pushed != count");
2126 
2127   return count;
2128 }
2129 
2130 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2131   int words_pushed = 0;
2132 
2133   // Scan bitset to accumulate register pairs
2134   unsigned char regs[32];
2135   int count = 0;
2136   for (int reg = 0; reg <= 30; reg++) {
2137     if (1 & bitset)
2138       regs[count++] = reg;
2139     bitset >>= 1;
2140   }
2141   regs[count++] = zr->encoding_nocheck();
2142   count &= ~1;
2143 
2144   for (int i = 2; i < count; i += 2) {
2145     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2146        Address(stack, i * wordSize));
2147     words_pushed += 2;
2148   }
2149   if (count) {
2150     ldp(as_Register(regs[0]), as_Register(regs[1]),
2151        Address(post(stack, count * wordSize)));
2152     words_pushed += 2;
2153   }
2154 
2155   assert(words_pushed == count, "oops, pushed != count");
2156 
2157   return count;
2158 }
2159 #ifdef ASSERT
2160 void MacroAssembler::verify_heapbase(const char* msg) {
2161 #if 0
2162   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2163   assert (Universe::heap() != NULL, "java heap should be initialized");
2164   if (CheckCompressedOops) {
2165     Label ok;
2166     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2167     cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2168     br(Assembler::EQ, ok);
2169     stop(msg);
2170     bind(ok);
2171     pop(1 << rscratch1->encoding(), sp);
2172   }
2173 #endif
2174 }
2175 #endif
2176 
2177 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2178   Label done, not_weak;
2179   cbz(value, done);           // Use NULL as-is.
2180 
2181   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
  tbz(value, 0, not_weak);    // Test for jweak tag.
2183 
2184   // Resolve jweak.
2185   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2186                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2187   verify_oop(value);
2188   b(done);
2189 
2190   bind(not_weak);
2191   // Resolve (untagged) jobject.
2192   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2193   verify_oop(value);
2194   bind(done);
2195 }
2196 
2197 void MacroAssembler::stop(const char* msg) {
2198   address ip = pc();
2199   pusha();
2200   mov(c_rarg0, (address)msg);
2201   mov(c_rarg1, (address)ip);
2202   mov(c_rarg2, sp);
2203   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2204   // call(c_rarg3);
2205   blrt(c_rarg3, 3, 0, 1);
2206   hlt(0);
2207 }
2208 
2209 void MacroAssembler::warn(const char* msg) {
2210   pusha();
2211   mov(c_rarg0, (address)msg);
2212   mov(lr, CAST_FROM_FN_PTR(address, warning));
2213   blrt(lr, 1, 0, MacroAssembler::ret_type_void);
2214   popa();
2215 }
2216 
2217 void MacroAssembler::unimplemented(const char* what) {
2218   const char* buf = NULL;
2219   {
2220     ResourceMark rm;
2221     stringStream ss;
2222     ss.print("unimplemented: %s", what);
2223     buf = code_string(ss.as_string());
2224   }
2225   stop(buf);
2226 }
2227 
2228 // If a constant does not fit in an immediate field, generate some
2229 // number of MOV instructions and then perform the operation.
2230 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2231                                            add_sub_imm_insn insn1,
2232                                            add_sub_reg_insn insn2) {
2233   assert(Rd != zr, "Rd = zr and not setting flags?");
2234   if (operand_valid_for_add_sub_immediate((int)imm)) {
2235     (this->*insn1)(Rd, Rn, imm);
2236   } else {
2237     if (uabs(imm) < (1 << 24)) {
2238        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2239        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2240     } else {
2241        assert_different_registers(Rd, Rn);
2242        mov(Rd, (uint64_t)imm);
2243        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2244     }
2245   }
2246 }
2247 
// Separate version which sets the flags.  Optimisations are more restricted
// because we must set the flags correctly.
2250 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2251                                            add_sub_imm_insn insn1,
2252                                            add_sub_reg_insn insn2) {
2253   if (operand_valid_for_add_sub_immediate((int)imm)) {
2254     (this->*insn1)(Rd, Rn, imm);
2255   } else {
2256     assert_different_registers(Rd, Rn);
2257     assert(Rd != zr, "overflow in immediate operand");
2258     mov(Rd, (uint64_t)imm);
2259     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2260   }
2261 }
2262 
2263 
2264 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2265   if (increment.is_register()) {
2266     add(Rd, Rn, increment.as_register());
2267   } else {
2268     add(Rd, Rn, increment.as_constant());
2269   }
2270 }
2271 
2272 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2273   if (increment.is_register()) {
2274     addw(Rd, Rn, increment.as_register());
2275   } else {
2276     addw(Rd, Rn, increment.as_constant());
2277   }
2278 }
2279 
2280 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2281   if (decrement.is_register()) {
2282     sub(Rd, Rn, decrement.as_register());
2283   } else {
2284     sub(Rd, Rn, decrement.as_constant());
2285   }
2286 }
2287 
2288 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2289   if (decrement.is_register()) {
2290     subw(Rd, Rn, decrement.as_register());
2291   } else {
2292     subw(Rd, Rn, decrement.as_constant());
2293   }
2294 }
2295 
2296 void MacroAssembler::reinit_heapbase()
2297 {
2298   if (UseCompressedOops) {
2299     if (Universe::is_fully_initialized()) {
2300       mov(rheapbase, Universe::narrow_ptrs_base());
2301     } else {
2302       lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2303       ldr(rheapbase, Address(rheapbase));
2304     }
2305   }
2306 }
2307 
2308 // this simulates the behaviour of the x86 cmpxchg instruction using a
2309 // load linked/store conditional pair. we use the acquire/release
2310 // versions of these instructions so that we flush pending writes as
2311 // per Java semantics.
2312 
2313 // n.b the x86 version assumes the old value to be compared against is
2314 // in rax and updates rax with the value located in memory if the
2315 // cmpxchg fails. we supply a register for the old value explicitly
2316 
2317 // the aarch64 load linked/store conditional instructions do not
2318 // accept an offset. so, unlike x86, we must provide a plain register
2319 // to identify the memory word to be compared/exchanged rather than a
2320 // register+offset Address.
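
// Illustrative use (not from the original sources): with oldv holding the
// expected value and newv the replacement,
//   Label succeed, fail;
//   cmpxchgptr(oldv, newv, addr, tmp, succeed, &fail);
// reaches 'succeed' if the exchange happened and 'fail' otherwise (falling
// through on failure when fail is NULL), leaving the observed value in oldv.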
2321 
2322 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2323                                 Label &succeed, Label *fail) {
2324   // oldv holds comparison value
2325   // newv holds value to write in exchange
2326   // addr identifies memory word to compare against/update
2327   if (UseLSE) {
2328     mov(tmp, oldv);
2329     casal(Assembler::xword, oldv, newv, addr);
2330     cmp(tmp, oldv);
2331     br(Assembler::EQ, succeed);
2332     membar(AnyAny);
2333   } else {
2334     Label retry_load, nope;
2335     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2336       prfm(Address(addr), PSTL1STRM);
2337     bind(retry_load);
2338     // flush and load exclusive from the memory location
2339     // and fail if it is not what we expect
2340     ldaxr(tmp, addr);
2341     cmp(tmp, oldv);
2342     br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
2344     stlxr(tmp, newv, addr);
2345     cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare;
    // this ensures we don't return a stale value after a failed write.
2348     b(retry_load);
2349     // if the memory word differs we return it in oldv and signal a fail
2350     bind(nope);
2351     membar(AnyAny);
2352     mov(oldv, tmp);
2353   }
2354   if (fail)
2355     b(*fail);
2356 }
2357 
2358 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2359                                         Label &succeed, Label *fail) {
2360   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2361   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2362 }
2363 
2364 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2365                                 Label &succeed, Label *fail) {
2366   // oldv holds comparison value
2367   // newv holds value to write in exchange
2368   // addr identifies memory word to compare against/update
2369   // tmp returns 0/1 for success/failure
2370   if (UseLSE) {
2371     mov(tmp, oldv);
2372     casal(Assembler::word, oldv, newv, addr);
2373     cmp(tmp, oldv);
2374     br(Assembler::EQ, succeed);
2375     membar(AnyAny);
2376   } else {
2377     Label retry_load, nope;
2378     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2379       prfm(Address(addr), PSTL1STRM);
2380     bind(retry_load);
2381     // flush and load exclusive from the memory location
2382     // and fail if it is not what we expect
2383     ldaxrw(tmp, addr);
2384     cmp(tmp, oldv);
2385     br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
2387     stlxrw(tmp, newv, addr);
2388     cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare;
    // this ensures we don't return a stale value after a failed write.
2391     b(retry_load);
2392     // if the memory word differs we return it in oldv and signal a fail
2393     bind(nope);
2394     membar(AnyAny);
2395     mov(oldv, tmp);
2396   }
2397   if (fail)
2398     b(*fail);
2399 }
2400 
2401 // A generic CAS; success or failure is in the EQ flag.  A weak CAS
2402 // doesn't retry and may fail spuriously.  If the oldval is wanted,
// pass a register for the result, otherwise pass noreg.
2404 
2405 // Clobbers rscratch1
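//
// Illustrative use (not from the original sources):
//   cmpxchg(addr, expected, new_val, Assembler::xword,
//           /*acquire*/ true, /*release*/ true, /*weak*/ false, noreg);
//   br(Assembler::EQ, swapped);   // EQ <=> the CAS succeeded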
2406 void MacroAssembler::cmpxchg(Register addr, Register expected,
2407                              Register new_val,
2408                              enum operand_size size,
2409                              bool acquire, bool release,
2410                              bool weak,
2411                              Register result) {
2412   if (result == noreg)  result = rscratch1;
2413   BLOCK_COMMENT("cmpxchg {");
2414   if (UseLSE) {
2415     mov(result, expected);
2416     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2417     compare_eq(result, expected, size);
2418   } else {
2419     Label retry_load, done;
2420     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2421       prfm(Address(addr), PSTL1STRM);
2422     bind(retry_load);
2423     load_exclusive(result, addr, size, acquire);
2424     compare_eq(result, expected, size);
2425     br(Assembler::NE, done);
2426     store_exclusive(rscratch1, new_val, addr, size, release);
2427     if (weak) {
2428       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2429     } else {
2430       cbnzw(rscratch1, retry_load);
2431     }
2432     bind(done);
2433   }
2434   BLOCK_COMMENT("} cmpxchg");
2435 }
2436 
2437 // A generic comparison. Only compares for equality, clobbers rscratch1.
2438 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2439   if (size == xword) {
2440     cmp(rm, rn);
2441   } else if (size == word) {
2442     cmpw(rm, rn);
2443   } else if (size == halfword) {
2444     eorw(rscratch1, rm, rn);
2445     ands(zr, rscratch1, 0xffff);
2446   } else if (size == byte) {
2447     eorw(rscratch1, rm, rn);
2448     ands(zr, rscratch1, 0xff);
2449   } else {
2450     ShouldNotReachHere();
2451   }
2452 }
2453 
2454 
2455 static bool different(Register a, RegisterOrConstant b, Register c) {
2456   if (b.is_constant())
2457     return a != c;
2458   else
2459     return a != b.as_register() && a != c && b.as_register() != c;
2460 }
2461 
2462 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2463 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2464   if (UseLSE) {                                                         \
2465     prev = prev->is_valid() ? prev : zr;                                \
2466     if (incr.is_register()) {                                           \
2467       AOP(sz, incr.as_register(), prev, addr);                          \
2468     } else {                                                            \
2469       mov(rscratch2, incr.as_constant());                               \
2470       AOP(sz, rscratch2, prev, addr);                                   \
2471     }                                                                   \
2472     return;                                                             \
2473   }                                                                     \
2474   Register result = rscratch2;                                          \
2475   if (prev->is_valid())                                                 \
2476     result = different(prev, incr, addr) ? prev : rscratch2;            \
2477                                                                         \
2478   Label retry_load;                                                     \
2479   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2480     prfm(Address(addr), PSTL1STRM);                                     \
2481   bind(retry_load);                                                     \
2482   LDXR(result, addr);                                                   \
2483   OP(rscratch1, result, incr);                                          \
2484   STXR(rscratch2, rscratch1, addr);                                     \
2485   cbnzw(rscratch2, retry_load);                                         \
2486   if (prev->is_valid() && prev != result) {                             \
2487     IOP(prev, rscratch1, incr);                                         \
2488   }                                                                     \
2489 }
2490 
2491 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2492 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2493 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2494 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2495 
2496 #undef ATOMIC_OP
2497 
2498 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2499 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2500   if (UseLSE) {                                                         \
2501     prev = prev->is_valid() ? prev : zr;                                \
2502     AOP(sz, newv, prev, addr);                                          \
2503     return;                                                             \
2504   }                                                                     \
2505   Register result = rscratch2;                                          \
2506   if (prev->is_valid())                                                 \
2507     result = different(prev, newv, addr) ? prev : rscratch2;            \
2508                                                                         \
2509   Label retry_load;                                                     \
2510   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2511     prfm(Address(addr), PSTL1STRM);                                     \
2512   bind(retry_load);                                                     \
2513   LDXR(result, addr);                                                   \
2514   STXR(rscratch1, newv, addr);                                          \
2515   cbnzw(rscratch1, retry_load);                                         \
2516   if (prev->is_valid() && prev != result)                               \
2517     mov(prev, result);                                                  \
2518 }
2519 
2520 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2521 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2522 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2523 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2524 
2525 #undef ATOMIC_XCHG
2526 
2527 #ifndef PRODUCT
2528 extern "C" void findpc(intptr_t x);
2529 #endif
2530 
2531 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2532 {
  // In order to get locks to work, we need to fake an in_VM state
2534   if (ShowMessageBoxOnError ) {
2535     JavaThread* thread = JavaThread::current();
2536     JavaThreadState saved_state = thread->thread_state();
2537     thread->set_thread_state(_thread_in_vm);
2538 #ifndef PRODUCT
2539     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2540       ttyLocker ttyl;
2541       BytecodeCounter::print();
2542     }
2543 #endif
2544     if (os::message_box(msg, "Execution stopped, print registers?")) {
2545       ttyLocker ttyl;
2546       tty->print_cr(" pc = 0x%016lx", pc);
2547 #ifndef PRODUCT
2548       tty->cr();
2549       findpc(pc);
2550       tty->cr();
2551 #endif
2552       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2553       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2554       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2555       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2556       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2557       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2558       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2559       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2560       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2561       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2562       tty->print_cr("r10 = 0x%016lx", regs[10]);
2563       tty->print_cr("r11 = 0x%016lx", regs[11]);
2564       tty->print_cr("r12 = 0x%016lx", regs[12]);
2565       tty->print_cr("r13 = 0x%016lx", regs[13]);
2566       tty->print_cr("r14 = 0x%016lx", regs[14]);
2567       tty->print_cr("r15 = 0x%016lx", regs[15]);
2568       tty->print_cr("r16 = 0x%016lx", regs[16]);
2569       tty->print_cr("r17 = 0x%016lx", regs[17]);
2570       tty->print_cr("r18 = 0x%016lx", regs[18]);
2571       tty->print_cr("r19 = 0x%016lx", regs[19]);
2572       tty->print_cr("r20 = 0x%016lx", regs[20]);
2573       tty->print_cr("r21 = 0x%016lx", regs[21]);
2574       tty->print_cr("r22 = 0x%016lx", regs[22]);
2575       tty->print_cr("r23 = 0x%016lx", regs[23]);
2576       tty->print_cr("r24 = 0x%016lx", regs[24]);
2577       tty->print_cr("r25 = 0x%016lx", regs[25]);
2578       tty->print_cr("r26 = 0x%016lx", regs[26]);
2579       tty->print_cr("r27 = 0x%016lx", regs[27]);
2580       tty->print_cr("r28 = 0x%016lx", regs[28]);
2581       tty->print_cr("r30 = 0x%016lx", regs[30]);
2582       tty->print_cr("r31 = 0x%016lx", regs[31]);
2583       BREAKPOINT;
2584     }
2585     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2586   } else {
2587     ttyLocker ttyl;
2588     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2589                     msg);
2590     assert(false, "DEBUG MESSAGE: %s", msg);
2591   }
2592 }
2593 
2594 #ifdef BUILTIN_SIM
2595 // routine to generate an x86 prolog for a stub function which
2596 // bootstraps into the generated ARM code which directly follows the
2597 // stub
2598 //
2599 // the argument encodes the number of general and fp registers
// passed by the caller and the calling convention (currently just
2601 // the number of general registers and assumes C argument passing)
2602 
2603 extern "C" {
2604 int aarch64_stub_prolog_size();
2605 void aarch64_stub_prolog();
2606 void aarch64_prolog();
2607 }
2608 
2609 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
2610                                    address *prolog_ptr)
2611 {
2612   int calltype = (((ret_type & 0x3) << 8) |
2613                   ((fp_arg_count & 0xf) << 4) |
2614                   (gp_arg_count & 0xf));
2615 
2616   // the addresses for the x86 to ARM entry code we need to use
2617   address start = pc();
2618   // printf("start = %lx\n", start);
2619   int byteCount =  aarch64_stub_prolog_size();
2620   // printf("byteCount = %x\n", byteCount);
2621   int instructionCount = (byteCount + 3)/ 4;
2622   // printf("instructionCount = %x\n", instructionCount);
2623   for (int i = 0; i < instructionCount; i++) {
2624     nop();
2625   }
2626 
2627   memcpy(start, (void*)aarch64_stub_prolog, byteCount);
2628 
  // write the address of the setup routine and the call format into
  // the end of the copied code
2631   u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
2632   if (prolog_ptr)
2633     patch_end[-2] = (u_int64_t)prolog_ptr;
2634   patch_end[-1] = calltype;
2635 }
2636 #endif
2637 
2638 void MacroAssembler::push_call_clobbered_registers() {
2639   int step = 4 * wordSize;
2640   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2641   sub(sp, sp, step);
2642   mov(rscratch1, -step);
2643   // Push v0-v7, v16-v31.
2644   for (int i = 31; i>= 4; i -= 4) {
2645     if (i <= v7->encoding() || i >= v16->encoding())
2646       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2647           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2648   }
2649   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2650       as_FloatRegister(3), T1D, Address(sp));
2651 }
2652 
2653 void MacroAssembler::pop_call_clobbered_registers() {
2654   for (int i = 0; i < 32; i += 4) {
2655     if (i <= v7->encoding() || i >= v16->encoding())
2656       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2657           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2658   }
2659 
2660   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2661 }
2662 
2663 void MacroAssembler::push_CPU_state(bool save_vectors) {
2664   int step = (save_vectors ? 8 : 4) * wordSize;
2665   push(0x3fffffff, sp);         // integer registers except lr & sp
2666   mov(rscratch1, -step);
2667   sub(sp, sp, step);
2668   for (int i = 28; i >= 4; i -= 4) {
2669     st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2670         as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2671   }
2672   st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2673 }
2674 
2675 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2676   int step = (restore_vectors ? 8 : 4) * wordSize;
2677   for (int i = 0; i <= 28; i += 4)
2678     ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2679         as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2680   pop(0x3fffffff, sp);         // integer registers except lr & sp
2681 }
2682 
2683 /**
2684  * Helpers for multiply_to_len().
2685  */
2686 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2687                                      Register src1, Register src2) {
2688   adds(dest_lo, dest_lo, src1);
2689   adc(dest_hi, dest_hi, zr);
2690   adds(dest_lo, dest_lo, src2);
2691   adc(final_dest_hi, dest_hi, zr);
2692 }
2693 
2694 // Generate an address from (r + r1 extend offset).  "size" is the
2695 // size of the operand.  The result may be in rscratch2.
2696 Address MacroAssembler::offsetted_address(Register r, Register r1,
2697                                           Address::extend ext, int offset, int size) {
2698   if (offset || (ext.shift() % size != 0)) {
2699     lea(rscratch2, Address(r, r1, ext));
2700     return Address(rscratch2, offset);
2701   } else {
2702     return Address(r, r1, ext);
2703   }
2704 }
2705 
2706 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2707 {
2708   assert(offset >= 0, "spill to negative address?");
2709   // Offset reachable ?
2710   //   Not aligned - 9 bits signed offset
2711   //   Aligned - 12 bits unsigned offset shifted
2712   Register base = sp;
2713   if ((offset & (size-1)) && offset >= (1<<8)) {
2714     add(tmp, base, offset & ((1<<12)-1));
2715     base = tmp;
2716     offset &= -1<<12;
2717   }
2718 
2719   if (offset >= (1<<12) * size) {
2720     add(tmp, base, offset & (((1<<12)-1)<<12));
2721     base = tmp;
2722     offset &= ~(((1<<12)-1)<<12);
2723   }
2724 
2725   return Address(base, offset);
2726 }
2727 
2728 // Checks whether offset is aligned.
2729 // Returns true if it is, else false.
2730 bool MacroAssembler::merge_alignment_check(Register base,
2731                                            size_t size,
2732                                            long cur_offset,
2733                                            long prev_offset) const {
2734   if (AvoidUnalignedAccesses) {
2735     if (base == sp) {
      // Checks whether the low offset is aligned for a register pair.
2737       long pair_mask = size * 2 - 1;
2738       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2739       return (offset & pair_mask) == 0;
2740     } else { // If base is not sp, we can't guarantee the access is aligned.
2741       return false;
2742     }
2743   } else {
2744     long mask = size - 1;
2745     // Load/store pair instruction only supports element size aligned offset.
2746     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2747   }
2748 }
2749 
2750 // Checks whether current and previous loads/stores can be merged.
2751 // Returns true if it can be merged, else false.
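     // For example (illustrative), a mergeable pair such as
     //   ldr x1, [x0, #8]
     //   ldr x2, [x0, #16]
     // makes ldst_can_merge() return true, and merge_ldst() below rewrites it as
     //   ldp x1, x2, [x0, #8]
     // (provided the AvoidUnalignedAccesses check does not rule the pair out).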
2752 bool MacroAssembler::ldst_can_merge(Register rt,
2753                                     const Address &adr,
2754                                     size_t cur_size_in_bytes,
2755                                     bool is_store) const {
2756   address prev = pc() - NativeInstruction::instruction_size;
2757   address last = code()->last_insn();
2758 
2759   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2760     return false;
2761   }
2762 
2763   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2764     return false;
2765   }
2766 
2767   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2768   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2769 
2770   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2771   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2772 
2773   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2774     return false;
2775   }
2776 
2777   long max_offset = 63 * prev_size_in_bytes;
2778   long min_offset = -64 * prev_size_in_bytes;
2779 
2780   assert(prev_ldst->is_not_pre_post_index(), "merging of pre-index or post-index accesses is not supported.");
2781 
2782   // Only accesses with the same base register can be merged.
2783   if (adr.base() != prev_ldst->base()) {
2784     return false;
2785   }
2786 
2787   long cur_offset = adr.offset();
2788   long prev_offset = prev_ldst->offset();
2789   size_t diff = abs(cur_offset - prev_offset);
2790   if (diff != prev_size_in_bytes) {
2791     return false;
2792   }
2793 
2794   // Following cases can not be merged:
2795   // ldr x2, [x2, #8]
2796   // ldr x3, [x2, #16]
2797   // or:
2798   // ldr x2, [x3, #8]
2799   // ldr x2, [x3, #16]
2800   // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2801   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2802     return false;
2803   }
2804 
2805   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2806   // Offset range must be in ldp/stp instruction's range.
2807   if (low_offset > max_offset || low_offset < min_offset) {
2808     return false;
2809   }
2810 
2811   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2812     return true;
2813   }
2814 
2815   return false;
2816 }
2817 
2818 // Merge current load/store with previous load/store into ldp/stp.
2819 void MacroAssembler::merge_ldst(Register rt,
2820                                 const Address &adr,
2821                                 size_t cur_size_in_bytes,
2822                                 bool is_store) {
2823 
2824   assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged.");
2825 
2826   Register rt_low, rt_high;
2827   address prev = pc() - NativeInstruction::instruction_size;
2828   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2829 
2830   long offset;
2831 
2832   if (adr.offset() < prev_ldst->offset()) {
2833     offset = adr.offset();
2834     rt_low = rt;
2835     rt_high = prev_ldst->target();
2836   } else {
2837     offset = prev_ldst->offset();
2838     rt_low = prev_ldst->target();
2839     rt_high = rt;
2840   }
2841 
2842   Address adr_p = Address(prev_ldst->base(), offset);
2843   // Overwrite the previously generated instruction.
2844   code_section()->set_end(prev);
2845 
2846   const int sz = prev_ldst->size_in_bytes();
2847   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
2848   if (!is_store) {
2849     BLOCK_COMMENT("merged ldr pair");
2850     if (sz == 8) {
2851       ldp(rt_low, rt_high, adr_p);
2852     } else {
2853       ldpw(rt_low, rt_high, adr_p);
2854     }
2855   } else {
2856     BLOCK_COMMENT("merged str pair");
2857     if (sz == 8) {
2858       stp(rt_low, rt_high, adr_p);
2859     } else {
2860       stpw(rt_low, rt_high, adr_p);
2861     }
2862   }
2863 }
2864 
2865 /**
2866  * Multiply 64 bit by 64 bit first loop.
2867  */
2868 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2869                                            Register y, Register y_idx, Register z,
2870                                            Register carry, Register product,
2871                                            Register idx, Register kdx) {
2872   //
2873   //  jlong carry, x[], y[], z[];
2874   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2875   //    huge_128 product = y[idx] * x[xstart] + carry;
2876   //    z[kdx] = (jlong)product;
2877   //    carry  = (jlong)(product >>> 64);
2878   //  }
2879   //  z[xstart] = carry;
2880   //
2881 
2882   Label L_first_loop, L_first_loop_exit;
2883   Label L_one_x, L_one_y, L_multiply;
2884 
2885   subsw(xstart, xstart, 1);
2886   br(Assembler::MI, L_one_x);
2887 
2888   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2889   ldr(x_xstart, Address(rscratch1));
2890   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2891 
2892   bind(L_first_loop);
2893   subsw(idx, idx, 1);
2894   br(Assembler::MI, L_first_loop_exit);
2895   subsw(idx, idx, 1);
2896   br(Assembler::MI, L_one_y);
2897   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2898   ldr(y_idx, Address(rscratch1));
2899   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2900   bind(L_multiply);
2901 
2902   // AArch64 has a multiply-accumulate instruction that we can't use
2903   // here because it has no way to process carries, so we have to use
2904   // separate add and adc instructions.  Bah.
2905   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2906   mul(product, x_xstart, y_idx);
2907   adds(product, product, carry);
2908   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2909 
2910   subw(kdx, kdx, 2);
2911   ror(product, product, 32); // back to big-endian
2912   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2913 
2914   b(L_first_loop);
2915 
2916   bind(L_one_y);
2917   ldrw(y_idx, Address(y,  0));
2918   b(L_multiply);
2919 
2920   bind(L_one_x);
2921   ldrw(x_xstart, Address(x,  0));
2922   b(L_first_loop);
2923 
2924   bind(L_first_loop_exit);
2925 }
2926 
2927 /**
2928  * Multiply 128 bit by 128 bit. Unrolled inner loop.
2929  *
2930  */
2931 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2932                                              Register carry, Register carry2,
2933                                              Register idx, Register jdx,
2934                                              Register yz_idx1, Register yz_idx2,
2935                                              Register tmp, Register tmp3, Register tmp4,
2936                                              Register tmp6, Register product_hi) {
2937 
2938   //   jlong carry, x[], y[], z[];
2939   //   int kdx = ystart+1;
2940   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2941   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2942   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2943   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2944   //     carry  = (jlong)(tmp4 >>> 64);
2945   //     z[kdx+idx+1] = (jlong)tmp3;
2946   //     z[kdx+idx] = (jlong)tmp4;
2947   //   }
2948   //   idx += 2;
2949   //   if (idx > 0) {
2950   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2951   //     z[kdx+idx] = (jlong)yz_idx1;
2952   //     carry  = (jlong)(yz_idx1 >>> 64);
2953   //   }
2954   //
2955 
2956   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2957 
2958   lsrw(jdx, idx, 2);
2959 
2960   bind(L_third_loop);
2961 
2962   subsw(jdx, jdx, 1);
2963   br(Assembler::MI, L_third_loop_exit);
2964   subw(idx, idx, 4);
2965 
2966   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2967 
2968   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2969 
2970   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2971 
2972   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2973   ror(yz_idx2, yz_idx2, 32);
2974 
2975   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2976 
2977   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2978   umulh(tmp4, product_hi, yz_idx1);
2979 
2980   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2981   ror(rscratch2, rscratch2, 32);
2982 
2983   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2984   umulh(carry2, product_hi, yz_idx2);
2985 
2986   // propagate sum of both multiplications into carry:tmp4:tmp3
2987   adds(tmp3, tmp3, carry);
2988   adc(tmp4, tmp4, zr);
2989   adds(tmp3, tmp3, rscratch1);
2990   adcs(tmp4, tmp4, tmp);
2991   adc(carry, carry2, zr);
2992   adds(tmp4, tmp4, rscratch2);
2993   adc(carry, carry, zr);
2994 
2995   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2996   ror(tmp4, tmp4, 32);
2997   stp(tmp4, tmp3, Address(tmp6, 0));
2998 
2999   b(L_third_loop);
3000   bind (L_third_loop_exit);
3001 
3002   andw (idx, idx, 0x3);
3003   cbz(idx, L_post_third_loop_done);
3004 
3005   Label L_check_1;
3006   subsw(idx, idx, 2);
3007   br(Assembler::MI, L_check_1);
3008 
3009   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3010   ldr(yz_idx1, Address(rscratch1, 0));
3011   ror(yz_idx1, yz_idx1, 32);
3012   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
3013   umulh(tmp4, product_hi, yz_idx1);
3014   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3015   ldr(yz_idx2, Address(rscratch1, 0));
3016   ror(yz_idx2, yz_idx2, 32);
3017 
3018   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
3019 
3020   ror(tmp3, tmp3, 32);
3021   str(tmp3, Address(rscratch1, 0));
3022 
3023   bind (L_check_1);
3024 
3025   andw (idx, idx, 0x1);
3026   subsw(idx, idx, 1);
3027   br(Assembler::MI, L_post_third_loop_done);
3028   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3029   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
3030   umulh(carry2, tmp4, product_hi);
3031   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3032 
3033   add2_with_carry(carry2, tmp3, tmp4, carry);
3034 
3035   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3036   extr(carry, carry2, tmp3, 32);
3037 
3038   bind(L_post_third_loop_done);
3039 }
3040 
3041 /**
3042  * Code for BigInteger::multiplyToLen() intrinsic.
3043  *
3044  * r0: x
3045  * r1: xlen
3046  * r2: y
3047  * r3: ylen
3048  * r4:  z
3049  * r5: zlen
3050  * r10: tmp1
3051  * r11: tmp2
3052  * r12: tmp3
3053  * r13: tmp4
3054  * r14: tmp5
3055  * r15: tmp6
3056  * r16: tmp7
3057  *
3058  */
3059 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3060                                      Register z, Register zlen,
3061                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3062                                      Register tmp5, Register tmp6, Register product_hi) {
3063 
3064   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3065 
3066   const Register idx = tmp1;
3067   const Register kdx = tmp2;
3068   const Register xstart = tmp3;
3069 
3070   const Register y_idx = tmp4;
3071   const Register carry = tmp5;
3072   const Register product  = xlen;
3073   const Register x_xstart = zlen;  // reuse register
3074 
3075   // First Loop.
3076   //
3077   //  final static long LONG_MASK = 0xffffffffL;
3078   //  int xstart = xlen - 1;
3079   //  int ystart = ylen - 1;
3080   //  long carry = 0;
3081   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3082   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3083   //    z[kdx] = (int)product;
3084   //    carry = product >>> 32;
3085   //  }
3086   //  z[xstart] = (int)carry;
3087   //
3088 
3089   movw(idx, ylen);      // idx = ylen;
3090   movw(kdx, zlen);      // kdx = xlen+ylen;
3091   mov(carry, zr);       // carry = 0;
3092 
3093   Label L_done;
3094 
3095   movw(xstart, xlen);
3096   subsw(xstart, xstart, 1);
3097   br(Assembler::MI, L_done);
3098 
3099   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3100 
3101   Label L_second_loop;
3102   cbzw(kdx, L_second_loop);
3103 
3104   Label L_carry;
3105   subw(kdx, kdx, 1);
3106   cbzw(kdx, L_carry);
3107 
3108   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3109   lsr(carry, carry, 32);
3110   subw(kdx, kdx, 1);
3111 
3112   bind(L_carry);
3113   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3114 
3115   // Second and third (nested) loops.
3116   //
3117   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3118   //   carry = 0;
3119   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3120   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3121   //                    (z[k] & LONG_MASK) + carry;
3122   //     z[k] = (int)product;
3123   //     carry = product >>> 32;
3124   //   }
3125   //   z[i] = (int)carry;
3126   // }
3127   //
3128   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3129 
3130   const Register jdx = tmp1;
3131 
3132   bind(L_second_loop);
3133   mov(carry, zr);                // carry = 0;
3134   movw(jdx, ylen);               // j = ystart+1
3135 
3136   subsw(xstart, xstart, 1);      // i = xstart-1;
3137   br(Assembler::MI, L_done);
3138 
3139   str(z, Address(pre(sp, -4 * wordSize)));
3140 
3141   Label L_last_x;
3142   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3143   subsw(xstart, xstart, 1);       // i = xstart-1;
3144   br(Assembler::MI, L_last_x);
3145 
3146   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3147   ldr(product_hi, Address(rscratch1));
3148   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3149 
3150   Label L_third_loop_prologue;
3151   bind(L_third_loop_prologue);
3152 
3153   str(ylen, Address(sp, wordSize));
3154   stp(x, xstart, Address(sp, 2 * wordSize));
3155   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3156                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3157   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3158   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3159 
3160   addw(tmp3, xlen, 1);
3161   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3162   subsw(tmp3, tmp3, 1);
3163   br(Assembler::MI, L_done);
3164 
3165   lsr(carry, carry, 32);
3166   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3167   b(L_second_loop);
3168 
3169   // The following infrequently executed code is moved outside the loops.
3170   bind(L_last_x);
3171   ldrw(product_hi, Address(x,  0));
3172   b(L_third_loop_prologue);
3173 
3174   bind(L_done);
3175 }
3176 
3177 // Code for BigInteger::mulAdd intrinsic
3178 // out     = r0
3179 // in      = r1
3180 // offset  = r2  (already out.length-offset)
3181 // len     = r3
3182 // k       = r4
3183 //
3184 // pseudo code from java implementation:
3185 // carry = 0;
3186 // offset = out.length-offset - 1;
3187 // for (int j=len-1; j >= 0; j--) {
3188 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3189 //     out[offset--] = (int)product;
3190 //     carry = product >>> 32;
3191 // }
3192 // return (int)carry;
3193 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3194       Register len, Register k) {
3195     Label LOOP, END;
3196     // pre-loop
3197     cmp(len, zr); // cmp, not cbz/cbnz: the condition is used twice => fewer branches
3198     csel(out, zr, out, Assembler::EQ);
3199     br(Assembler::EQ, END);
3200     add(in, in, len, LSL, 2); // in[j+1] address
3201     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3202     mov(out, zr); // used to keep carry now
3203     BIND(LOOP);
3204     ldrw(rscratch1, Address(pre(in, -4)));
3205     madd(rscratch1, rscratch1, k, out);
3206     ldrw(rscratch2, Address(pre(offset, -4)));
3207     add(rscratch1, rscratch1, rscratch2);
3208     strw(rscratch1, Address(offset));
3209     lsr(out, rscratch1, 32);
3210     subs(len, len, 1);
3211     br(Assembler::NE, LOOP);
3212     BIND(END);
3213 }
3214 
3215 /**
3216  * Emits code to update CRC-32 with a byte value according to constants in table
3217  *
3218  * @param [in,out]crc   Register containing the crc.
3219  * @param [in]val       Register containing the byte to fold into the CRC.
3220  * @param [in]table     Register containing the table of crc constants.
3221  *
3222  * uint32_t crc;
3223  * val = crc_table[(val ^ crc) & 0xFF];
3224  * crc = val ^ (crc >> 8);
3225  *
3226  */
3227 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3228   eor(val, val, crc);
3229   andr(val, val, 0xff);
3230   ldrw(val, Address(table, val, Address::lsl(2)));
3231   eor(crc, val, crc, Assembler::LSR, 8);
3232 }
3233 
3234 /**
3235  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3236  *
3237  * @param [in,out]crc   Register containing the crc.
3238  * @param [in]v         Register containing the 32-bit value to fold into the CRC.
3239  * @param [in]table0    Register containing table 0 of crc constants.
3240  * @param [in]table1    Register containing table 1 of crc constants.
3241  * @param [in]table2    Register containing table 2 of crc constants.
3242  * @param [in]table3    Register containing table 3 of crc constants.
3243  *
3244  * uint32_t crc;
3245  *   v = crc ^ v
3246  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3247  *
3248  */
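     // A C sketch of the same slicing-by-4 step, for reference (illustrative
     // only, not part of the build); table0..table3 are the 256-entry tables
     // set up by kernel_crc32() below.  The "upper" flag merely selects the
     // high half of a 64-bit register before this update is applied.
     //
     //   static inline uint32_t update_word_crc32_ref(uint32_t crc, uint32_t v,
     //                                                const uint32_t* t0, const uint32_t* t1,
     //                                                const uint32_t* t2, const uint32_t* t3) {
     //     v ^= crc;
     //     return t3[v & 0xff] ^ t2[(v >> 8) & 0xff] ^ t1[(v >> 16) & 0xff] ^ t0[v >> 24];
     //   }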
3249 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3250         Register table0, Register table1, Register table2, Register table3,
3251         bool upper) {
3252   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3253   uxtb(tmp, v);
3254   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3255   ubfx(tmp, v, 8, 8);
3256   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3257   eor(crc, crc, tmp);
3258   ubfx(tmp, v, 16, 8);
3259   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3260   eor(crc, crc, tmp);
3261   ubfx(tmp, v, 24, 8);
3262   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3263   eor(crc, crc, tmp);
3264 }
3265 
3266 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3267         Register len, Register tmp0, Register tmp1, Register tmp2,
3268         Register tmp3) {
3269     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3270     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3271 
3272     mvnw(crc, crc);
3273 
3274     subs(len, len, 128);
3275     br(Assembler::GE, CRC_by64_pre);
3276   BIND(CRC_less64);
3277     adds(len, len, 128-32);
3278     br(Assembler::GE, CRC_by32_loop);
3279   BIND(CRC_less32);
3280     adds(len, len, 32-4);
3281     br(Assembler::GE, CRC_by4_loop);
3282     adds(len, len, 4);
3283     br(Assembler::GT, CRC_by1_loop);
3284     b(L_exit);
3285 
3286   BIND(CRC_by32_loop);
3287     ldp(tmp0, tmp1, Address(post(buf, 16)));
3288     subs(len, len, 32);
3289     crc32x(crc, crc, tmp0);
3290     ldr(tmp2, Address(post(buf, 8)));
3291     crc32x(crc, crc, tmp1);
3292     ldr(tmp3, Address(post(buf, 8)));
3293     crc32x(crc, crc, tmp2);
3294     crc32x(crc, crc, tmp3);
3295     br(Assembler::GE, CRC_by32_loop);
3296     cmn(len, 32);
3297     br(Assembler::NE, CRC_less32);
3298     b(L_exit);
3299 
3300   BIND(CRC_by4_loop);
3301     ldrw(tmp0, Address(post(buf, 4)));
3302     subs(len, len, 4);
3303     crc32w(crc, crc, tmp0);
3304     br(Assembler::GE, CRC_by4_loop);
3305     adds(len, len, 4);
3306     br(Assembler::LE, L_exit);
3307   BIND(CRC_by1_loop);
3308     ldrb(tmp0, Address(post(buf, 1)));
3309     subs(len, len, 1);
3310     crc32b(crc, crc, tmp0);
3311     br(Assembler::GT, CRC_by1_loop);
3312     b(L_exit);
3313 
3314   BIND(CRC_by64_pre);
3315     sub(buf, buf, 8);
3316     ldp(tmp0, tmp1, Address(buf, 8));
3317     crc32x(crc, crc, tmp0);
3318     ldr(tmp2, Address(buf, 24));
3319     crc32x(crc, crc, tmp1);
3320     ldr(tmp3, Address(buf, 32));
3321     crc32x(crc, crc, tmp2);
3322     ldr(tmp0, Address(buf, 40));
3323     crc32x(crc, crc, tmp3);
3324     ldr(tmp1, Address(buf, 48));
3325     crc32x(crc, crc, tmp0);
3326     ldr(tmp2, Address(buf, 56));
3327     crc32x(crc, crc, tmp1);
3328     ldr(tmp3, Address(pre(buf, 64)));
3329 
3330     b(CRC_by64_loop);
3331 
3332     align(CodeEntryAlignment);
3333   BIND(CRC_by64_loop);
3334     subs(len, len, 64);
3335     crc32x(crc, crc, tmp2);
3336     ldr(tmp0, Address(buf, 8));
3337     crc32x(crc, crc, tmp3);
3338     ldr(tmp1, Address(buf, 16));
3339     crc32x(crc, crc, tmp0);
3340     ldr(tmp2, Address(buf, 24));
3341     crc32x(crc, crc, tmp1);
3342     ldr(tmp3, Address(buf, 32));
3343     crc32x(crc, crc, tmp2);
3344     ldr(tmp0, Address(buf, 40));
3345     crc32x(crc, crc, tmp3);
3346     ldr(tmp1, Address(buf, 48));
3347     crc32x(crc, crc, tmp0);
3348     ldr(tmp2, Address(buf, 56));
3349     crc32x(crc, crc, tmp1);
3350     ldr(tmp3, Address(pre(buf, 64)));
3351     br(Assembler::GE, CRC_by64_loop);
3352 
3353     // post-loop
3354     crc32x(crc, crc, tmp2);
3355     crc32x(crc, crc, tmp3);
3356 
3357     sub(len, len, 64);
3358     add(buf, buf, 8);
3359     cmn(len, 128);
3360     br(Assembler::NE, CRC_less64);
3361   BIND(L_exit);
3362     mvnw(crc, crc);
3363 }
3364 
3365 /**
3366  * @param crc   register containing existing CRC (32-bit)
3367  * @param buf   register pointing to input byte buffer (byte*)
3368  * @param len   register containing number of bytes
3369  * @param table0..table3  registers that will hold the addresses of the CRC tables
3370  * @param tmp, tmp2, tmp3  scratch registers
3371  */
3372 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3373         Register table0, Register table1, Register table2, Register table3,
3374         Register tmp, Register tmp2, Register tmp3) {
3375   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3376   unsigned long offset;
3377 
3378   if (UseCRC32) {
3379       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3380       return;
3381   }
3382 
3383     mvnw(crc, crc);
3384 
3385     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3386     if (offset) add(table0, table0, offset);
3387     add(table1, table0, 1*256*sizeof(juint));
3388     add(table2, table0, 2*256*sizeof(juint));
3389     add(table3, table0, 3*256*sizeof(juint));
3390 
3391   if (UseNeon) {
3392       cmp(len, (u1)64);
3393       br(Assembler::LT, L_by16);
3394       eor(v16, T16B, v16, v16);
3395 
3396     Label L_fold;
3397 
3398       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3399 
3400       ld1(v0, v1, T2D, post(buf, 32));
3401       ld1r(v4, T2D, post(tmp, 8));
3402       ld1r(v5, T2D, post(tmp, 8));
3403       ld1r(v6, T2D, post(tmp, 8));
3404       ld1r(v7, T2D, post(tmp, 8));
3405       mov(v16, T4S, 0, crc);
3406 
3407       eor(v0, T16B, v0, v16);
3408       sub(len, len, 64);
3409 
3410     BIND(L_fold);
3411       pmull(v22, T8H, v0, v5, T8B);
3412       pmull(v20, T8H, v0, v7, T8B);
3413       pmull(v23, T8H, v0, v4, T8B);
3414       pmull(v21, T8H, v0, v6, T8B);
3415 
3416       pmull2(v18, T8H, v0, v5, T16B);
3417       pmull2(v16, T8H, v0, v7, T16B);
3418       pmull2(v19, T8H, v0, v4, T16B);
3419       pmull2(v17, T8H, v0, v6, T16B);
3420 
3421       uzp1(v24, T8H, v20, v22);
3422       uzp2(v25, T8H, v20, v22);
3423       eor(v20, T16B, v24, v25);
3424 
3425       uzp1(v26, T8H, v16, v18);
3426       uzp2(v27, T8H, v16, v18);
3427       eor(v16, T16B, v26, v27);
3428 
3429       ushll2(v22, T4S, v20, T8H, 8);
3430       ushll(v20, T4S, v20, T4H, 8);
3431 
3432       ushll2(v18, T4S, v16, T8H, 8);
3433       ushll(v16, T4S, v16, T4H, 8);
3434 
3435       eor(v22, T16B, v23, v22);
3436       eor(v18, T16B, v19, v18);
3437       eor(v20, T16B, v21, v20);
3438       eor(v16, T16B, v17, v16);
3439 
3440       uzp1(v17, T2D, v16, v20);
3441       uzp2(v21, T2D, v16, v20);
3442       eor(v17, T16B, v17, v21);
3443 
3444       ushll2(v20, T2D, v17, T4S, 16);
3445       ushll(v16, T2D, v17, T2S, 16);
3446 
3447       eor(v20, T16B, v20, v22);
3448       eor(v16, T16B, v16, v18);
3449 
3450       uzp1(v17, T2D, v20, v16);
3451       uzp2(v21, T2D, v20, v16);
3452       eor(v28, T16B, v17, v21);
3453 
3454       pmull(v22, T8H, v1, v5, T8B);
3455       pmull(v20, T8H, v1, v7, T8B);
3456       pmull(v23, T8H, v1, v4, T8B);
3457       pmull(v21, T8H, v1, v6, T8B);
3458 
3459       pmull2(v18, T8H, v1, v5, T16B);
3460       pmull2(v16, T8H, v1, v7, T16B);
3461       pmull2(v19, T8H, v1, v4, T16B);
3462       pmull2(v17, T8H, v1, v6, T16B);
3463 
3464       ld1(v0, v1, T2D, post(buf, 32));
3465 
3466       uzp1(v24, T8H, v20, v22);
3467       uzp2(v25, T8H, v20, v22);
3468       eor(v20, T16B, v24, v25);
3469 
3470       uzp1(v26, T8H, v16, v18);
3471       uzp2(v27, T8H, v16, v18);
3472       eor(v16, T16B, v26, v27);
3473 
3474       ushll2(v22, T4S, v20, T8H, 8);
3475       ushll(v20, T4S, v20, T4H, 8);
3476 
3477       ushll2(v18, T4S, v16, T8H, 8);
3478       ushll(v16, T4S, v16, T4H, 8);
3479 
3480       eor(v22, T16B, v23, v22);
3481       eor(v18, T16B, v19, v18);
3482       eor(v20, T16B, v21, v20);
3483       eor(v16, T16B, v17, v16);
3484 
3485       uzp1(v17, T2D, v16, v20);
3486       uzp2(v21, T2D, v16, v20);
3487       eor(v16, T16B, v17, v21);
3488 
3489       ushll2(v20, T2D, v16, T4S, 16);
3490       ushll(v16, T2D, v16, T2S, 16);
3491 
3492       eor(v20, T16B, v22, v20);
3493       eor(v16, T16B, v16, v18);
3494 
3495       uzp1(v17, T2D, v20, v16);
3496       uzp2(v21, T2D, v20, v16);
3497       eor(v20, T16B, v17, v21);
3498 
3499       shl(v16, T2D, v28, 1);
3500       shl(v17, T2D, v20, 1);
3501 
3502       eor(v0, T16B, v0, v16);
3503       eor(v1, T16B, v1, v17);
3504 
3505       subs(len, len, 32);
3506       br(Assembler::GE, L_fold);
3507 
3508       mov(crc, 0);
3509       mov(tmp, v0, T1D, 0);
3510       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3511       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3512       mov(tmp, v0, T1D, 1);
3513       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3514       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3515       mov(tmp, v1, T1D, 0);
3516       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3517       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3518       mov(tmp, v1, T1D, 1);
3519       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3520       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3521 
3522       add(len, len, 32);
3523   }
3524 
3525   BIND(L_by16);
3526     subs(len, len, 16);
3527     br(Assembler::GE, L_by16_loop);
3528     adds(len, len, 16-4);
3529     br(Assembler::GE, L_by4_loop);
3530     adds(len, len, 4);
3531     br(Assembler::GT, L_by1_loop);
3532     b(L_exit);
3533 
3534   BIND(L_by4_loop);
3535     ldrw(tmp, Address(post(buf, 4)));
3536     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3537     subs(len, len, 4);
3538     br(Assembler::GE, L_by4_loop);
3539     adds(len, len, 4);
3540     br(Assembler::LE, L_exit);
3541   BIND(L_by1_loop);
3542     subs(len, len, 1);
3543     ldrb(tmp, Address(post(buf, 1)));
3544     update_byte_crc32(crc, tmp, table0);
3545     br(Assembler::GT, L_by1_loop);
3546     b(L_exit);
3547 
3548     align(CodeEntryAlignment);
3549   BIND(L_by16_loop);
3550     subs(len, len, 16);
3551     ldp(tmp, tmp3, Address(post(buf, 16)));
3552     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3553     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3554     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3555     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3556     br(Assembler::GE, L_by16_loop);
3557     adds(len, len, 16-4);
3558     br(Assembler::GE, L_by4_loop);
3559     adds(len, len, 4);
3560     br(Assembler::GT, L_by1_loop);
3561   BIND(L_exit);
3562     mvnw(crc, crc);
3563 }
3564 
3565 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3566         Register len, Register tmp0, Register tmp1, Register tmp2,
3567         Register tmp3) {
3568     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3569     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3570 
3571     subs(len, len, 128);
3572     br(Assembler::GE, CRC_by64_pre);
3573   BIND(CRC_less64);
3574     adds(len, len, 128-32);
3575     br(Assembler::GE, CRC_by32_loop);
3576   BIND(CRC_less32);
3577     adds(len, len, 32-4);
3578     br(Assembler::GE, CRC_by4_loop);
3579     adds(len, len, 4);
3580     br(Assembler::GT, CRC_by1_loop);
3581     b(L_exit);
3582 
3583   BIND(CRC_by32_loop);
3584     ldp(tmp0, tmp1, Address(post(buf, 16)));
3585     subs(len, len, 32);
3586     crc32cx(crc, crc, tmp0);
3587     ldr(tmp2, Address(post(buf, 8)));
3588     crc32cx(crc, crc, tmp1);
3589     ldr(tmp3, Address(post(buf, 8)));
3590     crc32cx(crc, crc, tmp2);
3591     crc32cx(crc, crc, tmp3);
3592     br(Assembler::GE, CRC_by32_loop);
3593     cmn(len, 32);
3594     br(Assembler::NE, CRC_less32);
3595     b(L_exit);
3596 
3597   BIND(CRC_by4_loop);
3598     ldrw(tmp0, Address(post(buf, 4)));
3599     subs(len, len, 4);
3600     crc32cw(crc, crc, tmp0);
3601     br(Assembler::GE, CRC_by4_loop);
3602     adds(len, len, 4);
3603     br(Assembler::LE, L_exit);
3604   BIND(CRC_by1_loop);
3605     ldrb(tmp0, Address(post(buf, 1)));
3606     subs(len, len, 1);
3607     crc32cb(crc, crc, tmp0);
3608     br(Assembler::GT, CRC_by1_loop);
3609     b(L_exit);
3610 
3611   BIND(CRC_by64_pre);
3612     sub(buf, buf, 8);
3613     ldp(tmp0, tmp1, Address(buf, 8));
3614     crc32cx(crc, crc, tmp0);
3615     ldr(tmp2, Address(buf, 24));
3616     crc32cx(crc, crc, tmp1);
3617     ldr(tmp3, Address(buf, 32));
3618     crc32cx(crc, crc, tmp2);
3619     ldr(tmp0, Address(buf, 40));
3620     crc32cx(crc, crc, tmp3);
3621     ldr(tmp1, Address(buf, 48));
3622     crc32cx(crc, crc, tmp0);
3623     ldr(tmp2, Address(buf, 56));
3624     crc32cx(crc, crc, tmp1);
3625     ldr(tmp3, Address(pre(buf, 64)));
3626 
3627     b(CRC_by64_loop);
3628 
3629     align(CodeEntryAlignment);
3630   BIND(CRC_by64_loop);
3631     subs(len, len, 64);
3632     crc32cx(crc, crc, tmp2);
3633     ldr(tmp0, Address(buf, 8));
3634     crc32cx(crc, crc, tmp3);
3635     ldr(tmp1, Address(buf, 16));
3636     crc32cx(crc, crc, tmp0);
3637     ldr(tmp2, Address(buf, 24));
3638     crc32cx(crc, crc, tmp1);
3639     ldr(tmp3, Address(buf, 32));
3640     crc32cx(crc, crc, tmp2);
3641     ldr(tmp0, Address(buf, 40));
3642     crc32cx(crc, crc, tmp3);
3643     ldr(tmp1, Address(buf, 48));
3644     crc32cx(crc, crc, tmp0);
3645     ldr(tmp2, Address(buf, 56));
3646     crc32cx(crc, crc, tmp1);
3647     ldr(tmp3, Address(pre(buf, 64)));
3648     br(Assembler::GE, CRC_by64_loop);
3649 
3650     // post-loop
3651     crc32cx(crc, crc, tmp2);
3652     crc32cx(crc, crc, tmp3);
3653 
3654     sub(len, len, 64);
3655     add(buf, buf, 8);
3656     cmn(len, 128);
3657     br(Assembler::NE, CRC_less64);
3658   BIND(L_exit);
3659 }
3660 
3661 /**
3662  * @param crc   register containing existing CRC (32-bit)
3663  * @param buf   register pointing to input byte buffer (byte*)
3664  * @param len   register containing number of bytes
3665  * @param table0..table3  registers that will hold the addresses of the CRC tables
3666  * @param tmp, tmp2, tmp3  scratch registers
3667  */
3668 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3669         Register table0, Register table1, Register table2, Register table3,
3670         Register tmp, Register tmp2, Register tmp3) {
3671   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3672 }
3673 
3674 
3675 SkipIfEqual::SkipIfEqual(
3676     MacroAssembler* masm, const bool* flag_addr, bool value) {
3677   _masm = masm;
3678   unsigned long offset;
3679   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3680   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3681   _masm->cbzw(rscratch1, _label);
3682 }
3683 
3684 SkipIfEqual::~SkipIfEqual() {
3685   _masm->bind(_label);
3686 }
3687 
3688 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3689   Address adr;
3690   switch(dst.getMode()) {
3691   case Address::base_plus_offset:
3692     // This is the expected mode, although we allow all the other
3693     // forms below.
3694     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3695     break;
3696   default:
3697     lea(rscratch2, dst);
3698     adr = Address(rscratch2);
3699     break;
3700   }
3701   ldr(rscratch1, adr);
3702   add(rscratch1, rscratch1, src);
3703   str(rscratch1, adr);
3704 }
3705 
3706 void MacroAssembler::cmpptr(Register src1, Address src2) {
3707   unsigned long offset;
3708   adrp(rscratch1, src2, offset);
3709   ldr(rscratch1, Address(rscratch1, offset));
3710   cmp(src1, rscratch1);
3711 }
3712 
3713 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3714   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3715   bs->obj_equals(this, obj1, obj2);
3716 }
3717 
3718 void MacroAssembler::load_klass(Register dst, Register src) {
3719   if (UseCompressedClassPointers) {
3720     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3721     decode_klass_not_null(dst);
3722   } else {
3723     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3724   }
3725 }
3726 
3727 // ((OopHandle)result).resolve();
3728 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3729   // OopHandle::resolve is an indirection.
3730   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3731 }
3732 
3733 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3734   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3735   ldr(dst, Address(rmethod, Method::const_offset()));
3736   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3737   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3738   ldr(dst, Address(dst, mirror_offset));
3739   resolve_oop_handle(dst, tmp);
3740 }
3741 
3742 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3743   if (UseCompressedClassPointers) {
3744     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3745     if (Universe::narrow_klass_base() == NULL) {
3746       cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
3747       return;
3748     } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3749                && Universe::narrow_klass_shift() == 0) {
3750       // Only the bottom 32 bits matter
3751       cmpw(trial_klass, tmp);
3752       return;
3753     }
3754     decode_klass_not_null(tmp);
3755   } else {
3756     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3757   }
3758   cmp(trial_klass, tmp);
3759 }
3760 
3761 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3762   load_klass(dst, src);
3763   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3764 }
3765 
3766 void MacroAssembler::store_klass(Register dst, Register src) {
3767   // FIXME: Should this be a store release?  Concurrent GCs assume the
3768   // klass length is valid if the klass field is not null.
3769   if (UseCompressedClassPointers) {
3770     encode_klass_not_null(src);
3771     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3772   } else {
3773     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3774   }
3775 }
3776 
3777 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3778   if (UseCompressedClassPointers) {
3779     // Store to klass gap in destination
3780     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3781   }
3782 }
3783 
3784 // Algorithm must match CompressedOops::encode.
3785 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3786 #ifdef ASSERT
3787   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3788 #endif
3789   verify_oop(s, "broken oop in encode_heap_oop");
3790   if (Universe::narrow_oop_base() == NULL) {
3791     if (Universe::narrow_oop_shift() != 0) {
3792       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3793       lsr(d, s, LogMinObjAlignmentInBytes);
3794     } else {
3795       mov(d, s);
3796     }
3797   } else {
3798     subs(d, s, rheapbase);
3799     csel(d, d, zr, Assembler::HS);
3800     lsr(d, d, LogMinObjAlignmentInBytes);
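         // Branchless equivalent of this C sketch (illustrative only):
         //   narrow = (oop < heap_base) ? 0 : (uint32_t)((oop - heap_base) >> shift);
         // in practice only a NULL oop lies below the heap base.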
3801 
3802     /*  Old algorithm: is this any worse?
3803     Label nonnull;
3804     cbnz(r, nonnull);
3805     sub(r, r, rheapbase);
3806     bind(nonnull);
3807     lsr(r, r, LogMinObjAlignmentInBytes);
3808     */
3809   }
3810 }
3811 
3812 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3813 #ifdef ASSERT
3814   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3815   if (CheckCompressedOops) {
3816     Label ok;
3817     cbnz(r, ok);
3818     stop("null oop passed to encode_heap_oop_not_null");
3819     bind(ok);
3820   }
3821 #endif
3822   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3823   if (Universe::narrow_oop_base() != NULL) {
3824     sub(r, r, rheapbase);
3825   }
3826   if (Universe::narrow_oop_shift() != 0) {
3827     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3828     lsr(r, r, LogMinObjAlignmentInBytes);
3829   }
3830 }
3831 
3832 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3833 #ifdef ASSERT
3834   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3835   if (CheckCompressedOops) {
3836     Label ok;
3837     cbnz(src, ok);
3838     stop("null oop passed to encode_heap_oop_not_null2");
3839     bind(ok);
3840   }
3841 #endif
3842   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3843 
3844   Register data = src;
3845   if (Universe::narrow_oop_base() != NULL) {
3846     sub(dst, src, rheapbase);
3847     data = dst;
3848   }
3849   if (Universe::narrow_oop_shift() != 0) {
3850     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3851     lsr(dst, data, LogMinObjAlignmentInBytes);
3852     data = dst;
3853   }
3854   if (data == src)
3855     mov(dst, src);
3856 }
3857 
3858 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3859 #ifdef ASSERT
3860   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3861 #endif
3862   if (Universe::narrow_oop_base() == NULL) {
3863     if (Universe::narrow_oop_shift() != 0 || d != s) {
3864       lsl(d, s, Universe::narrow_oop_shift());
3865     }
3866   } else {
3867     Label done;
3868     if (d != s)
3869       mov(d, s);
3870     cbz(s, done);
3871     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3872     bind(done);
3873   }
3874   verify_oop(d, "broken oop in decode_heap_oop");
3875 }
3876 
3877 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3878   assert (UseCompressedOops, "should only be used for compressed headers");
3879   assert (Universe::heap() != NULL, "java heap should be initialized");
3880   // Cannot assert, unverified entry point counts instructions (see .ad file)
3881   // vtableStubs also counts instructions in pd_code_size_limit.
3882   // Also do not verify_oop as this is called by verify_oop.
3883   if (Universe::narrow_oop_shift() != 0) {
3884     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3885     if (Universe::narrow_oop_base() != NULL) {
3886       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3887     } else {
3888       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3889     }
3890   } else {
3891     assert (Universe::narrow_oop_base() == NULL, "sanity");
3892   }
3893 }
3894 
3895 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3896   assert (UseCompressedOops, "should only be used for compressed headers");
3897   assert (Universe::heap() != NULL, "java heap should be initialized");
3898   // Cannot assert, unverified entry point counts instructions (see .ad file)
3899   // vtableStubs also counts instructions in pd_code_size_limit.
3900   // Also do not verify_oop as this is called by verify_oop.
3901   if (Universe::narrow_oop_shift() != 0) {
3902     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3903     if (Universe::narrow_oop_base() != NULL) {
3904       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3905     } else {
3906       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3907     }
3908   } else {
3909     assert (Universe::narrow_oop_base() == NULL, "sanity");
3910     if (dst != src) {
3911       mov(dst, src);
3912     }
3913   }
3914 }
3915 
3916 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3917   if (Universe::narrow_klass_base() == NULL) {
3918     if (Universe::narrow_klass_shift() != 0) {
3919       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3920       lsr(dst, src, LogKlassAlignmentInBytes);
3921     } else {
3922       if (dst != src) mov(dst, src);
3923     }
3924     return;
3925   }
3926 
3927   if (use_XOR_for_compressed_class_base) {
3928     if (Universe::narrow_klass_shift() != 0) {
3929       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3930       lsr(dst, dst, LogKlassAlignmentInBytes);
3931     } else {
3932       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3933     }
3934     return;
3935   }
3936 
3937   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3938       && Universe::narrow_klass_shift() == 0) {
3939     movw(dst, src);
3940     return;
3941   }
3942 
3943 #ifdef ASSERT
3944   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3945 #endif
3946 
3947   Register rbase = dst;
3948   if (dst == src) rbase = rheapbase;
3949   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3950   sub(dst, src, rbase);
3951   if (Universe::narrow_klass_shift() != 0) {
3952     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3953     lsr(dst, dst, LogKlassAlignmentInBytes);
3954   }
3955   if (dst == src) reinit_heapbase();
3956 }
3957 
3958 void MacroAssembler::encode_klass_not_null(Register r) {
3959   encode_klass_not_null(r, r);
3960 }
3961 
3962 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3963   Register rbase = dst;
3964   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3965 
3966   if (Universe::narrow_klass_base() == NULL) {
3967     if (Universe::narrow_klass_shift() != 0) {
3968       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3969       lsl(dst, src, LogKlassAlignmentInBytes);
3970     } else {
3971       if (dst != src) mov(dst, src);
3972     }
3973     return;
3974   }
3975 
3976   if (use_XOR_for_compressed_class_base) {
3977     if (Universe::narrow_klass_shift() != 0) {
3978       lsl(dst, src, LogKlassAlignmentInBytes);
3979       eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
3980     } else {
3981       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3982     }
3983     return;
3984   }
3985 
3986   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3987       && Universe::narrow_klass_shift() == 0) {
3988     if (dst != src)
3989       movw(dst, src);
3990     movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
3991     return;
3992   }
3993 
3994   // Cannot assert, unverified entry point counts instructions (see .ad file)
3995   // vtableStubs also counts instructions in pd_code_size_limit.
3996   // Also do not verify_oop as this is called by verify_oop.
3997   if (dst == src) rbase = rheapbase;
3998   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3999   if (Universe::narrow_klass_shift() != 0) {
4000     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
4001     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
4002   } else {
4003     add(dst, rbase, src);
4004   }
4005   if (dst == src) reinit_heapbase();
4006 }
4007 
4008 void  MacroAssembler::decode_klass_not_null(Register r) {
4009   decode_klass_not_null(r, r);
4010 }
4011 
4012 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4013 #ifdef ASSERT
4014   {
4015     ThreadInVMfromUnknown tiv;
4016     assert (UseCompressedOops, "should only be used for compressed oops");
4017     assert (Universe::heap() != NULL, "java heap should be initialized");
4018     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4019     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
4020   }
4021 #endif
4022   int oop_index = oop_recorder()->find_index(obj);
4023   InstructionMark im(this);
4024   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4025   code_section()->relocate(inst_mark(), rspec);
4026   movz(dst, 0xDEAD, 16);
4027   movk(dst, 0xBEEF);
4028 }
4029 
4030 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4031   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4032   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4033   int index = oop_recorder()->find_index(k);
4034   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
4035 
4036   InstructionMark im(this);
4037   RelocationHolder rspec = metadata_Relocation::spec(index);
4038   code_section()->relocate(inst_mark(), rspec);
4039   narrowKlass nk = Klass::encode_klass(k);
4040   movz(dst, (nk >> 16), 16);
4041   movk(dst, nk & 0xffff);
4042 }
4043 
4044 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4045                                     Register dst, Address src,
4046                                     Register tmp1, Register thread_tmp) {
4047   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4048   decorators = AccessInternal::decorator_fixup(decorators);
4049   bool as_raw = (decorators & AS_RAW) != 0;
4050   if (as_raw) {
4051     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4052   } else {
4053     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4054   }
4055 }
4056 
4057 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4058                                      Address dst, Register src,
4059                                      Register tmp1, Register thread_tmp) {
4060   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4061   decorators = AccessInternal::decorator_fixup(decorators);
4062   bool as_raw = (decorators & AS_RAW) != 0;
4063   if (as_raw) {
4064     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4065   } else {
4066     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4067   }
4068 }
4069 
4070 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
4071   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
4072   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
4073     decorators |= ACCESS_READ | ACCESS_WRITE;
4074   }
4075   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4076   return bs->resolve(this, decorators, obj);
4077 }
4078 
4079 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4080                                    Register thread_tmp, DecoratorSet decorators) {
4081   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4082 }
4083 
4084 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4085                                             Register thread_tmp, DecoratorSet decorators) {
4086   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4087 }
4088 
4089 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4090                                     Register thread_tmp, DecoratorSet decorators) {
4091   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4092 }
4093 
4094 // Used for storing NULLs.
4095 void MacroAssembler::store_heap_oop_null(Address dst) {
4096   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4097 }
4098 
4099 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4100   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4101   int index = oop_recorder()->allocate_metadata_index(obj);
4102   RelocationHolder rspec = metadata_Relocation::spec(index);
4103   return Address((address)obj, rspec);
4104 }
4105 
4106 // Move an oop into a register.  immediate is true if we want
4107 // immediate instructions, i.e. we are not going to patch this
4108 // instruction while the code is being executed by another thread.  In
4109 // that case we can use move immediates rather than the constant pool.
4110 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4111   int oop_index;
4112   if (obj == NULL) {
4113     oop_index = oop_recorder()->allocate_oop_index(obj);
4114   } else {
4115 #ifdef ASSERT
4116     {
4117       ThreadInVMfromUnknown tiv;
4118       assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
4119     }
4120 #endif
4121     oop_index = oop_recorder()->find_index(obj);
4122   }
4123   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4124   if (! immediate) {
4125     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4126     ldr_constant(dst, Address(dummy, rspec));
4127   } else
4128     mov(dst, Address((address)obj, rspec));
4129 }
4130 
4131 // Move a metadata address into a register.
4132 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4133   int oop_index;
4134   if (obj == NULL) {
4135     oop_index = oop_recorder()->allocate_metadata_index(obj);
4136   } else {
4137     oop_index = oop_recorder()->find_index(obj);
4138   }
4139   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4140   mov(dst, Address((address)obj, rspec));
4141 }
4142 
4143 Address MacroAssembler::constant_oop_address(jobject obj) {
4144 #ifdef ASSERT
4145   {
4146     ThreadInVMfromUnknown tiv;
4147     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4148     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
4149   }
4150 #endif
4151   int oop_index = oop_recorder()->find_index(obj);
4152   return Address((address)obj, oop_Relocation::spec(oop_index));
4153 }
4154 
4155 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4156 void MacroAssembler::tlab_allocate(Register obj,
4157                                    Register var_size_in_bytes,
4158                                    int con_size_in_bytes,
4159                                    Register t1,
4160                                    Register t2,
4161                                    Label& slow_case) {
4162   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4163   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4164 }
4165 
4166 // Defines obj, preserves var_size_in_bytes
4167 void MacroAssembler::eden_allocate(Register obj,
4168                                    Register var_size_in_bytes,
4169                                    int con_size_in_bytes,
4170                                    Register t1,
4171                                    Label& slow_case) {
4172   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4173   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4174 }
4175 
4176 // Zero words; len is in bytes
4177 // Destroys all registers except addr
4178 // len must be a nonzero multiple of wordSize
4179 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4180   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4181 
4182 #ifdef ASSERT
4183   { Label L;
4184     tst(len, BytesPerWord - 1);
4185     br(Assembler::EQ, L);
4186     stop("len is not a multiple of BytesPerWord");
4187     bind(L);
4188   }
4189 #endif
4190 
4191 #ifndef PRODUCT
4192   block_comment("zero memory");
4193 #endif
4194 
4195   Label loop;
4196   Label entry;
4197 
4198 //  Algorithm:
4199 //
4200 //    scratch1 = cnt & 7;
4201 //    cnt -= scratch1;
4202 //    p += scratch1;
4203 //    switch (scratch1) {
4204 //      do {
4205 //        cnt -= 8;
4206 //          p[-8] = 0;
4207 //        case 7:
4208 //          p[-7] = 0;
4209 //        case 6:
4210 //          p[-6] = 0;
4211 //          // ...
4212 //        case 1:
4213 //          p[-1] = 0;
4214 //        case 0:
4215 //          p += 8;
4216 //      } while (cnt);
4217 //    }
4218 
4219   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4220 
4221   lsr(len, len, LogBytesPerWord);
4222   andr(rscratch1, len, unroll - 1);  // tmp1 = cnt % unroll
4223   sub(len, len, rscratch1);      // cnt -= unroll
4224   // t1 always points to the end of the region we're about to zero
4225   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
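       // Computed branch into the tail of the unrolled store block below: each
       // str is a 4-byte instruction, so jumping to entry - rscratch1 * 4
       // executes exactly (len % unroll) stores on the first pass, as in the
       // algorithm sketch above.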
4226   adr(rscratch2, entry);
4227   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4228   br(rscratch2);
4229   bind(loop);
4230   sub(len, len, unroll);
4231   for (int i = -unroll; i < 0; i++)
4232     Assembler::str(zr, Address(t1, i * wordSize));
4233   bind(entry);
4234   add(t1, t1, unroll * wordSize);
4235   cbnz(len, loop);
4236 }
4237 
4238 void MacroAssembler::verify_tlab() {
4239 #ifdef ASSERT
4240   if (UseTLAB && VerifyOops) {
4241     Label next, ok;
4242 
4243     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4244 
4245     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4246     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4247     cmp(rscratch2, rscratch1);
4248     br(Assembler::HS, next);
4249     STOP("assert(top >= start)");
4250     should_not_reach_here();
4251 
4252     bind(next);
4253     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4254     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4255     cmp(rscratch2, rscratch1);
4256     br(Assembler::HS, ok);
4257     STOP("assert(top <= end)");
4258     should_not_reach_here();
4259 
4260     bind(ok);
4261     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4262   }
4263 #endif
4264 }
4265 
4266 // Writes to successive stack pages until the given offset is reached, to check
4267 // for stack overflow plus the shadow pages.  This clobbers tmp.
4268 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4269   assert_different_registers(tmp, size, rscratch1);
4270   mov(tmp, sp);
4271   // Bang stack for total size given plus shadow page size.
4272   // Bang one page at a time because large size can bang beyond yellow and
4273   // red zones.
4274   Label loop;
4275   mov(rscratch1, os::vm_page_size());
4276   bind(loop);
4277   lea(tmp, Address(tmp, -os::vm_page_size()));
4278   subsw(size, size, rscratch1);
4279   str(size, Address(tmp));
4280   br(Assembler::GT, loop);
4281 
4282   // Bang down shadow pages too.
4283   // At this point, (tmp-0) is the last address touched, so don't
4284   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4285   // was post-decremented.)  Skip this address by starting at i=1, and
4286   // touch a few more pages below.  N.B.  It is important to touch all
4287   // the way down to and including i=StackShadowPages.
4288   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
4289     // This could be any sized move, but since it can serve as a debugging
4290     // crumb, the bigger the better.
4291     lea(tmp, Address(tmp, -os::vm_page_size()));
4292     str(size, Address(tmp));
4293   }
4294 }
4295 
4296 
4297 // Move the address of the polling page into dest.
4298 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4299   if (SafepointMechanism::uses_thread_local_poll()) {
4300     ldr(dest, Address(rthread, Thread::polling_page_offset()));
4301   } else {
4302     unsigned long off;
4303     adrp(dest, Address(page, rtype), off);
4304     assert(off == 0, "polling page must be page aligned");
4305   }
4306 }
4307 
4308 // Move the address of the polling page into r, then read the polling
4309 // page.
4310 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4311   get_polling_page(r, page, rtype);
4312   return read_polling_page(r, rtype);
4313 }
4314 
4315 // Read the polling page.  The address of the polling page must
4316 // already be in r.
4317 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4318   InstructionMark im(this);
4319   code_section()->relocate(inst_mark(), rtype);
4320   ldrw(zr, Address(r, 0));
4321   return inst_mark();
4322 }
4323 
4324 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4325   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4326   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4327   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4328   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4329   long offset_low = dest_page - low_page;
4330   long offset_high = dest_page - high_page;
4331 
4332   assert(is_valid_AArch64_address(dest.target()), "bad address");
4333   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4334 
4335   InstructionMark im(this);
4336   code_section()->relocate(inst_mark(), dest.rspec());
4337   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4338   // the code cache so that if it is relocated we know it will still reach
4339   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4340     _adrp(reg1, dest.target());
4341   } else {
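         // adrp_target keeps the low 32 bits of the real target but borrows
         // bits 32..47 from the current pc, so it is always within adrp's
         // +/- 4GB range; the movk below then patches bits 32..47 to the real
         // value (the low 12 bits of the target are returned in byte_offset).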
4342     unsigned long target = (unsigned long)dest.target();
4343     unsigned long adrp_target
4344       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4345 
4346     _adrp(reg1, (address)adrp_target);
4347     movk(reg1, target >> 32, 32);
4348   }
4349   byte_offset = (unsigned long)dest.target() & 0xfff;
4350 }
4351 
4352 void MacroAssembler::load_byte_map_base(Register reg) {
4353   CardTable::CardValue* byte_map_base =
4354     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4355 
4356   if (is_valid_AArch64_address((address)byte_map_base)) {
4357     // Strictly speaking the byte_map_base isn't an address at all,
4358     // and it might even be negative.
4359     unsigned long offset;
4360     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4361     // We expect offset to be zero with most collectors.
4362     if (offset != 0) {
4363       add(reg, reg, offset);
4364     }
4365   } else {
4366     mov(reg, (uint64_t)byte_map_base);
4367   }
4368 }
4369 
4370 void MacroAssembler::build_frame(int framesize) {
4371   assert(framesize > 0, "framesize must be > 0");
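       // Three cases: for a small frame (framesize - 2 * wordSize fits the scaled
       // stp offset) we allocate the whole frame with one sub and store fp/lr at
       // its top; for a medium frame (the remainder fits a 12-bit add/sub
       // immediate) we push fp/lr first and then drop sp; anything larger goes
       // through rscratch1.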
4372   if (framesize < ((1 << 9) + 2 * wordSize)) {
4373     sub(sp, sp, framesize);
4374     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4375     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4376   } else {
4377     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4378     if (PreserveFramePointer) mov(rfp, sp);
4379     if (framesize < ((1 << 12) + 2 * wordSize))
4380       sub(sp, sp, framesize - 2 * wordSize);
4381     else {
4382       mov(rscratch1, framesize - 2 * wordSize);
4383       sub(sp, sp, rscratch1);
4384     }
4385   }
4386 }
4387 
4388 void MacroAssembler::remove_frame(int framesize) {
4389   assert(framesize > 0, "framesize must be > 0");
4390   if (framesize < ((1 << 9) + 2 * wordSize)) {
4391     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4392     add(sp, sp, framesize);
4393   } else {
4394     if (framesize < ((1 << 12) + 2 * wordSize))
4395       add(sp, sp, framesize - 2 * wordSize);
4396     else {
4397       mov(rscratch1, framesize - 2 * wordSize);
4398       add(sp, sp, rscratch1);
4399     }
4400     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4401   }
4402 }
4403 
4404 #ifdef COMPILER2
4405 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4406 
4407 // Search for str1 in str2 and return index or -1
4408 void MacroAssembler::string_indexof(Register str2, Register str1,
4409                                     Register cnt2, Register cnt1,
4410                                     Register tmp1, Register tmp2,
4411                                     Register tmp3, Register tmp4,
4412                                     Register tmp5, Register tmp6,
4413                                     int icnt1, Register result, int ae) {
4414   // NOTE: tmp5, tmp6 can be zr depending on specific method version
4415   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4416 
4417   Register ch1 = rscratch1;
4418   Register ch2 = rscratch2;
4419   Register cnt1tmp = tmp1;
4420   Register cnt2tmp = tmp2;
4421   Register cnt1_neg = cnt1;
4422   Register cnt2_neg = cnt2;
4423   Register result_tmp = tmp4;
4424 
4425   bool isL = ae == StrIntrinsicNode::LL;
4426 
4427   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4428   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4429   int str1_chr_shift = str1_isL ? 0:1;
4430   int str2_chr_shift = str2_isL ? 0:1;
4431   int str1_chr_size = str1_isL ? 1:2;
4432   int str2_chr_size = str2_isL ? 1:2;
4433   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4434                                       (chr_insn)&MacroAssembler::ldrh;
4435   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4436                                       (chr_insn)&MacroAssembler::ldrh;
4437   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4438   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4439 
4440   // Note, inline_string_indexOf() generates checks:
4441   // if (substr.count > string.count) return -1;
4442   // if (substr.count == 0) return 0;
4443 
4444   // We have two strings, a source string in str2, cnt2 and a pattern string
4445 // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
4446 
4447 // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
4448 // With a small pattern and source we use a linear scan.
4449 
4450   if (icnt1 == -1) {
4451     sub(result_tmp, cnt2, cnt1);
4452     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4453     br(LT, LINEARSEARCH);
4454     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
4455     subs(zr, cnt1, 256);
4456     lsr(tmp1, cnt2, 2);
4457     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
4458     br(GE, LINEARSTUB);
4459   }
4460 
4461 // The Boyer-Moore algorithm is based on the description here:
4462 //
4463 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4464 //
4465 // This describes an algorithm with two shift rules: the 'Bad Character' rule
4466 // and the 'Good Suffix' rule.
4467 //
4468 // These rules are essentially heuristics for how far we can shift the
4469 // pattern along the search string.
4470 //
4471 // The implementation here uses the 'Bad Character' rule only because of the
4472 // complexity of initialisation for the 'Good Suffix' rule.
4473 //
4474 // This is also known as the Boyer-Moore-Horspool algorithm:
4475 //
4476 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4477 //
4478 // This particular implementation has a few Java-specific optimizations.
4479 //
4480 // #define ASIZE 256
4481 //
4482 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4483 //       int i, j;
4484 //       unsigned c;
4485 //       unsigned char bc[ASIZE];
4486 //
4487 //       /* Preprocessing */
4488 //       for (i = 0; i < ASIZE; ++i)
4489 //          bc[i] = m;
4490 //       for (i = 0; i < m - 1; ) {
4491 //          c = x[i];
4492 //          ++i;
4493 //          // c < 256 for Latin1 string, so, no need for branch
4494 //          #ifdef PATTERN_STRING_IS_LATIN1
4495 //          bc[c] = m - i;
4496 //          #else
4497 //          if (c < ASIZE) bc[c] = m - i;
4498 //          #endif
4499 //       }
4500 //
4501 //       /* Searching */
4502 //       j = 0;
4503 //       while (j <= n - m) {
4504 //          c = y[i+j];
4505 //          if (x[m-1] == c)
4506 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
4507 //          if (i < 0) return j;
4508 //          // c < 256 for Latin1 string, so, no need for branch
4509 //          #ifdef SOURCE_STRING_IS_LATIN1
4510 //          // LL case: (c< 256) always true. Remove branch
4511 //          j += bc[y[j+m-1]];
4512 //          #endif
4513 //          #ifndef PATTERN_STRING_IS_UTF
4514 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
4515 //          if (c < ASIZE)
4516 //            j += bc[y[j+m-1]];
4517 //          else
4518 //            j += 1
4519 //          #endif
4520 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
4521 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
4522 //          if (c < ASIZE)
4523 //            j += bc[y[j+m-1]];
4524 //          else
4525 //            j += m
4526 //          #endif
4527 //       }
4528 //    }
4529 
4530   if (icnt1 == -1) {
4531     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4532         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4533     Register cnt1end = tmp2;
4534     Register str2end = cnt2;
4535     Register skipch = tmp2;
4536 
4537     // str1 length is >= 8, so we can read at least 1 register for cases when
4538     // UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a
4539     // register for the UL case. We re-read the last character in the inner
4540     // pre-loop code so a single outer pre-loop load suffices.
4541     const int firstStep = isL ? 7 : 3;
4542 
4543     const int ASIZE = 256;
4544     const int STORED_BYTES = 32; // number of bytes stored per instruction
4545     sub(sp, sp, ASIZE);
4546     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4547     mov(ch1, sp);
4548     BIND(BM_INIT_LOOP);
4549       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4550       subs(tmp5, tmp5, 1);
4551       br(GT, BM_INIT_LOOP);
4552 
4553       sub(cnt1tmp, cnt1, 1);
4554       mov(tmp5, str2);
4555       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4556       sub(ch2, cnt1, 1);
4557       mov(tmp3, str1);
4558     BIND(BCLOOP);
4559       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4560       if (!str1_isL) {
4561         subs(zr, ch1, ASIZE);
4562         br(HS, BCSKIP);
4563       }
4564       strb(ch2, Address(sp, ch1));
4565     BIND(BCSKIP);
4566       subs(ch2, ch2, 1);
4567       br(GT, BCLOOP);
4568 
4569       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4570       if (str1_isL == str2_isL) {
4571         // load last 8 bytes (8LL/4UU symbols)
4572         ldr(tmp6, Address(tmp6, -wordSize));
4573       } else {
4574         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
4575         // convert Latin1 to UTF. We'll have to wait until the load completes,
4576         // but it's still faster than per-character loads + checks
4577         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4578         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4579         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4580         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4581         orr(ch2, ch1, ch2, LSL, 16);
4582         orr(tmp6, tmp6, tmp3, LSL, 48);
4583         orr(tmp6, tmp6, ch2, LSL, 16);
4584       }
4585     BIND(BMLOOPSTR2);
4586       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4587       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4588       if (str1_isL == str2_isL) {
4589         // Re-init tmp3. It's free because it executes in parallel with the
4590         // load above. The alternative is to initialize it before the loop, but
4591         // that hurts performance on in-order systems with 2 or more ld/st pipelines
4592         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4593       }
4594       if (!isL) { // UU/UL case
4595         lsl(ch2, cnt1tmp, 1); // offset in bytes
4596       }
4597       cmp(tmp3, skipch);
4598       br(NE, BMSKIP);
4599       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4600       mov(ch1, tmp6);
4601       if (isL) {
4602         b(BMLOOPSTR1_AFTER_LOAD);
4603       } else {
4604         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4605         b(BMLOOPSTR1_CMP);
4606       }
4607     BIND(BMLOOPSTR1);
4608       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4609       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4610     BIND(BMLOOPSTR1_AFTER_LOAD);
4611       subs(cnt1tmp, cnt1tmp, 1);
4612       br(LT, BMLOOPSTR1_LASTCMP);
4613     BIND(BMLOOPSTR1_CMP);
4614       cmp(ch1, ch2);
4615       br(EQ, BMLOOPSTR1);
4616     BIND(BMSKIP);
4617       if (!isL) {
4618         // if we've hit a UTF symbol while searching for a Latin1 pattern, we
4619         // can skip cnt1 symbols
4620         if (str1_isL != str2_isL) {
4621           mov(result_tmp, cnt1);
4622         } else {
4623           mov(result_tmp, 1);
4624         }
4625         subs(zr, skipch, ASIZE);
4626         br(HS, BMADV);
4627       }
4628       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4629     BIND(BMADV);
4630       sub(cnt1tmp, cnt1, 1);
4631       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4632       cmp(str2, str2end);
4633       br(LE, BMLOOPSTR2);
4634       add(sp, sp, ASIZE);
4635       b(NOMATCH);
4636     BIND(BMLOOPSTR1_LASTCMP);
4637       cmp(ch1, ch2);
4638       br(NE, BMSKIP);
4639     BIND(BMMATCH);
4640       sub(result, str2, tmp5);
4641       if (!str2_isL) lsr(result, result, 1);
4642       add(sp, sp, ASIZE);
4643       b(DONE);
4644 
4645     BIND(LINEARSTUB);
4646     cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
4647     br(LT, LINEAR_MEDIUM);
4648     mov(result, zr);
4649     RuntimeAddress stub = NULL;
4650     if (isL) {
4651       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4652       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4653     } else if (str1_isL) {
4654       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
4655       assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4656     } else {
4657       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4658       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4659     }
4660     trampoline_call(stub);
4661     b(DONE);
4662   }
4663 
4664   BIND(LINEARSEARCH);
4665   {
4666     Label DO1, DO2, DO3;
4667 
4668     Register str2tmp = tmp2;
4669     Register first = tmp3;
4670 
4671     if (icnt1 == -1)
4672     {
4673         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4674 
4675         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
4676         br(LT, DOSHORT);
4677       BIND(LINEAR_MEDIUM);
4678         (this->*str1_load_1chr)(first, Address(str1));
4679         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4680         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4681         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4682         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4683 
4684       BIND(FIRST_LOOP);
4685         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4686         cmp(first, ch2);
4687         br(EQ, STR1_LOOP);
4688       BIND(STR2_NEXT);
4689         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4690         br(LE, FIRST_LOOP);
4691         b(NOMATCH);
4692 
4693       BIND(STR1_LOOP);
4694         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4695         add(cnt2tmp, cnt2_neg, str2_chr_size);
4696         br(GE, MATCH);
4697 
4698       BIND(STR1_NEXT);
4699         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4700         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4701         cmp(ch1, ch2);
4702         br(NE, STR2_NEXT);
4703         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4704         add(cnt2tmp, cnt2tmp, str2_chr_size);
4705         br(LT, STR1_NEXT);
4706         b(MATCH);
4707 
4708       BIND(DOSHORT);
4709       if (str1_isL == str2_isL) {
4710         cmp(cnt1, (u1)2);
4711         br(LT, DO1);
4712         br(GT, DO3);
4713       }
4714     }
4715 
4716     if (icnt1 == 4) {
4717       Label CH1_LOOP;
4718 
4719         (this->*load_4chr)(ch1, str1);
4720         sub(result_tmp, cnt2, 4);
4721         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4722         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4723 
4724       BIND(CH1_LOOP);
4725         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4726         cmp(ch1, ch2);
4727         br(EQ, MATCH);
4728         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4729         br(LE, CH1_LOOP);
4730         b(NOMATCH);
4731       }
4732 
4733     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4734       Label CH1_LOOP;
4735 
4736       BIND(DO2);
4737         (this->*load_2chr)(ch1, str1);
4738         if (icnt1 == 2) {
4739           sub(result_tmp, cnt2, 2);
4740         }
4741         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4742         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4743       BIND(CH1_LOOP);
4744         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4745         cmp(ch1, ch2);
4746         br(EQ, MATCH);
4747         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4748         br(LE, CH1_LOOP);
4749         b(NOMATCH);
4750     }
4751 
4752     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4753       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4754 
4755       BIND(DO3);
4756         (this->*load_2chr)(first, str1);
4757         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4758         if (icnt1 == 3) {
4759           sub(result_tmp, cnt2, 3);
4760         }
4761         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4762         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4763       BIND(FIRST_LOOP);
4764         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4765         cmpw(first, ch2);
4766         br(EQ, STR1_LOOP);
4767       BIND(STR2_NEXT);
4768         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4769         br(LE, FIRST_LOOP);
4770         b(NOMATCH);
4771 
4772       BIND(STR1_LOOP);
4773         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4774         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4775         cmp(ch1, ch2);
4776         br(NE, STR2_NEXT);
4777         b(MATCH);
4778     }
4779 
4780     if (icnt1 == -1 || icnt1 == 1) {
4781       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4782 
4783       BIND(DO1);
4784         (this->*str1_load_1chr)(ch1, str1);
4785         cmp(cnt2, (u1)8);
4786         br(LT, DO1_SHORT);
4787 
4788         sub(result_tmp, cnt2, 8/str2_chr_size);
4789         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4790         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4791         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4792 
4793         if (str2_isL) {
4794           orr(ch1, ch1, ch1, LSL, 8);
4795         }
4796         orr(ch1, ch1, ch1, LSL, 16);
4797         orr(ch1, ch1, ch1, LSL, 32);
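             // ch1 now holds the search char replicated across every byte/halfword
             // lane. The loop below uses the classic SWAR zero-lane test; roughly,
             // in C:
             //   x = chunk ^ pattern;                    // matching lane becomes 0
             //   t = (x - 0x0101..) & ~(x | 0x7f7f..);   // top bit set in each zero lane
             // so t != 0 iff some lane matched the search char.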
4798       BIND(CH1_LOOP);
4799         ldr(ch2, Address(str2, cnt2_neg));
4800         eor(ch2, ch1, ch2);
4801         sub(tmp1, ch2, tmp3);
4802         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4803         bics(tmp1, tmp1, tmp2);
4804         br(NE, HAS_ZERO);
4805         adds(cnt2_neg, cnt2_neg, 8);
4806         br(LT, CH1_LOOP);
4807 
4808         cmp(cnt2_neg, (u1)8);
4809         mov(cnt2_neg, 0);
4810         br(LT, CH1_LOOP);
4811         b(NOMATCH);
4812 
4813       BIND(HAS_ZERO);
4814         rev(tmp1, tmp1);
4815         clz(tmp1, tmp1);
4816         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4817         b(MATCH);
4818 
4819       BIND(DO1_SHORT);
4820         mov(result_tmp, cnt2);
4821         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4822         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4823       BIND(DO1_LOOP);
4824         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4825         cmpw(ch1, ch2);
4826         br(EQ, MATCH);
4827         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4828         br(LT, DO1_LOOP);
4829     }
4830   }
4831   BIND(NOMATCH);
4832     mov(result, -1);
4833     b(DONE);
4834   BIND(MATCH);
4835     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4836   BIND(DONE);
4837 }
4838 
4839 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4840 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4841 
4842 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4843                                          Register ch, Register result,
4844                                          Register tmp1, Register tmp2, Register tmp3)
4845 {
4846   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4847   Register cnt1_neg = cnt1;
4848   Register ch1 = rscratch1;
4849   Register result_tmp = rscratch2;
4850 
4851   cmp(cnt1, (u1)4);
4852   br(LT, DO1_SHORT);
4853 
4854   orr(ch, ch, ch, LSL, 16);
4855   orr(ch, ch, ch, LSL, 32);
4856 
4857   sub(cnt1, cnt1, 4);
4858   mov(result_tmp, cnt1);
4859   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4860   sub(cnt1_neg, zr, cnt1, LSL, 1);
4861 
4862   mov(tmp3, 0x0001000100010001);
4863 
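       // Same SWAR zero-halfword test as in string_indexof's single-char case:
       // after the eor, a matching char becomes a zero halfword, and
       // (x - 0x0001..) & ~(x | 0x7fff..) sets the top bit of that halfword.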
4864   BIND(CH1_LOOP);
4865     ldr(ch1, Address(str1, cnt1_neg));
4866     eor(ch1, ch, ch1);
4867     sub(tmp1, ch1, tmp3);
4868     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4869     bics(tmp1, tmp1, tmp2);
4870     br(NE, HAS_ZERO);
4871     adds(cnt1_neg, cnt1_neg, 8);
4872     br(LT, CH1_LOOP);
4873 
4874     cmp(cnt1_neg, (u1)8);
4875     mov(cnt1_neg, 0);
4876     br(LT, CH1_LOOP);
4877     b(NOMATCH);
4878 
4879   BIND(HAS_ZERO);
4880     rev(tmp1, tmp1);
4881     clz(tmp1, tmp1);
4882     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4883     b(MATCH);
4884 
4885   BIND(DO1_SHORT);
4886     mov(result_tmp, cnt1);
4887     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4888     sub(cnt1_neg, zr, cnt1, LSL, 1);
4889   BIND(DO1_LOOP);
4890     ldrh(ch1, Address(str1, cnt1_neg));
4891     cmpw(ch, ch1);
4892     br(EQ, MATCH);
4893     adds(cnt1_neg, cnt1_neg, 2);
4894     br(LT, DO1_LOOP);
4895   BIND(NOMATCH);
4896     mov(result, -1);
4897     b(DONE);
4898   BIND(MATCH);
4899     add(result, result_tmp, cnt1_neg, ASR, 1);
4900   BIND(DONE);
4901 }
4902 
4903 // Compare strings.
4904 void MacroAssembler::string_compare(Register str1, Register str2,
4905     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4906     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4907   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4908       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4909       SHORT_LOOP_START, TAIL_CHECK;
4910 
4911   const u1 STUB_THRESHOLD = 64 + 8;
4912   bool isLL = ae == StrIntrinsicNode::LL;
4913   bool isLU = ae == StrIntrinsicNode::LU;
4914   bool isUL = ae == StrIntrinsicNode::UL;
4915 
4916   bool str1_isL = isLL || isLU;
4917   bool str2_isL = isLL || isUL;
4918 
4919   int str1_chr_shift = str1_isL ? 0 : 1;
4920   int str2_chr_shift = str2_isL ? 0 : 1;
4921   int str1_chr_size = str1_isL ? 1 : 2;
4922   int str2_chr_size = str2_isL ? 1 : 2;
4923   int minCharsInWord = isLL ? wordSize : wordSize/2;
4924 
4925   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4926   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4927                                       (chr_insn)&MacroAssembler::ldrh;
4928   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4929                                       (chr_insn)&MacroAssembler::ldrh;
4930   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4931                             (uxt_insn)&MacroAssembler::uxthw;
4932 
4933   BLOCK_COMMENT("string_compare {");
4934 
4935   // Bizarrely, the counts are passed in bytes, regardless of whether they
4936   // are L or U strings; however, the result is always in characters.
4937   if (!str1_isL) asrw(cnt1, cnt1, 1);
4938   if (!str2_isL) asrw(cnt2, cnt2, 1);
4939 
4940   // Compute the minimum of the string lengths and save the difference.
4941   subsw(result, cnt1, cnt2);
4942   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4943 
4944   // A very short string
4945   cmpw(cnt2, minCharsInWord);
4946   br(Assembler::LE, SHORT_STRING);
4947 
4948   // Compare longwords
4949   // load first parts of strings and finish initialization while loading
4950   {
4951     if (str1_isL == str2_isL) { // LL or UU
4952       ldr(tmp1, Address(str1));
4953       cmp(str1, str2);
4954       br(Assembler::EQ, DONE);
4955       ldr(tmp2, Address(str2));
4956       cmp(cnt2, STUB_THRESHOLD);
4957       br(GE, STUB);
4958       subsw(cnt2, cnt2, minCharsInWord);
4959       br(EQ, TAIL_CHECK);
4960       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4961       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4962       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4963     } else if (isLU) {
4964       ldrs(vtmp, Address(str1));
4965       cmp(str1, str2);
4966       br(Assembler::EQ, DONE);
4967       ldr(tmp2, Address(str2));
4968       cmp(cnt2, STUB_THRESHOLD);
4969       br(GE, STUB);
4970       subw(cnt2, cnt2, 4);
4971       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4972       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4973       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4974       zip1(vtmp, T8B, vtmp, vtmpZ);
4975       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4976       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4977       add(cnt1, cnt1, 4);
4978       fmovd(tmp1, vtmp);
4979     } else { // UL case
4980       ldr(tmp1, Address(str1));
4981       cmp(str1, str2);
4982       br(Assembler::EQ, DONE);
4983       ldrs(vtmp, Address(str2));
4984       cmp(cnt2, STUB_THRESHOLD);
4985       br(GE, STUB);
4986       subw(cnt2, cnt2, 4);
4987       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4988       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4989       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4990       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4991       zip1(vtmp, T8B, vtmp, vtmpZ);
4992       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4993       add(cnt1, cnt1, 8);
4994       fmovd(tmp2, vtmp);
4995     }
4996     adds(cnt2, cnt2, isUL ? 4 : 8);
4997     br(GE, TAIL);
4998     eor(rscratch2, tmp1, tmp2);
4999     cbnz(rscratch2, DIFFERENCE);
5000     // main loop
5001     bind(NEXT_WORD);
5002     if (str1_isL == str2_isL) {
5003       ldr(tmp1, Address(str1, cnt2));
5004       ldr(tmp2, Address(str2, cnt2));
5005       adds(cnt2, cnt2, 8);
5006     } else if (isLU) {
5007       ldrs(vtmp, Address(str1, cnt1));
5008       ldr(tmp2, Address(str2, cnt2));
5009       add(cnt1, cnt1, 4);
5010       zip1(vtmp, T8B, vtmp, vtmpZ);
5011       fmovd(tmp1, vtmp);
5012       adds(cnt2, cnt2, 8);
5013     } else { // UL
5014       ldrs(vtmp, Address(str2, cnt2));
5015       ldr(tmp1, Address(str1, cnt1));
5016       zip1(vtmp, T8B, vtmp, vtmpZ);
5017       add(cnt1, cnt1, 8);
5018       fmovd(tmp2, vtmp);
5019       adds(cnt2, cnt2, 4);
5020     }
5021     br(GE, TAIL);
5022 
5023     eor(rscratch2, tmp1, tmp2);
5024     cbz(rscratch2, NEXT_WORD);
5025     b(DIFFERENCE);
5026     bind(TAIL);
5027     eor(rscratch2, tmp1, tmp2);
5028     cbnz(rscratch2, DIFFERENCE);
5029     // Last longword.  In the case where length == 4 we compare the
5030     // same longword twice, but that's still faster than another
5031     // conditional branch.
5032     if (str1_isL == str2_isL) {
5033       ldr(tmp1, Address(str1));
5034       ldr(tmp2, Address(str2));
5035     } else if (isLU) {
5036       ldrs(vtmp, Address(str1));
5037       ldr(tmp2, Address(str2));
5038       zip1(vtmp, T8B, vtmp, vtmpZ);
5039       fmovd(tmp1, vtmp);
5040     } else { // UL
5041       ldrs(vtmp, Address(str2));
5042       ldr(tmp1, Address(str1));
5043       zip1(vtmp, T8B, vtmp, vtmpZ);
5044       fmovd(tmp2, vtmp);
5045     }
5046     bind(TAIL_CHECK);
5047     eor(rscratch2, tmp1, tmp2);
5048     cbz(rscratch2, DONE);
5049 
5050     // Find the first different characters in the longwords and
5051     // compute their difference.
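         // rev + clz yield the bit index of the first (lowest-addressed) differing
         // byte; rounding it down to a char boundary and shifting both words right
         // by that amount brings the differing chars to bit 0, where they can be
         // zero-extended and subtracted.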
5052     bind(DIFFERENCE);
5053     rev(rscratch2, rscratch2);
5054     clz(rscratch2, rscratch2);
5055     andr(rscratch2, rscratch2, isLL ? -8 : -16);
5056     lsrv(tmp1, tmp1, rscratch2);
5057     (this->*ext_chr)(tmp1, tmp1);
5058     lsrv(tmp2, tmp2, rscratch2);
5059     (this->*ext_chr)(tmp2, tmp2);
5060     subw(result, tmp1, tmp2);
5061     b(DONE);
5062   }
5063 
5064   bind(STUB);
5065     RuntimeAddress stub = NULL;
5066     switch(ae) {
5067       case StrIntrinsicNode::LL:
5068         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
5069         break;
5070       case StrIntrinsicNode::UU:
5071         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
5072         break;
5073       case StrIntrinsicNode::LU:
5074         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
5075         break;
5076       case StrIntrinsicNode::UL:
5077         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
5078         break;
5079       default:
5080         ShouldNotReachHere();
5081      }
5082     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
5083     trampoline_call(stub);
5084     b(DONE);
5085 
5086   bind(SHORT_STRING);
5087   // Is the minimum length zero?
5088   cbz(cnt2, DONE);
5089   // The code is arranged to do most branches while loading, and to load the
5090   // next characters while comparing the previous ones
5091   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5092   subs(cnt2, cnt2, 1);
5093   br(EQ, SHORT_LAST_INIT);
5094   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5095   b(SHORT_LOOP_START);
5096   bind(SHORT_LOOP);
5097   subs(cnt2, cnt2, 1);
5098   br(EQ, SHORT_LAST);
5099   bind(SHORT_LOOP_START);
5100   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
5101   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
5102   cmp(tmp1, cnt1);
5103   br(NE, SHORT_LOOP_TAIL);
5104   subs(cnt2, cnt2, 1);
5105   br(EQ, SHORT_LAST2);
5106   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5107   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5108   cmp(tmp2, rscratch1);
5109   br(EQ, SHORT_LOOP);
5110   sub(result, tmp2, rscratch1);
5111   b(DONE);
5112   bind(SHORT_LOOP_TAIL);
5113   sub(result, tmp1, cnt1);
5114   b(DONE);
5115   bind(SHORT_LAST2);
5116   cmp(tmp2, rscratch1);
5117   br(EQ, DONE);
5118   sub(result, tmp2, rscratch1);
5119 
5120   b(DONE);
5121   bind(SHORT_LAST_INIT);
5122   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5123   bind(SHORT_LAST);
5124   cmp(tmp1, cnt1);
5125   br(EQ, DONE);
5126   sub(result, tmp1, cnt1);
5127 
5128   bind(DONE);
5129 
5130   BLOCK_COMMENT("} string_compare");
5131 }
5132 #endif // COMPILER2
5133 
5134 // This method checks whether the provided byte array contains a byte with the highest bit set.
5135 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
5136     // The simple and most common case, a small aligned array that does not reach
5137     // the end of a memory page, is handled here. All other cases go to the stub.
5138     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
5139     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5140     assert_different_registers(ary1, len, result);
5141 
5142     cmpw(len, 0);
5143     br(LE, SET_RESULT);
5144     cmpw(len, 4 * wordSize);
5145     br(GE, STUB_LONG); // len >= 32 bytes: go to the long stub
5146 
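         // Page-boundary check: shifting the address left by (64 - log2(page_size))
         // leaves only the in-page offset in the top bits, so the adds below sets
         // the carry flag exactly when offset + 32 would reach the end of the page,
         // i.e. when the 32-byte scan might touch the next page.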
5147     int shift = 64 - exact_log2(os::vm_page_size());
5148     lsl(rscratch1, ary1, shift);
5149     mov(rscratch2, (size_t)(4 * wordSize) << shift);
5150     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
5151     br(CS, STUB); // would reach the end of the page: go to the stub
5152     subs(len, len, wordSize);
5153     br(LT, END);
5154 
5155   BIND(LOOP);
5156     ldr(rscratch1, Address(post(ary1, wordSize)));
5157     tst(rscratch1, UPPER_BIT_MASK);
5158     br(NE, SET_RESULT);
5159     subs(len, len, wordSize);
5160     br(GE, LOOP);
5161     cmpw(len, -wordSize);
5162     br(EQ, SET_RESULT);
5163 
5164   BIND(END);
5165     ldr(result, Address(ary1));
5166     sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5167     lslv(result, result, len);
5168     tst(result, UPPER_BIT_MASK);
5169     b(SET_RESULT);
5170 
5171   BIND(STUB);
5172     RuntimeAddress has_neg =  RuntimeAddress(StubRoutines::aarch64::has_negatives());
5173     assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5174     trampoline_call(has_neg);
5175     b(DONE);
5176 
5177   BIND(STUB_LONG);
5178     RuntimeAddress has_neg_long =  RuntimeAddress(
5179             StubRoutines::aarch64::has_negatives_long());
5180     assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
5181     trampoline_call(has_neg_long);
5182     b(DONE);
5183 
5184   BIND(SET_RESULT);
5185     cset(result, NE); // set true or false
5186 
5187   BIND(DONE);
5188 }
5189 
5190 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5191                                    Register tmp4, Register tmp5, Register result,
5192                                    Register cnt1, int elem_size) {
5193   Label DONE, SAME;
5194   Register tmp1 = rscratch1;
5195   Register tmp2 = rscratch2;
5196   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5197   int elem_per_word = wordSize/elem_size;
5198   int log_elem_size = exact_log2(elem_size);
5199   int length_offset = arrayOopDesc::length_offset_in_bytes();
5200   int base_offset
5201     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5202   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5203 
5204   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5205   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5206 
5207 #ifndef PRODUCT
5208   {
5209     const char kind = (elem_size == 2) ? 'U' : 'L';
5210     char comment[64];
5211     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5212     BLOCK_COMMENT(comment);
5213   }
5214 #endif
5215 
5216   // if (a1 == a2)
5217   //     return true;
5218   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5219   br(EQ, SAME);
5220 
5221   if (UseSimpleArrayEquals) {
5222     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5223     // if (a1 == null || a2 == null)
5224     //     return false;
5225     // (a1 & a2) == 0 means that some pointer is null or (very rarely, if ever)
5226     // that two non-null pointers share no set bits,
5227     // so we can save one branch in most cases
5228     tst(a1, a2);
5229     mov(result, false);
5230     br(EQ, A_MIGHT_BE_NULL);
5231     // if (a1.length != a2.length)
5232     //      return false;
5233     bind(A_IS_NOT_NULL);
5234     ldrw(cnt1, Address(a1, length_offset));
5235     ldrw(cnt2, Address(a2, length_offset));
5236     eorw(tmp5, cnt1, cnt2);
5237     cbnzw(tmp5, DONE);
5238     lea(a1, Address(a1, base_offset));
5239     lea(a2, Address(a2, base_offset));
5240     // Check for short strings, i.e. smaller than wordSize.
5241     subs(cnt1, cnt1, elem_per_word);
5242     br(Assembler::LT, SHORT);
5243     // Main 8 byte comparison loop.
5244     bind(NEXT_WORD); {
5245       ldr(tmp1, Address(post(a1, wordSize)));
5246       ldr(tmp2, Address(post(a2, wordSize)));
5247       subs(cnt1, cnt1, elem_per_word);
5248       eor(tmp5, tmp1, tmp2);
5249       cbnz(tmp5, DONE);
5250     } br(GT, NEXT_WORD);
5251     // Last longword.  In the case where length == 4 we compare the
5252     // same longword twice, but that's still faster than another
5253     // conditional branch.
5254     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5255     // length == 4.
5256     if (log_elem_size > 0)
5257       lsl(cnt1, cnt1, log_elem_size);
5258     ldr(tmp3, Address(a1, cnt1));
5259     ldr(tmp4, Address(a2, cnt1));
5260     eor(tmp5, tmp3, tmp4);
5261     cbnz(tmp5, DONE);
5262     b(SAME);
5263     bind(A_MIGHT_BE_NULL);
5264     // in case both a1 and a2 are not-null, proceed with loads
5265     cbz(a1, DONE);
5266     cbz(a2, DONE);
5267     b(A_IS_NOT_NULL);
5268     bind(SHORT);
5269 
5270     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5271     {
5272       ldrw(tmp1, Address(post(a1, 4)));
5273       ldrw(tmp2, Address(post(a2, 4)));
5274       eorw(tmp5, tmp1, tmp2);
5275       cbnzw(tmp5, DONE);
5276     }
5277     bind(TAIL03);
5278     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5279     {
5280       ldrh(tmp3, Address(post(a1, 2)));
5281       ldrh(tmp4, Address(post(a2, 2)));
5282       eorw(tmp5, tmp3, tmp4);
5283       cbnzw(tmp5, DONE);
5284     }
5285     bind(TAIL01);
5286     if (elem_size == 1) { // Only needed when comparing byte arrays.
5287       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5288       {
5289         ldrb(tmp1, a1);
5290         ldrb(tmp2, a2);
5291         eorw(tmp5, tmp1, tmp2);
5292         cbnzw(tmp5, DONE);
5293       }
5294     }
5295   } else {
5296     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
5297         CSET_EQ, LAST_CHECK;
5298     mov(result, false);
5299     cbz(a1, DONE);
5300     ldrw(cnt1, Address(a1, length_offset));
5301     cbz(a2, DONE);
5302     ldrw(cnt2, Address(a2, length_offset));
5303     // on most CPUs a2 is still "locked" (surprisingly) in ldrw and it's
5304     // faster to perform another branch before comparing a1 and a2
5305     cmp(cnt1, (u1)elem_per_word);
5306     br(LE, SHORT); // short or same
5307     ldr(tmp3, Address(pre(a1, base_offset)));
5308     subs(zr, cnt1, stubBytesThreshold);
5309     br(GE, STUB);
5310     ldr(tmp4, Address(pre(a2, base_offset)));
5311     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5312     cmp(cnt2, cnt1);
5313     br(NE, DONE);
5314 
5315     // Main 16 byte comparison loop with 2 exits
5316     bind(NEXT_DWORD); {
5317       ldr(tmp1, Address(pre(a1, wordSize)));
5318       ldr(tmp2, Address(pre(a2, wordSize)));
5319       subs(cnt1, cnt1, 2 * elem_per_word);
5320       br(LE, TAIL);
5321       eor(tmp4, tmp3, tmp4);
5322       cbnz(tmp4, DONE);
5323       ldr(tmp3, Address(pre(a1, wordSize)));
5324       ldr(tmp4, Address(pre(a2, wordSize)));
5325       cmp(cnt1, (u1)elem_per_word);
5326       br(LE, TAIL2);
5327       cmp(tmp1, tmp2);
5328     } br(EQ, NEXT_DWORD);
5329     b(DONE);
5330 
5331     bind(TAIL);
5332     eor(tmp4, tmp3, tmp4);
5333     eor(tmp2, tmp1, tmp2);
5334     lslv(tmp2, tmp2, tmp5);
5335     orr(tmp5, tmp4, tmp2);
5336     cmp(tmp5, zr);
5337     b(CSET_EQ);
5338 
5339     bind(TAIL2);
5340     eor(tmp2, tmp1, tmp2);
5341     cbnz(tmp2, DONE);
5342     b(LAST_CHECK);
5343 
5344     bind(STUB);
5345     ldr(tmp4, Address(pre(a2, base_offset)));
5346     cmp(cnt2, cnt1);
5347     br(NE, DONE);
5348     if (elem_size == 2) { // convert to byte counter
5349       lsl(cnt1, cnt1, 1);
5350     }
5351     eor(tmp5, tmp3, tmp4);
5352     cbnz(tmp5, DONE);
5353     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5354     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5355     trampoline_call(stub);
5356     b(DONE);
5357 
5358     bind(EARLY_OUT);
5359     // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
5360     // so if a2 == null we return false (0); otherwise a2 itself serves as true
5361     mov(result, a2);
5362     b(DONE);
5363     bind(SHORT);
5364     cmp(cnt2, cnt1);
5365     br(NE, DONE);
5366     cbz(cnt1, SAME);
5367     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
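         // tmp5 = -(cnt1 << (3 + log_elem_size)), i.e. minus the number of valid
         // payload bits; lslv at LAST_CHECK uses only the low 6 bits of the shift
         // amount, so it shifts left by (64 - valid bits), discarding the bytes
         // beyond the array length in the final word.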
5368     ldr(tmp3, Address(a1, base_offset));
5369     ldr(tmp4, Address(a2, base_offset));
5370     bind(LAST_CHECK);
5371     eor(tmp4, tmp3, tmp4);
5372     lslv(tmp5, tmp4, tmp5);
5373     cmp(tmp5, zr);
5374     bind(CSET_EQ);
5375     cset(result, EQ);
5376     b(DONE);
5377   }
5378 
5379   bind(SAME);
5380   mov(result, true);
5381   // That's it.
5382   bind(DONE);
5383 
5384   BLOCK_COMMENT("} array_equals");
5385 }
5386 
5387 // Compare Strings
5388 
5389 // For Strings we're passed the address of the first characters in a1
5390 // and a2 and the length in cnt1.
5391 // elem_size is the element size in bytes: either 1 or 2.
5392 // There are two implementations.  For strings >= 8 bytes, all
5393 // comparisons (including the final one, which may overlap) are
5394 // performed 8 bytes at a time.  For strings < 8 bytes, we compare a
5395 // word, then a halfword, and then a byte.
5396 
5397 void MacroAssembler::string_equals(Register a1, Register a2,
5398                                    Register result, Register cnt1, int elem_size)
5399 {
5400   Label SAME, DONE, SHORT, NEXT_WORD;
5401   Register tmp1 = rscratch1;
5402   Register tmp2 = rscratch2;
5403   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5404 
5405   assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
5406   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5407 
5408 #ifndef PRODUCT
5409   {
5410     const char kind = (elem_size == 2) ? 'U' : 'L';
5411     char comment[64];
5412     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5413     BLOCK_COMMENT(comment);
5414   }
5415 #endif
5416 
5417   mov(result, false);
5418 
5419   // Check for short strings, i.e. smaller than wordSize.
5420   subs(cnt1, cnt1, wordSize);
5421   br(Assembler::LT, SHORT);
5422   // Main 8 byte comparison loop.
5423   bind(NEXT_WORD); {
5424     ldr(tmp1, Address(post(a1, wordSize)));
5425     ldr(tmp2, Address(post(a2, wordSize)));
5426     subs(cnt1, cnt1, wordSize);
5427     eor(tmp1, tmp1, tmp2);
5428     cbnz(tmp1, DONE);
5429   } br(GT, NEXT_WORD);
5430   // Last longword.  In the case where length == 4 we compare the
5431   // same longword twice, but that's still faster than another
5432   // conditional branch.
5433   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5434   // length == 4.
5435   ldr(tmp1, Address(a1, cnt1));
5436   ldr(tmp2, Address(a2, cnt1));
5437   eor(tmp2, tmp1, tmp2);
5438   cbnz(tmp2, DONE);
5439   b(SAME);
5440 
5441   bind(SHORT);
5442   Label TAIL03, TAIL01;
5443 
5444   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5445   {
5446     ldrw(tmp1, Address(post(a1, 4)));
5447     ldrw(tmp2, Address(post(a2, 4)));
5448     eorw(tmp1, tmp1, tmp2);
5449     cbnzw(tmp1, DONE);
5450   }
5451   bind(TAIL03);
5452   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5453   {
5454     ldrh(tmp1, Address(post(a1, 2)));
5455     ldrh(tmp2, Address(post(a2, 2)));
5456     eorw(tmp1, tmp1, tmp2);
5457     cbnzw(tmp1, DONE);
5458   }
5459   bind(TAIL01);
5460   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5461     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5462     {
5463       ldrb(tmp1, a1);
5464       ldrb(tmp2, a2);
5465       eorw(tmp1, tmp1, tmp2);
5466       cbnzw(tmp1, DONE);
5467     }
5468   }
5469   // Arrays are equal.
5470   bind(SAME);
5471   mov(result, true);
5472 
5473   // That's it.
5474   bind(DONE);
5475   BLOCK_COMMENT("} string_equals");
5476 }
5477 
5478 
5479 // The size of the blocks erased by the zero_blocks stub.  We must
5480 // handle anything smaller than this ourselves in zero_words().
5481 const int MacroAssembler::zero_words_block_size = 8;
5482 
5483 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5484 // possible, handling small word counts locally and delegating
5485 // anything larger to the zero_blocks stub.  It is expanded many times
5486 // in compiled code, so it is important to keep it short.
5487 
5488 // ptr:   Address of a buffer to be zeroed.
5489 // cnt:   Count in HeapWords.
5490 //
5491 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5492 void MacroAssembler::zero_words(Register ptr, Register cnt)
5493 {
5494   assert(is_power_of_2(zero_words_block_size), "adjust this");
5495   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5496 
5497   BLOCK_COMMENT("zero_words {");
5498   cmp(cnt, (u1)zero_words_block_size);
5499   Label around;
5500   br(LO, around);
5501   {
5502     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5503     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5504     if (StubRoutines::aarch64::complete()) {
5505       trampoline_call(zero_blocks);
5506     } else {
5507       bl(zero_blocks);
5508     }
5509   }
5510   bind(around);
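       // At most zero_words_block_size - 1 words remain.  Peel them off by
       // testing the bits of cnt: bit 2 -> store 4 words, bit 1 -> 2 words,
       // bit 0 -> the final single word.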
5511   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5512     Label l;
5513     tbz(cnt, exact_log2(i), l);
5514     for (int j = 0; j < i; j += 2) {
5515       stp(zr, zr, post(ptr, 16));
5516     }
5517     bind(l);
5518   }
5519   {
5520     Label l;
5521     tbz(cnt, 0, l);
5522     str(zr, Address(ptr));
5523     bind(l);
5524   }
5525   BLOCK_COMMENT("} zero_words");
5526 }
5527 
5528 // base:         Address of a buffer to be zeroed, 8 bytes aligned.
5529 // cnt:          Immediate count in HeapWords.
5530 #define SmallArraySize (18 * BytesPerLong)
5531 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
5532 {
5533   BLOCK_COMMENT("zero_words {");
5534   int i = cnt & 1;  // store any odd word to start
5535   if (i) str(zr, Address(base));
5536 
5537   if (cnt <= SmallArraySize / BytesPerLong) {
5538     for (; i < (int)cnt; i += 2)
5539       stp(zr, zr, Address(base, i * wordSize));
5540   } else {
5541     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
5542     int remainder = cnt % (2 * unroll);
5543     for (; i < remainder; i += 2)
5544       stp(zr, zr, Address(base, i * wordSize));
5545 
5546     Label loop;
5547     Register cnt_reg = rscratch1;
5548     Register loop_base = rscratch2;
5549     cnt = cnt - remainder;
5550     mov(cnt_reg, cnt);
5551     // adjust base and prebias by -2 * wordSize so we can pre-increment
5552     add(loop_base, base, (remainder - 2) * wordSize);
5553     bind(loop);
5554     sub(cnt_reg, cnt_reg, 2 * unroll);
5555     for (i = 1; i < unroll; i++)
5556       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
5557     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
5558     cbnz(cnt_reg, loop);
5559   }
5560   BLOCK_COMMENT("} zero_words");
5561 }
5562 
5563 // Zero blocks of memory by using DC ZVA.
5564 //
5565 // Aligns the base address first sufficiently for DC ZVA, then uses
5566 // DC ZVA repeatedly for every full block.  cnt is the size to be
5567 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5568 // in cnt.
5569 //
5570 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5571 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
5572 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5573   Register tmp = rscratch1;
5574   Register tmp2 = rscratch2;
5575   int zva_length = VM_Version::zva_length();
5576   Label initial_table_end, loop_zva;
5577   Label fini;
5578 
5579   // Base must be 16-byte aligned. If not, just return and let the caller handle it
5580   tst(base, 0x0f);
5581   br(Assembler::NE, fini);
5582   // Align base with ZVA length.
5583   neg(tmp, base);
5584   andr(tmp, tmp, zva_length - 1);
5585 
5586   // tmp: the number of bytes to be filled to align the base with ZVA length.
5587   add(base, base, tmp);
5588   sub(cnt, cnt, tmp, Assembler::ASR, 3);
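       // Computed branch into the stp table below: tmp bytes are needed to reach
       // ZVA alignment, each stp clears 16 bytes and encodes to 4 bytes, so we
       // enter the table tmp / 4 bytes (tmp / 16 instructions) before
       // initial_table_end.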
5589   adr(tmp2, initial_table_end);
5590   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5591   br(tmp2);
5592 
5593   for (int i = -zva_length + 16; i < 0; i += 16)
5594     stp(zr, zr, Address(base, i));
5595   bind(initial_table_end);
5596 
5597   sub(cnt, cnt, zva_length >> 3);
5598   bind(loop_zva);
5599   dc(Assembler::ZVA, base);
5600   subs(cnt, cnt, zva_length >> 3);
5601   add(base, base, zva_length);
5602   br(Assembler::GE, loop_zva);
5603   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5604   bind(fini);
5605 }
5606 
5607 // base:   Address of a buffer to be filled, 8 bytes aligned.
5608 // cnt:    Count in 8-byte unit.
5609 // value:  Value to be filled with.
5610 // base will point to the end of the buffer after filling.
5611 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5612 {
5613 //  Algorithm:
5614 //
5615 //    scratch1 = cnt & 7;
5616 //    cnt -= scratch1;
5617 //    p += scratch1;
5618 //    switch (scratch1) {
5619 //      do {
5620 //        cnt -= 8;
5621 //          p[-8] = v;
5622 //        case 7:
5623 //          p[-7] = v;
5624 //        case 6:
5625 //          p[-6] = v;
5626 //          // ...
5627 //        case 1:
5628 //          p[-1] = v;
5629 //        case 0:
5630 //          p += 8;
5631 //      } while (cnt);
5632 //    }
5633 
5634   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5635 
5636   Label fini, skip, entry, loop;
5637   const int unroll = 8; // Number of stp instructions we'll unroll
5638 
5639   cbz(cnt, fini);
5640   tbz(base, 3, skip);
5641   str(value, Address(post(base, 8)));
5642   sub(cnt, cnt, 1);
5643   bind(skip);
5644 
5645   andr(rscratch1, cnt, (unroll-1) * 2);
5646   sub(cnt, cnt, rscratch1);
5647   add(base, base, rscratch1, Assembler::LSL, 3);
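       // Computed branch into the unrolled stp sequence below: rscratch1 is the
       // remaining count in words (always even here), each stp writes two words
       // and encodes to 4 bytes, so we start rscratch1 / 2 instructions before
       // entry and fill exactly rscratch1 words on the first pass.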
5648   adr(rscratch2, entry);
5649   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5650   br(rscratch2);
5651 
5652   bind(loop);
5653   add(base, base, unroll * 16);
5654   for (int i = -unroll; i < 0; i++)
5655     stp(value, value, Address(base, i * 16));
5656   bind(entry);
5657   subs(cnt, cnt, unroll * 2);
5658   br(Assembler::GE, loop);
5659 
5660   tbz(cnt, 0, fini);
5661   str(value, Address(post(base, 8)));
5662   bind(fini);
5663 }
5664 
5665 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5666 // java/lang/StringUTF16.compress.
5667 void MacroAssembler::encode_iso_array(Register src, Register dst,
5668                       Register len, Register result,
5669                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5670                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5671 {
5672     Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
5673         NEXT_32_START, NEXT_32_PRFM_START;
5674     Register tmp1 = rscratch1, tmp2 = rscratch2;
5675 
5676       mov(result, len); // Save initial len
5677 
5678 #ifndef BUILTIN_SIM
5679       cmp(len, (u1)8); // handle shortest strings first
5680       br(LT, LOOP_1);
5681       cmp(len, (u1)32);
5682       br(LT, NEXT_8);
5683       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
5684       // to convert chars to bytes
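           // uzp1 gathers the low byte of every char (the Latin-1 value) while
           // uzp2 (on the orr of the halves) gathers the high bytes; a non-zero
           // high byte means some char is above 0xff, so we fall back to smaller
           // chunks to locate the exact index.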
5685       if (SoftwarePrefetchHintDistance >= 0) {
5686         ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5687         subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5688         br(LE, NEXT_32_START);
5689         b(NEXT_32_PRFM_START);
5690         BIND(NEXT_32_PRFM);
5691           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5692         BIND(NEXT_32_PRFM_START);
5693           prfm(Address(src, SoftwarePrefetchHintDistance));
5694           orr(v4, T16B, Vtmp1, Vtmp2);
5695           orr(v5, T16B, Vtmp3, Vtmp4);
5696           uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
5697           uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
5698           uzp2(v5, T16B, v4, v5); // high bytes
5699           umov(tmp2, v5, D, 1);
5700           fmovd(tmp1, v5);
5701           orr(tmp1, tmp1, tmp2);
5702           cbnz(tmp1, LOOP_8);
5703           stpq(Vtmp1, Vtmp3, dst);
5704           sub(len, len, 32);
5705           add(dst, dst, 32);
5706           add(src, src, 64);
5707           subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5708           br(GE, NEXT_32_PRFM);
5709           cmp(len, (u1)32);
5710           br(LT, LOOP_8);
5711         BIND(NEXT_32);
5712           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5713         BIND(NEXT_32_START);
5714       } else {
5715         BIND(NEXT_32);
5716           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5717       }
5718       prfm(Address(src, SoftwarePrefetchHintDistance));
5719       uzp1(v4, T16B, Vtmp1, Vtmp2);
5720       uzp1(v5, T16B, Vtmp3, Vtmp4);
5721       orr(Vtmp1, T16B, Vtmp1, Vtmp2);
5722       orr(Vtmp3, T16B, Vtmp3, Vtmp4);
5723       uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
5724       umov(tmp2, Vtmp1, D, 1);
5725       fmovd(tmp1, Vtmp1);
5726       orr(tmp1, tmp1, tmp2);
5727       cbnz(tmp1, LOOP_8);
5728       stpq(v4, v5, dst);
5729       sub(len, len, 32);
5730       add(dst, dst, 32);
5731       add(src, src, 64);
5732       cmp(len, (u1)32);
5733       br(GE, NEXT_32);
5734       cbz(len, DONE);
5735 
5736     BIND(LOOP_8);
5737       cmp(len, (u1)8);
5738       br(LT, LOOP_1);
5739     BIND(NEXT_8);
5740       ld1(Vtmp1, T8H, src);
5741       uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
5742       uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
5743       fmovd(tmp1, Vtmp3);
5744       cbnz(tmp1, NEXT_1);
5745       strd(Vtmp2, dst);
5746 
5747       sub(len, len, 8);
5748       add(dst, dst, 8);
5749       add(src, src, 16);
5750       cmp(len, (u1)8);
5751       br(GE, NEXT_8);
5752 
5753     BIND(LOOP_1);
5754 #endif
5755     cbz(len, DONE);
5756     BIND(NEXT_1);
5757       ldrh(tmp1, Address(post(src, 2)));
5758       tst(tmp1, 0xff00);
5759       br(NE, SET_RESULT);
5760       strb(tmp1, Address(post(dst, 1)));
5761       subs(len, len, 1);
5762       br(GT, NEXT_1);
5763 
5764     BIND(SET_RESULT);
5765       sub(result, result, len); // Return index where we stopped
5766                                 // Return len == 0 if we processed all
5767                                 // characters
5768     BIND(DONE);
5769 }
5770 
5771 
5772 // Inflate byte[] array to char[].
5773 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5774                                         FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
5775                                         Register tmp4) {
5776   Label big, done, after_init, to_stub;
5777 
5778   assert_different_registers(src, dst, len, tmp4, rscratch1);
5779 
5780   fmovd(vtmp1, zr);
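       // vtmp1 stays zero throughout; zip1 with it interleaves a zero byte after
       // each source byte, which is exactly the Latin-1 -> UTF-16 widening.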
5781   lsrw(tmp4, len, 3);
5782   bind(after_init);
5783   cbnzw(tmp4, big);
5784   // Short string: less than 8 bytes.
5785   {
5786     Label loop, tiny;
5787 
5788     cmpw(len, 4);
5789     br(LT, tiny);
5790     // Use SIMD to do 4 bytes.
5791     ldrs(vtmp2, post(src, 4));
5792     zip1(vtmp3, T8B, vtmp2, vtmp1);
5793     subw(len, len, 4);
5794     strd(vtmp3, post(dst, 8));
5795 
5796     cbzw(len, done);
5797 
5798     // Do the remaining bytes by steam.
5799     bind(loop);
5800     ldrb(tmp4, post(src, 1));
5801     strh(tmp4, post(dst, 2));
5802     subw(len, len, 1);
5803 
5804     bind(tiny);
5805     cbnz(len, loop);
5806 
5807     b(done);
5808   }
5809 
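  // Large arrays: hand the bulk of the work to the out-of-line
  // large_byte_array_inflate stub, then continue at after_init.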
5810   if (SoftwarePrefetchHintDistance >= 0) {
5811     bind(to_stub);
      RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5813       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5814       trampoline_call(stub);
5815       b(after_init);
5816   }
5817 
5818   // Unpack the bytes 8 at a time.
5819   bind(big);
5820   {
5821     Label loop, around, loop_last, loop_start;
5822 
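    // Two variants: when SoftwarePrefetchHintDistance >= 0 the loop is
    // software-pipelined to inflate two 8-byte chunks per iteration, and it
    // branches to the large-array stub up front when at least (64 + 16)/8
    // eight-byte chunks are left; otherwise a simple one-chunk-per-iteration
    // loop is used.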
5823     if (SoftwarePrefetchHintDistance >= 0) {
5824       const int large_loop_threshold = (64 + 16)/8;
5825       ldrd(vtmp2, post(src, 8));
5826       andw(len, len, 7);
5827       cmp(tmp4, (u1)large_loop_threshold);
5828       br(GE, to_stub);
5829       b(loop_start);
5830 
5831       bind(loop);
5832       ldrd(vtmp2, post(src, 8));
5833       bind(loop_start);
5834       subs(tmp4, tmp4, 1);
5835       br(EQ, loop_last);
5836       zip1(vtmp2, T16B, vtmp2, vtmp1);
5837       ldrd(vtmp3, post(src, 8));
5838       st1(vtmp2, T8H, post(dst, 16));
5839       subs(tmp4, tmp4, 1);
5840       zip1(vtmp3, T16B, vtmp3, vtmp1);
5841       st1(vtmp3, T8H, post(dst, 16));
5842       br(NE, loop);
5843       b(around);
5844       bind(loop_last);
5845       zip1(vtmp2, T16B, vtmp2, vtmp1);
5846       st1(vtmp2, T8H, post(dst, 16));
5847       bind(around);
5848       cbz(len, done);
5849     } else {
5850       andw(len, len, 7);
5851       bind(loop);
5852       ldrd(vtmp2, post(src, 8));
5853       sub(tmp4, tmp4, 1);
5854       zip1(vtmp3, T16B, vtmp2, vtmp1);
5855       st1(vtmp3, T8H, post(dst, 16));
5856       cbnz(tmp4, loop);
5857     }
5858   }
5859 
5860   // Do the tail of up to 8 bytes.
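  // This re-reads the last 8 source bytes (which may overlap bytes that were
  // already inflated) and rewrites the last 16 destination bytes, which is
  // cheaper than a scalar cleanup loop.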
5861   add(src, src, len);
5862   ldrd(vtmp3, Address(src, -8));
5863   add(dst, dst, len, ext::uxtw, 1);
5864   zip1(vtmp3, T16B, vtmp3, vtmp1);
5865   strq(vtmp3, Address(dst, -16));
5866 
5867   bind(done);
5868 }
5869 
5870 // Compress char[] array to byte[].
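// encode_iso_array() encodes as many chars as possible and leaves the number
// of remaining (unencoded) chars in len, so the csel below returns the
// encode_iso_array() result only if the whole array was compressed and 0
// otherwise. Conceptually:
//
//   result = (all chars fit in a byte) ? number_of_chars : 0;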
5871 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5872                                          FloatRegister tmp1Reg, FloatRegister tmp2Reg,
5873                                          FloatRegister tmp3Reg, FloatRegister tmp4Reg,
5874                                          Register result) {
5875   encode_iso_array(src, dst, len, result,
5876                    tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
5877   cmp(len, zr);
5878   csel(result, result, zr, EQ);
5879 }
5880 
5881 // get_thread() can be called anywhere inside generated code so we
5882 // need to save whatever non-callee save context might get clobbered
5883 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5884 // the call setup code.
5885 //
5886 // aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5887 //
5888 void MacroAssembler::get_thread(Register dst) {
5889   RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
5890   push(saved_regs, sp);
5891 
5892   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5893   blrt(lr, 1, 0, 1);
5894   if (dst != c_rarg0) {
5895     mov(dst, c_rarg0);
5896   }
5897 
5898   pop(saved_regs, sp);
5899 }
5900 
// C2 compiled method's prolog code
// Moved here from aarch64.ad to support the Valhalla code below.
5903 void MacroAssembler::verified_entry(Compile* C, int sp_inc) {
5904 
  // n.b. frame size includes space for return pc and rfp
5906   const long framesize = C->frame_size_in_bytes();
5907   assert(framesize % (2 * wordSize) == 0, "must preserve 2 * wordSize alignment");
5908 
5909   // insert a nop at the start of the prolog so we can patch in a
5910   // branch if we need to invalidate the method later
5911   nop();
5912 
5913   int bangsize = C->bang_size_in_bytes();
  if (C->need_stack_bang(bangsize) && UseStackBanging) {
    generate_stack_overflow_check(bangsize);
  }
5916 
5917   build_frame(framesize);
5918 
5919   if (NotifySimulator) {
5920     notify(Assembler::method_entry);
5921   }
5922 
5923   if (VerifyStackAtCalls) {
5924     Unimplemented();
5925   }
5926 }
5927 
5928 
// DMS TODO: Need extra eyes to bring the code below into good shape.
//
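// Emits the unpacking code for a C2 method with scalarized value type
// arguments: the caller passes value types as oops (the unscalarized calling
// convention), so every field is loaded from its oop and moved to wherever
// the scalarized calling convention expects it. The stack is extended first
// if the scalarized convention needs more stack slots, and the normal
// verified entry prolog is emitted at the end via verified_entry().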
5931 void MacroAssembler::unpack_value_args(Compile* C, bool receiver_only) {
5932 
5933   assert(C->has_scalarized_args(), "value type argument scalarization is disabled");
5934   Method* method = C->method()->get_Method();
5935   const GrowableArray<SigEntry>* sig_cc = method->adapter()->get_sig_cc();
5936   assert(sig_cc != NULL, "must have scalarized signature");
5937 
5938   // Get unscalarized calling convention
5939   BasicType* sig_bt = NEW_RESOURCE_ARRAY(BasicType, sig_cc->length());
5940   int args_passed = 0;
5941   if (!method->is_static()) {
5942     sig_bt[args_passed++] = T_OBJECT;
5943   }
5944   if (!receiver_only) {
5945     for (SignatureStream ss(method->signature()); !ss.at_return_type(); ss.next()) {
5946       BasicType bt = ss.type();
5947       sig_bt[args_passed++] = bt;
5948       if (type2size[bt] == 2) {
5949         sig_bt[args_passed++] = T_VOID;
5950       }
5951     }
5952   } else {
5953     // Only unpack the receiver, all other arguments are already scalarized
5954     InstanceKlass* holder = method->method_holder();
5955     int rec_len = holder->is_value() ? ValueKlass::cast(holder)->extended_sig()->length() : 1;
5956     // Copy scalarized signature but skip receiver, value type delimiters and reserved entries
5957     for (int i = 0; i < sig_cc->length(); i++) {
5958       if (!SigEntry::is_reserved_entry(sig_cc, i)) {
5959         if (SigEntry::skip_value_delimiters(sig_cc, i) && rec_len <= 0) {
5960           sig_bt[args_passed++] = sig_cc->at(i)._bt;
5961         }
5962         rec_len--;
5963       }
5964     }
5965   }
5966 
5967   VMRegPair* regs = NEW_RESOURCE_ARRAY(VMRegPair, args_passed);
5968   int args_on_stack = SharedRuntime::java_calling_convention(sig_bt, regs, args_passed, false);
5969 
5970   // Get scalarized calling convention
5971   int args_passed_cc = SigEntry::fill_sig_bt(sig_cc, sig_bt);
5972   VMRegPair* regs_cc = NEW_RESOURCE_ARRAY(VMRegPair, sig_cc->length());
5973   int args_on_stack_cc = SharedRuntime::java_calling_convention(sig_bt, regs_cc, args_passed_cc, false);
5974 
5975   // Check if we need to extend the stack for unpacking
5976   int sp_inc = (args_on_stack_cc - args_on_stack) * VMRegImpl::stack_slot_size;
5977   if (sp_inc > 0) {
    // Adjust the stack, keeping it 16-byte aligned. On AArch64 the return
    // address is in lr, so (unlike on x86) it does not have to be moved to
    // the new top of the stack; the commented-out pop/push below appear to be
    // a leftover from the x86 version.
    // pop(r13);
    sp_inc = align_up(sp_inc, StackAlignmentInBytes);
    // DMS CHECK: subptr(rsp, sp_inc);
    sub(sp, sp, sp_inc);
    // push(r13);
5985   } else {
5986     // The scalarized calling convention needs less stack space than the unscalarized one.
5987     // No need to extend the stack, the caller will take care of these adjustments.
5988     sp_inc = 0;
5989   }
5990 
5991   // Initialize register/stack slot states (make all writable)
5992   int max_stack = MAX2(args_on_stack + sp_inc/VMRegImpl::stack_slot_size, args_on_stack_cc);
5993   int max_reg = VMRegImpl::stack2reg(max_stack)->value();
5994   RegState* reg_state = NEW_RESOURCE_ARRAY(RegState, max_reg);
5995   for (int i = 0; i < max_reg; ++i) {
5996     reg_state[i] = reg_writable;
5997   }
5998   // Set all source registers/stack slots to readonly to prevent accidental overwriting
5999   for (int i = 0; i < args_passed; ++i) {
6000     VMReg reg = regs[i].first();
6001     if (!reg->is_valid()) continue;
6002     if (reg->is_stack()) {
6003       // Update source stack location by adding stack increment
6004       reg = VMRegImpl::stack2reg(reg->reg2stack() + sp_inc/VMRegImpl::stack_slot_size);
6005       regs[i] = reg;
6006     }
6007     assert(reg->value() >= 0 && reg->value() < max_reg, "reg value out of bounds");
6008     reg_state[reg->value()] = reg_readonly;
6009   }
6010 
6012   // Emit code for unpacking value type arguments
6013   // We try multiple times and eventually start spilling to resolve (circular) dependencies
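  // Example of a circular dependency: if one field has to move from r3 to r4
  // while another has to move from r4 to r3, neither move can be emitted
  // first without clobbering a still-needed source; spilling one source to
  // r14 breaks such cycles.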
6014   bool done = false;
6015   for (int i = 0; i < 2 * args_passed_cc && !done; ++i) {
6016     done = true;
6017     bool spill = (i > args_passed_cc); // Start spilling?
6018     // Iterate over all arguments (in reverse)
6019     for (int from_index = args_passed - 1, to_index = args_passed_cc - 1, sig_index = sig_cc->length() - 1; sig_index >= 0; sig_index--) {
6020       if (SigEntry::is_reserved_entry(sig_cc, sig_index)) {
6021         to_index--; // Skip reserved entry
6022       } else {
6023         assert(from_index >= 0, "index out of bounds");
6024         VMReg reg = regs[from_index].first();
6025         if (spill && reg->is_valid() && reg_state[reg->value()] == reg_readonly) {
6026           // Spill argument to be able to write the source and resolve circular dependencies
6027           VMReg spill_reg = r14->as_VMReg();
6028           bool res = move_helper(reg, spill_reg, T_DOUBLE, reg_state, sp_inc);
6029           assert(res, "Spilling should not fail");
6030           // Set spill_reg as new source and update state
6031           reg = spill_reg;
6032           regs[from_index].set1(reg);
6033           reg_state[reg->value()] = reg_readonly;
6034           spill = false; // Do not spill again in this round
6035         }
6036         BasicType bt = sig_cc->at(sig_index)._bt;
6037         if (SigEntry::skip_value_delimiters(sig_cc, sig_index)) {
6038           assert(to_index >= 0, "index out of bounds");
6039           done &= move_helper(reg, regs_cc[to_index].first(), bt, reg_state, sp_inc);
6040           to_index--;
6041         } else if (!receiver_only || (from_index == 0 && bt == T_VOID)) {
6042           done &= unpack_value_helper(sig_cc, sig_index, reg, regs_cc, to_index, reg_state, sp_inc);
6043         } else {
6044           continue;
6045         }
6046         from_index--;
6047       }
6048     }
6049   }
6050   guarantee(done, "Could not resolve circular dependency when unpacking value type arguments");
6051 
6052   // Emit code for verified entry and save increment for stack repair on return
6053   verified_entry(C, sp_inc);
6054 }
6055 
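// Move a single argument from 'from' to 'to', obeying the reg_state
// book-keeping: returns false (move postponed) if the destination still holds
// a source value that has not been read yet; otherwise emits the appropriate
// reg<->reg, reg<->stack or stack<->stack move (the latter via rscratch1),
// marks the source writable and the destination written. ret_off is only used
// to assert that the return address slot is never overwritten.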
6056 bool MacroAssembler::move_helper(VMReg from, VMReg to, BasicType bt, RegState reg_state[], int ret_off) {
  if (reg_state[to->value()] == reg_written) {
6058     return true; // Already written
6059   }
6060   if (from != to && bt != T_VOID) {
6061     if (reg_state[to->value()] == reg_readonly) {
6062       return false; // Not yet writable
6063     }
6064     if (from->is_reg()) {
6065       if (to->is_reg()) {
        mov(to->as_Register(), from->as_Register());
6067       } else {
6068         int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + wordSize;
6069         assert(st_off != ret_off, "overwriting return address at %d", st_off);
6070         Address to_addr = Address(sp, st_off);
6071         str(from->as_Register(), to_addr);
6072       }
6073     } else {
6074       Address from_addr = Address(sp, from->reg2stack() * VMRegImpl::stack_slot_size + wordSize);
6075       if (to->is_reg()) {
6076         ldr(to->as_Register(), from_addr);
6077       } else {
6078         int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + wordSize;
6079         assert(st_off != ret_off, "overwriting return address at %d", st_off);
6080         ldr(rscratch1, from_addr);
6081         str(rscratch1, Address(sp, st_off));
6082       }
6083     }
6084   }
6085   // Update register states
6086   reg_state[from->value()] = reg_writable;
6087   reg_state[to->value()] = reg_written;
6088   return true;
6089 }
6090 
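// Unpack the fields of a single value type argument. On entry sig->at(sig_index)
// is the trailing T_VOID delimiter of the value type in the scalarized
// signature; the loop walks the signature backwards (vt tracks the value type
// nesting depth), loads each field from the value object referenced by 'from'
// (loading the oop into r10 first if 'from' is a stack slot) and stores it
// into its scalarized location in regs_to. Targets that are still marked
// read-only are skipped and reported through the return value so the caller
// can retry in a later pass.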
6091 bool MacroAssembler::unpack_value_helper(const GrowableArray<SigEntry>* sig, int& sig_index, VMReg from, VMRegPair* regs_to, int& to_index, RegState reg_state[], int ret_off) {
6092   Register fromReg = from->is_reg() ? from->as_Register() : noreg;
6093   assert(sig->at(sig_index)._bt == T_VOID, "should be at end delimiter");
6094 
6095   int vt = 1;
6096   bool done = true;
6097   bool mark_done = true;
6098   do {
6099     sig_index--;
6100     BasicType bt = sig->at(sig_index)._bt;
    if (bt == T_VALUETYPE) {
      vt--;       // Reached the start marker of a (nested) value type
    } else if (bt == T_VOID && sig->at(sig_index-1)._bt != T_LONG && sig->at(sig_index-1)._bt != T_DOUBLE) {
      vt++;       // T_VOID that is not a long/double pad: end delimiter of a nested value type
    } else if (SigEntry::is_reserved_entry(sig, sig_index)) {
      to_index--; // Skip reserved entry
6107     } else {
6108 
6109       assert(to_index >= 0, "invalid to_index");
6110       VMRegPair pair_to = regs_to[to_index--];
6111       VMReg r_1 = pair_to.first();
6112       VMReg r_2 = pair_to.second();
6113 
      if (bt == T_VOID) continue; // Second (pad) slot of a long/double field
6115 
6116       int idx = (int) r_1->value();
6117       if (reg_state[idx] == reg_readonly) {
        if (idx != from->value()) {
          mark_done = false;
        }
        done = false;
        continue;
6123       } else if (reg_state[idx] == reg_written) {
6124         continue;
6125       } else {
6126         assert(reg_state[idx] == reg_writable, "must be writable");
6127         reg_state[idx] = reg_written;
6128       }
6129 
6130       if (fromReg == noreg) {
6131         int st_off = from->reg2stack() * VMRegImpl::stack_slot_size + wordSize;
        ldr(r10, Address(sp, st_off)); // Load the value object oop from its stack slot
6133         fromReg = r10;
6134       }
6135 
6136       int off = sig->at(sig_index)._offset;
6137       assert(off > 0, "offset in object should be positive");
6138 
6139       Address fromAddr = Address(fromReg, off);
6140 
      if (r_1->is_stack()) {
        // Convert stack slot to an SP offset (+ wordSize to account for return address)
        int st_off = r_1->reg2stack() * VMRegImpl::stack_slot_size;
        if (!r_2->is_valid()) {
          // sign extend??? (loads 32 bits with sign extension)
          ldrsw(rscratch2, fromAddr);
          str(rscratch2, Address(sp, st_off));
        } else {
          ldr(rscratch2, fromAddr);
          str(rscratch2, Address(sp, st_off));
        }
      } else if (r_1->is_Register()) {  // Register argument
        Register r = r_1->as_Register();
        if (r_2->is_valid()) {
          ldr(r, fromAddr);
        } else {
          ldrw(r, fromAddr);
        }
      } else {                          // Floating-point register argument
        if (!r_2->is_valid()) {
          ldrs(r_1->as_FloatRegister(), fromAddr);
        } else {
          ldrd(r_1->as_FloatRegister(), fromAddr);
        }
      }
    }
6168   } while (vt != 0);
6169 
6170   if (mark_done && reg_state[from->value()] != reg_written) {
6171     // This is okay because no one else will write to that slot
6172     reg_state[from->value()] = reg_writable;
6173   }
6174   return done;
6175 }
6176