/*
 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"
#ifdef COMPILER1
#include "c1/c1_LIRAssembler.hpp"
#endif
#ifdef COMPILER2
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target - branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      uint64_t dest = (uint64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                    21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                   Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    uint64_t dest = (uint64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}
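
// For orientation, a hedged sketch of what a type 2 patch above does,
// assuming a hypothetical register x8 and an "adrp; add" pair at `branch`
// being repointed at `target`:
//
//   adrp x8, target_page    // bits 23:5 and 30:29 receive the signed
//                           // page delta (target >> 12) - (branch >> 12)
//   add  x8, x8, #lo12      // bits 21:10 receive target & 0xfff
//
// The other cases differ only in where the low 12 bits land: scaled into
// the ldr/str immediate for type 1, or (for type 3) a movk supplying the
// high half of a page-aligned target.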

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}
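
// For illustration, the two immediate-oop forms patch_oop distinguishes
// (x10 is a hypothetical destination register):
//
//   narrow (32-bit) oop:               wide (48-bit) oop:
//     movz x10, #(n >> 16), lsl #16      movz x10, #(d & 0xffff)
//     movk x10, #(n & 0xffff)            movk x10, #((d >> 16) & 0xffff), lsl #16
//                                        movk x10, #((d >> 32) & 0xffff), lsl #32
//
// The narrow form is recognized by the movz/lsl-#16 opcode bits
// (0b11010010101 in bits 31:21) and carries the upper 16 bits of the
// narrow oop in the first instruction.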

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return target_page plus the byte offset within the page. In the
      // third case we reconstruct bits 32-47 of the target from the movk
      // immediate. Otherwise we assume it is a page aligned relocation and
      // return the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
               Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    uint32_t *insns = (uint32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(uint64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (uint64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (uint64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}
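
// Roughly, the code safepoint_poll emits for the thread-local case:
//
//   ldr  rscratch1, [rthread, #polling_page_offset]
//   tbnz rscratch1, #log2(poll_bit), slow_path
//
// and for the global case an adrp/ldrw of SafepointSynchronize::_state
// followed by a cbnz to slow_path (relying on _not_synchronized == 0).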

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling_page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp and sp of the last Java frame have to be
// recorded in the (thread-local) JavaThread object. When leaving C land,
// the last Java fp has to be reset to 0. This is required to allow proper
// stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  assert(last_java_pc != NULL, "must provide a valid PC");

  adr(scratch, last_java_pc);
  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb (asserted above), which is
    // within the +/-4GB reach of ADRP.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}
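
// For comparison, the two shapes far_call chooses between, for a
// hypothetical target T (far_jump is identical with b/br in place of
// bl/blr):
//
//   near (T within the +/-128MB reach of bl):   far:
//     bl   T                                      adrp tmp, T_page
//                                                 add  tmp, tmp, #(T & 0xfff)
//                                                 blr  tmp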

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb (asserted above), which is
    // within the +/-4GB reach of ADRP.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markWord::biased_lock_mask_in_place);
  cmp(tmp_reg, (u1)markWord::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markWord::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markWord::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markWord::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markWord::biased_lock_mask_in_place | markWord::age_mask_in_place | markWord::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markWord::biased_lock_mask_in_place);
  cmp(temp_reg, (u1)markWord::biased_lock_pattern);
  br(Assembler::EQ, done);
}
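
// The exit check above compiles to a short straight-line test, roughly:
//
//   ldr  temp, [obj, #mark_offset]
//   and  temp, temp, #biased_lock_mask
//   cmp  temp, #biased_lock_pattern
//   b.eq done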

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                   + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}
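
// The resulting trampoline layout, for reference (4-byte instructions, so
// the assert above pins the 64-bit literal at data_offset, 8 bytes in):
//
//   0x00  ldr rscratch1, .+8   // load <dest> from the literal below
//   0x04  br  rscratch1
//   0x08  <dest>               // 64-bit target address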

void MacroAssembler::emit_static_call_stub() {
  // CompiledDirectStaticCall::set_to_interpreted knows the
  // exact layout of this stub.

  isb();
  mov_metadata(rmethod, (Metadata*)NULL);

  // Jump to the entry point of the i2c stub.
  movptr(rscratch1, 0);
  br(rscratch1);
}
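
// A sketch of the stub this emits, assuming mov_metadata expands to the
// usual movz/movk/movk movptr form; both immediates are placeholders that
// CompiledDirectStaticCall::set_to_interpreted patches later:
//
//   isb
//   movz/movk/movk rmethod, <Metadata*>    // initially NULL
//   movz/movk/movk rscratch1, <i2c entry>  // initially 0
//   br rscratch1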

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}
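
// E.g. with a constant vtable index i this reduces to a single load,
// modulo any offset legalization done by form_address:
//
//   ldr method_result, [recv_klass, #(vtable_start + i * wordSize + method_offset)]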

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}
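
// The emitted scan loop, roughly (note the post-incremented address):
//
//   cbz   count, Lexit
// Lloop:
//   ldr   scratch, [addr], #wordSize
//   cmp   value, scratch
//   b.eq  Lexit
//   sub   count, count, #1
//   cbnz  count, Lloop
// Lexit:
//
// Callers such as check_klass_subtype_slow_path below then read the
// resulting flags: EQ means the value was found.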
1197 
1198 // scans count 4 byte words at [addr] for occurence of value,
1199 // generic
1200 void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
1201                                 Register scratch) {
1202   Label Lloop, Lexit;
1203   cbz(count, Lexit);
1204   bind(Lloop);
1205   ldrw(scratch, post(addr, wordSize));
1206   cmpw(value, scratch);
1207   br(EQ, Lexit);
1208   sub(count, count, 1);
1209   cbnz(count, Lloop);
1210   bind(Lexit);
1211 }
1212 
1213 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1214                                                    Register super_klass,
1215                                                    Register temp_reg,
1216                                                    Register temp2_reg,
1217                                                    Label* L_success,
1218                                                    Label* L_failure,
1219                                                    bool set_cond_codes) {
1220   assert_different_registers(sub_klass, super_klass, temp_reg);
1221   if (temp2_reg != noreg)
1222     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1223 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1224 
1225   Label L_fallthrough;
1226   int label_nulls = 0;
1227   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1228   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1229   assert(label_nulls <= 1, "at most one NULL in the batch");
1230 
1231   // a couple of useful fields in sub_klass:
1232   int ss_offset = in_bytes(Klass::secondary_supers_offset());
1233   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1234   Address secondary_supers_addr(sub_klass, ss_offset);
1235   Address super_cache_addr(     sub_klass, sc_offset);
1236 
1237   BLOCK_COMMENT("check_klass_subtype_slow_path");
1238 
1239   // Do a linear scan of the secondary super-klass chain.
1240   // This code is rarely used, so simplicity is a virtue here.
1241   // The repne_scan instruction uses fixed registers, which we must spill.
1242   // Don't worry too much about pre-existing connections with the input regs.
1243 
1244   assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
1245   assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)
1246 
1247   RegSet pushed_registers;
1248   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1249   if (!IS_A_TEMP(r5))    pushed_registers += r5;
1250 
1251   if (super_klass != r0 || UseCompressedOops) {
1252     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1253   }
1254 
1255   push(pushed_registers, sp);
1256 
1257   // Get super_klass value into r0 (even if it was in r5 or r2).
1258   if (super_klass != r0) {
1259     mov(r0, super_klass);
1260   }
1261 
1262 #ifndef PRODUCT
1263   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1264   Address pst_counter_addr(rscratch2);
1265   ldr(rscratch1, pst_counter_addr);
1266   add(rscratch1, rscratch1, 1);
1267   str(rscratch1, pst_counter_addr);
1268 #endif //PRODUCT
1269 
1270   // We will consult the secondary-super array.
1271   ldr(r5, secondary_supers_addr);
1272   // Load the array length.
1273   ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1274   // Skip to start of data.
1275   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1276 
1277   cmp(sp, zr); // Clear Z flag; SP is never zero
1278   // Scan R2 words at [R5] for an occurrence of R0.
1279   // Set NZ/Z based on last compare.
1280   repne_scan(r5, r0, r2, rscratch1);
1281 
1282   // Unspill the temp. registers:
1283   pop(pushed_registers, sp);
1284 
1285   br(Assembler::NE, *L_failure);
1286 
1287   // Success.  Cache the super we found and proceed in triumph.
1288   str(super_klass, super_cache_addr);
1289 
1290   if (L_success != &L_fallthrough) {
1291     b(*L_success);
1292   }
1293 
1294 #undef IS_A_TEMP
1295 
1296   bind(L_fallthrough);
1297 }
1298 
1299 void MacroAssembler::clinit_barrier(Register klass, Register scratch, Label* L_fast_path, Label* L_slow_path) {
1300   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
1301   assert_different_registers(klass, rthread, scratch);
1302 
1303   Label L_fallthrough, L_tmp;
1304   if (L_fast_path == NULL) {
1305     L_fast_path = &L_fallthrough;
1306   } else if (L_slow_path == NULL) {
1307     L_slow_path = &L_fallthrough;
1308   }
1309   // Fast path check: class is fully initialized
1310   ldrb(scratch, Address(klass, InstanceKlass::init_state_offset()));
1311   subs(zr, scratch, InstanceKlass::fully_initialized);
1312   br(Assembler::EQ, *L_fast_path);
1313 
1314   // Fast path check: current thread is initializer thread
1315   ldr(scratch, Address(klass, InstanceKlass::init_thread_offset()));
1316   cmp(rthread, scratch);
1317 
1318   if (L_slow_path == &L_fallthrough) {
1319     br(Assembler::EQ, *L_fast_path);
1320     bind(*L_slow_path);
1321   } else if (L_fast_path == &L_fallthrough) {
1322     br(Assembler::NE, *L_slow_path);
1323     bind(*L_fast_path);
1324   } else {
1325     Unimplemented();
1326   }
1327 }
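     // Typical use (an illustrative sketch; the registers and label are
     // hypothetical): before a static call or static field access,
     //   clinit_barrier(r10 /* klass */, rscratch1, &L_ok /* fast path */, NULL);
     // falls through into the slow path unless the class is fully initialized
     // or is being initialized by the current thread.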
1328 
1329 void MacroAssembler::verify_oop(Register reg, const char* s) {
1330   if (!VerifyOops) return;
1331 
1332   // Pass register number to verify_oop_subroutine
1333   const char* b = NULL;
1334   {
1335     ResourceMark rm;
1336     stringStream ss;
1337     ss.print("verify_oop: %s: %s", reg->name(), s);
1338     b = code_string(ss.as_string());
1339   }
1340   BLOCK_COMMENT("verify_oop {");
1341 
1342   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1343   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1344 
1345   mov(r0, reg);
1346   mov(rscratch1, (address)b);
1347 
1348   // call indirectly to solve generation ordering problem
1349   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1350   ldr(rscratch2, Address(rscratch2));
1351   blr(rscratch2);
1352 
1353   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1354   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1355 
1356   BLOCK_COMMENT("} verify_oop");
1357 }
1358 
1359 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1360   if (!VerifyOops) return;
1361 
1362   const char* b = NULL;
1363   {
1364     ResourceMark rm;
1365     stringStream ss;
1366     ss.print("verify_oop_addr: %s", s);
1367     b = code_string(ss.as_string());
1368   }
1369   BLOCK_COMMENT("verify_oop_addr {");
1370 
1371   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1372   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1373 
1374   // addr may contain sp so we will have to adjust it based on the
1375   // pushes that we just did.
1376   if (addr.uses(sp)) {
1377     lea(r0, addr);
1378     ldr(r0, Address(r0, 4 * wordSize));
1379   } else {
1380     ldr(r0, addr);
1381   }
1382   mov(rscratch1, (address)b);
1383 
1384   // call indirectly to solve generation ordering problem
1385   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1386   ldr(rscratch2, Address(rscratch2));
1387   blr(rscratch2);
1388 
1389   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1390   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1391 
1392   BLOCK_COMMENT("} verify_oop_addr");
1393 }
1394 
1395 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1396                                          int extra_slot_offset) {
1397   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1398   int stackElementSize = Interpreter::stackElementSize;
1399   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1400 #ifdef ASSERT
1401   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1402   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1403 #endif
1404   if (arg_slot.is_constant()) {
1405     return Address(esp, arg_slot.as_constant() * stackElementSize
1406                    + offset);
1407   } else {
1408     add(rscratch1, esp, arg_slot.as_register(),
1409         ext::uxtx, exact_log2(stackElementSize));
1410     return Address(rscratch1, offset);
1411   }
1412 }
1413 
1414 void MacroAssembler::call_VM_leaf_base(address entry_point,
1415                                        int number_of_arguments,
1416                                        Label *retaddr) {
1417   Label E, L;
1418 
1419   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1420 
1421   mov(rscratch1, entry_point);
1422   blr(rscratch1);
1423   if (retaddr)
1424     bind(*retaddr);
1425 
1426   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1427   maybe_isb();
1428 }
1429 
1430 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1431   call_VM_leaf_base(entry_point, number_of_arguments);
1432 }
1433 
1434 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1435   pass_arg0(this, arg_0);
1436   call_VM_leaf_base(entry_point, 1);
1437 }
1438 
1439 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1440   pass_arg0(this, arg_0);
1441   pass_arg1(this, arg_1);
1442   call_VM_leaf_base(entry_point, 2);
1443 }
1444 
1445 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1446                                   Register arg_1, Register arg_2) {
1447   pass_arg0(this, arg_0);
1448   pass_arg1(this, arg_1);
1449   pass_arg2(this, arg_2);
1450   call_VM_leaf_base(entry_point, 3);
1451 }
1452 
1453 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1454   pass_arg0(this, arg_0);
1455   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1456 }
1457 
1458 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1460   assert(arg_0 != c_rarg1, "smashed arg");
1461   pass_arg1(this, arg_1);
1462   pass_arg0(this, arg_0);
1463   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1464 }
1465 
1466 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1467   assert(arg_0 != c_rarg2, "smashed arg");
1468   assert(arg_1 != c_rarg2, "smashed arg");
1469   pass_arg2(this, arg_2);
1470   assert(arg_0 != c_rarg1, "smashed arg");
1471   pass_arg1(this, arg_1);
1472   pass_arg0(this, arg_0);
1473   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1474 }
1475 
1476 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1477   assert(arg_0 != c_rarg3, "smashed arg");
1478   assert(arg_1 != c_rarg3, "smashed arg");
1479   assert(arg_2 != c_rarg3, "smashed arg");
1480   pass_arg3(this, arg_3);
1481   assert(arg_0 != c_rarg2, "smashed arg");
1482   assert(arg_1 != c_rarg2, "smashed arg");
1483   pass_arg2(this, arg_2);
1484   assert(arg_0 != c_rarg1, "smashed arg");
1485   pass_arg1(this, arg_1);
1486   pass_arg0(this, arg_0);
1487   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1488 }
1489 
1490 void MacroAssembler::null_check(Register reg, int offset) {
1491   if (needs_explicit_null_check(offset)) {
1492     // provoke OS NULL exception if reg = NULL by
1493     // accessing M[reg] w/o changing any registers
1494     // NOTE: this is plenty to provoke a segv
1495     ldr(zr, Address(reg));
1496   } else {
1497     // nothing to do, (later) access of M[reg + offset]
1498     // will provoke OS NULL exception if reg = NULL
1499   }
1500 }
1501 
1502 // MacroAssembler protected routines needed to implement
1503 // public methods
1504 
1505 void MacroAssembler::mov(Register r, Address dest) {
1506   code_section()->relocate(pc(), dest.rspec());
1507   u_int64_t imm64 = (u_int64_t)dest.target();
1508   movptr(r, imm64);
1509 }
1510 
1511 // Move a constant pointer into r.  In AArch64 mode the virtual
1512 // address space is 48 bits in size, so we only need three
1513 // instructions to create a patchable instruction sequence that can
1514 // reach anywhere.
1515 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1516 #ifndef PRODUCT
1517   {
1518     char buffer[64];
1519     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1520     block_comment(buffer);
1521   }
1522 #endif
1523   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1524   movz(r, imm64 & 0xffff);
1525   imm64 >>= 16;
1526   movk(r, imm64 & 0xffff, 16);
1527   imm64 >>= 16;
1528   movk(r, imm64 & 0xffff, 32);
1529 }
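     // Illustrative example (not emitted by the build): movptr(r10, 0x123456789abcUL)
     // produces the fixed, patchable three-instruction sequence
     //   movz x10, #0x9abc
     //   movk x10, #0x5678, lsl #16
     //   movk x10, #0x1234, lsl #32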
1530 
1531 // Macro to mov replicated immediate to vector register.
1532 //  Vd will get the following values for different arrangements in T
1533 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1534 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1535 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1536 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1537 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1538 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1539 //   T1D/T2D: invalid
1540 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1541   assert(T != T1D && T != T2D, "invalid arrangement");
1542   if (T == T8B || T == T16B) {
1543     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1544     movi(Vd, T, imm32 & 0xff, 0);
1545     return;
1546   }
1547   u_int32_t nimm32 = ~imm32;
1548   if (T == T4H || T == T8H) {
1549     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1550     imm32 &= 0xffff;
1551     nimm32 &= 0xffff;
1552   }
1553   u_int32_t x = imm32;
1554   int movi_cnt = 0;
1555   int movn_cnt = 0;
1556   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1557   x = nimm32;
1558   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1559   if (movn_cnt < movi_cnt) imm32 = nimm32;
1560   unsigned lsl = 0;
1561   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1562   if (movn_cnt < movi_cnt)
1563     mvni(Vd, T, imm32 & 0xff, lsl);
1564   else
1565     movi(Vd, T, imm32 & 0xff, lsl);
1566   imm32 >>= 8; lsl += 8;
1567   while (imm32) {
1568     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1569     if (movn_cnt < movi_cnt)
1570       bici(Vd, T, imm32 & 0xff, lsl);
1571     else
1572       orri(Vd, T, imm32 & 0xff, lsl);
1573     lsl += 8; imm32 >>= 8;
1574   }
1575 }
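     // Worked example (illustrative, not part of the original comments):
     // mov(v0, T4S, 0x00120034) finds two non-zero bytes in imm32 but four in
     // ~imm32, so the MOVI/ORR form wins and the expansion is
     //   movi(v0, T4S, 0x34, 0);      // v0.4s = 0x00000034 in each lane
     //   orri(v0, T4S, 0x12, 16);     // OR in 0x00120000 in each lane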
1576 
1577 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1578 {
1579 #ifndef PRODUCT
1580   {
1581     char buffer[64];
1582     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1583     block_comment(buffer);
1584   }
1585 #endif
1586   if (operand_valid_for_logical_immediate(false, imm64)) {
1587     orr(dst, zr, imm64);
1588   } else {
1589     // we can use a combination of MOVZ or MOVN with
1590     // MOVK to build up the constant
1591     u_int64_t imm_h[4];
1592     int zero_count = 0;
1593     int neg_count = 0;
1594     int i;
1595     for (i = 0; i < 4; i++) {
1596       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1597       if (imm_h[i] == 0) {
1598         zero_count++;
1599       } else if (imm_h[i] == 0xffffL) {
1600         neg_count++;
1601       }
1602     }
1603     if (zero_count == 4) {
1604       // one MOVZ will do
1605       movz(dst, 0);
1606     } else if (neg_count == 4) {
1607       // one MOVN will do
1608       movn(dst, 0);
1609     } else if (zero_count == 3) {
1610       for (i = 0; i < 4; i++) {
1611         if (imm_h[i] != 0L) {
1612           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1613           break;
1614         }
1615       }
1616     } else if (neg_count == 3) {
1617       // one MOVN will do
1618       for (int i = 0; i < 4; i++) {
1619         if (imm_h[i] != 0xffffL) {
1620           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1621           break;
1622         }
1623       }
1624     } else if (zero_count == 2) {
1625       // one MOVZ and one MOVK will do
1626       for (i = 0; i < 3; i++) {
1627         if (imm_h[i] != 0L) {
1628           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1629           i++;
1630           break;
1631         }
1632       }
1633       for (;i < 4; i++) {
1634         if (imm_h[i] != 0L) {
1635           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1636         }
1637       }
1638     } else if (neg_count == 2) {
1639       // one MOVN and one MOVK will do
1640       for (i = 0; i < 4; i++) {
1641         if (imm_h[i] != 0xffffL) {
1642           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1643           i++;
1644           break;
1645         }
1646       }
1647       for (;i < 4; i++) {
1648         if (imm_h[i] != 0xffffL) {
1649           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1650         }
1651       }
1652     } else if (zero_count == 1) {
1653       // one MOVZ and two MOVKs will do
1654       for (i = 0; i < 4; i++) {
1655         if (imm_h[i] != 0L) {
1656           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1657           i++;
1658           break;
1659         }
1660       }
1661       for (;i < 4; i++) {
1662         if (imm_h[i] != 0x0L) {
1663           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1664         }
1665       }
1666     } else if (neg_count == 1) {
1667       // one MOVN and two MOVKs will do
1668       for (i = 0; i < 4; i++) {
1669         if (imm_h[i] != 0xffffL) {
1670           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1671           i++;
1672           break;
1673         }
1674       }
1675       for (;i < 4; i++) {
1676         if (imm_h[i] != 0xffffL) {
1677           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1678         }
1679       }
1680     } else {
1681       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1682       movz(dst, (u_int32_t)imm_h[0], 0);
1683       for (i = 1; i < 4; i++) {
1684         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1685       }
1686     }
1687   }
1688 }
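     // Worked example (illustrative): imm64 == 0xffffffff1234ffffUL has three
     // 0xffff halfwords (neg_count == 3), so a single MOVN suffices:
     //   movn(dst, 0x1234 ^ 0xffff, 16);   // ~(0xedcbUL << 16) == 0xffffffff1234ffff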
1689 
1690 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1691 {
1692 #ifndef PRODUCT
1693     {
1694       char buffer[64];
1695       snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1696       block_comment(buffer);
1697     }
1698 #endif
1699   if (operand_valid_for_logical_immediate(true, imm32)) {
1700     orrw(dst, zr, imm32);
1701   } else {
1702     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1703     // constant
1704     u_int32_t imm_h[2];
1705     imm_h[0] = imm32 & 0xffff;
1706     imm_h[1] = ((imm32 >> 16) & 0xffff);
1707     if (imm_h[0] == 0) {
1708       movzw(dst, imm_h[1], 16);
1709     } else if (imm_h[0] == 0xffff) {
1710       movnw(dst, imm_h[1] ^ 0xffff, 16);
1711     } else if (imm_h[1] == 0) {
1712       movzw(dst, imm_h[0], 0);
1713     } else if (imm_h[1] == 0xffff) {
1714       movnw(dst, imm_h[0] ^ 0xffff, 0);
1715     } else {
1716       // use a MOVZ and MOVK (makes it easier to debug)
1717       movzw(dst, imm_h[0], 0);
1718       movkw(dst, imm_h[1], 16);
1719     }
1720   }
1721 }
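     // e.g. (illustrative) imm32 == 0xffff0005: the high halfword is all ones,
     // so movnw(dst, 0x0005 ^ 0xffff, 0) emits one MOVN: ~0x0000fffa == 0xffff0005.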
1722 
1723 // Form an address from base + offset in Rd.  Rd may or may
1724 // not actually be used: you must use the Address that is returned.
1725 // It is up to you to ensure that the shift provided matches the size
1726 // of your data.
1727 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1728   if (Address::offset_ok_for_immed(byte_offset, shift))
1729     // It fits; no need for any heroics
1730     return Address(base, byte_offset);
1731 
1732   // Don't do anything clever with negative or misaligned offsets
1733   unsigned mask = (1 << shift) - 1;
1734   if (byte_offset < 0 || byte_offset & mask) {
1735     mov(Rd, byte_offset);
1736     add(Rd, base, Rd);
1737     return Address(Rd);
1738   }
1739 
1740   // See if we can do this with two 12-bit offsets
1741   {
1742     unsigned long word_offset = byte_offset >> shift;
1743     unsigned long masked_offset = word_offset & 0xfff000;
1744     if (Address::offset_ok_for_immed(word_offset - masked_offset, 0)
1745         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1746       add(Rd, base, masked_offset << shift);
1747       word_offset -= masked_offset;
1748       return Address(Rd, word_offset << shift);
1749     }
1750   }
1751 
1752   // Do it the hard way
1753   mov(Rd, byte_offset);
1754   add(Rd, base, Rd);
1755   return Address(Rd);
1756 }
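     // Worked example (an illustrative sketch): form_address(r10, r1, 0x123000, 3).
     // 0x123000 >> 3 == 0x24600; the high part 0x24000 (== 0x120000 in bytes) is
     // a valid add/sub immediate and the low part 0x600 fits the scaled-immediate
     // field, so this emits
     //   add x10, x1, #0x120000
     // and returns Address(r10, 0x3000).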
1757 
1758 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1759   if (UseLSE) {
1760     mov(tmp, 1);
1761     ldadd(Assembler::word, tmp, zr, counter_addr);
1762     return;
1763   }
1764   Label retry_load;
1765   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1766     prfm(Address(counter_addr), PSTL1STRM);
1767   bind(retry_load);
1768   // flush and load exclusive from the memory location
1769   ldxrw(tmp, counter_addr);
1770   addw(tmp, tmp, 1);
1771   // if we store+flush with no intervening write tmp2 will be zero
1772   stxrw(tmp2, tmp, counter_addr);
1773   cbnzw(tmp2, retry_load);
1774 }
1775 
1776 
1777 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1778                                     bool want_remainder, Register scratch)
1779 {
1780   // Full implementation of Java idiv and irem.  The function
1781   // returns the (pc) offset of the div instruction - may be needed
1782   // for implicit exceptions.
1783   //
1784   // constraint : ra/rb =/= scratch
1785   //         normal case
1786   //
1787   // input : ra: dividend
1788   //         rb: divisor
1789   //
1790   // result: either
1791   //         quotient  (= ra idiv rb)
1792   //         remainder (= ra irem rb)
1793 
1794   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1795 
1796   int idivl_offset = offset();
1797   if (! want_remainder) {
1798     sdivw(result, ra, rb);
1799   } else {
1800     sdivw(scratch, ra, rb);
1801     Assembler::msubw(result, scratch, rb, ra);
1802   }
1803 
1804   return idivl_offset;
1805 }
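     // A note on the remainder path above (and in corrected_idivq below):
     // msubw(result, scratch, rb, ra) computes ra - (ra / rb) * rb. Because
     // sdiv rounds toward zero this matches Java's irem, e.g. -7 / 2 == -3
     // and -7 % 2 == -7 - (-3 * 2) == -1.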
1806 
1807 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1808                                     bool want_remainder, Register scratch)
1809 {
1810   // Full implementation of Java ldiv and lrem.  The function
1811   // returns the (pc) offset of the div instruction - may be needed
1812   // for implicit exceptions.
1813   //
1814   // constraint : ra/rb =/= scratch
1815   //         normal case
1816   //
1817   // input : ra: dividend
1818   //         rb: divisor
1819   //
1820   // result: either
1821   //         quotient  (= ra idiv rb)
1822   //         remainder (= ra irem rb)
1823 
1824   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1825 
1826   int idivq_offset = offset();
1827   if (! want_remainder) {
1828     sdiv(result, ra, rb);
1829   } else {
1830     sdiv(scratch, ra, rb);
1831     Assembler::msub(result, scratch, rb, ra);
1832   }
1833 
1834   return idivq_offset;
1835 }
1836 
1837 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1838   address prev = pc() - NativeMembar::instruction_size;
1839   address last = code()->last_insn();
1840   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1841     NativeMembar *bar = NativeMembar_at(prev);
1842     // We are merging two memory barrier instructions.  On AArch64 we
1843     // can do this simply by ORing them together.
1844     bar->set_kind(bar->get_kind() | order_constraint);
1845     BLOCK_COMMENT("merged membar");
1846   } else {
1847     code()->set_last_insn(pc());
1848     dmb(Assembler::barrier(order_constraint));
1849   }
1850 }
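     // Illustrative example (not in the original source): two back-to-back calls
     //   membar(Assembler::LoadStore);
     //   membar(Assembler::StoreStore);
     // emit a single DMB. The second call sees the barrier it just generated as
     // the immediately preceding instruction and ORs the new constraint into it
     // instead of emitting another dmb.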
1851 
1852 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1853   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1854     merge_ldst(rt, adr, size_in_bytes, is_store);
1855     code()->clear_last_insn();
1856     return true;
1857   } else {
1858     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
1859     const unsigned mask = size_in_bytes - 1;
1860     if (adr.getMode() == Address::base_plus_offset &&
1861         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1862       code()->set_last_insn(pc());
1863     }
1864     return false;
1865   }
1866 }
1867 
1868 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1869   // We always try to merge two adjacent loads into one ldp.
1870   if (!try_merge_ldst(Rx, adr, 8, false)) {
1871     Assembler::ldr(Rx, adr);
1872   }
1873 }
1874 
1875 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1876   // We always try to merge two adjacent loads into one ldp.
1877   if (!try_merge_ldst(Rw, adr, 4, false)) {
1878     Assembler::ldrw(Rw, adr);
1879   }
1880 }
1881 
1882 void MacroAssembler::str(Register Rx, const Address &adr) {
1883   // We always try to merge two adjacent stores into one stp.
1884   if (!try_merge_ldst(Rx, adr, 8, true)) {
1885     Assembler::str(Rx, adr);
1886   }
1887 }
1888 
1889 void MacroAssembler::strw(Register Rw, const Address &adr) {
1890   // We always try to merge two adjacent stores into one stp.
1891   if (!try_merge_ldst(Rw, adr, 4, true)) {
1892     Assembler::strw(Rw, adr);
1893   }
1894 }
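     // Illustrative example (a sketch, not in the original source): thanks to
     // the last_insn bookkeeping in try_merge_ldst, the sequence
     //   str(r1, Address(sp, 16));
     //   str(r2, Address(sp, 24));
     // is rewritten by merge_ldst (further down) into the single instruction
     //   stp x1, x2, [sp, #16]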
1895 
1896 // MacroAssembler routines found actually to be needed
1897 
1898 void MacroAssembler::push(Register src)
1899 {
1900   str(src, Address(pre(esp, -1 * wordSize)));
1901 }
1902 
1903 void MacroAssembler::pop(Register dst)
1904 {
1905   ldr(dst, Address(post(esp, 1 * wordSize)));
1906 }
1907 
1908 // Note: load_unsigned_short used to be called load_unsigned_word.
1909 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1910   int off = offset();
1911   ldrh(dst, src);
1912   return off;
1913 }
1914 
1915 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1916   int off = offset();
1917   ldrb(dst, src);
1918   return off;
1919 }
1920 
1921 int MacroAssembler::load_signed_short(Register dst, Address src) {
1922   int off = offset();
1923   ldrsh(dst, src);
1924   return off;
1925 }
1926 
1927 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1928   int off = offset();
1929   ldrsb(dst, src);
1930   return off;
1931 }
1932 
1933 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1934   int off = offset();
1935   ldrshw(dst, src);
1936   return off;
1937 }
1938 
1939 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1940   int off = offset();
1941   ldrsbw(dst, src);
1942   return off;
1943 }
1944 
1945 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1946   switch (size_in_bytes) {
1947   case  8:  ldr(dst, src); break;
1948   case  4:  ldrw(dst, src); break;
1949   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1950   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1951   default:  ShouldNotReachHere();
1952   }
1953 }
1954 
1955 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1956   switch (size_in_bytes) {
1957   case  8:  str(src, dst); break;
1958   case  4:  strw(src, dst); break;
1959   case  2:  strh(src, dst); break;
1960   case  1:  strb(src, dst); break;
1961   default:  ShouldNotReachHere();
1962   }
1963 }
1964 
1965 void MacroAssembler::decrementw(Register reg, int value)
1966 {
1967   if (value < 0)  { incrementw(reg, -value);      return; }
1968   if (value == 0) {                               return; }
1969   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1970   /* else */ {
1971     guarantee(reg != rscratch2, "invalid dst for register decrement");
1972     movw(rscratch2, (unsigned)value);
1973     subw(reg, reg, rscratch2);
1974   }
1975 }
1976 
1977 void MacroAssembler::decrement(Register reg, int value)
1978 {
1979   if (value < 0)  { increment(reg, -value);      return; }
1980   if (value == 0) {                              return; }
1981   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1982   /* else */ {
1983     assert(reg != rscratch2, "invalid dst for register decrement");
1984     mov(rscratch2, (unsigned long)value);
1985     sub(reg, reg, rscratch2);
1986   }
1987 }
1988 
1989 void MacroAssembler::decrementw(Address dst, int value)
1990 {
1991   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1992   if (dst.getMode() == Address::literal) {
1993     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1994     lea(rscratch2, dst);
1995     dst = Address(rscratch2);
1996   }
1997   ldrw(rscratch1, dst);
1998   decrementw(rscratch1, value);
1999   strw(rscratch1, dst);
2000 }
2001 
2002 void MacroAssembler::decrement(Address dst, int value)
2003 {
2004   assert(!dst.uses(rscratch1), "invalid address for decrement");
2005   if (dst.getMode() == Address::literal) {
2006     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2007     lea(rscratch2, dst);
2008     dst = Address(rscratch2);
2009   }
2010   ldr(rscratch1, dst);
2011   decrement(rscratch1, value);
2012   str(rscratch1, dst);
2013 }
2014 
2015 void MacroAssembler::incrementw(Register reg, int value)
2016 {
2017   if (value < 0)  { decrementw(reg, -value);      return; }
2018   if (value == 0) {                               return; }
2019   if (value < (1 << 12)) { addw(reg, reg, value); return; }
2020   /* else */ {
2021     assert(reg != rscratch2, "invalid dst for register increment");
2022     movw(rscratch2, (unsigned)value);
2023     addw(reg, reg, rscratch2);
2024   }
2025 }
2026 
2027 void MacroAssembler::increment(Register reg, int value)
2028 {
2029   if (value < 0)  { decrement(reg, -value);      return; }
2030   if (value == 0) {                              return; }
2031   if (value < (1 << 12)) { add(reg, reg, value); return; }
2032   /* else */ {
2033     assert(reg != rscratch2, "invalid dst for register increment");
2034     movw(rscratch2, (unsigned)value);
2035     add(reg, reg, rscratch2);
2036   }
2037 }
2038 
2039 void MacroAssembler::incrementw(Address dst, int value)
2040 {
2041   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2042   if (dst.getMode() == Address::literal) {
2043     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2044     lea(rscratch2, dst);
2045     dst = Address(rscratch2);
2046   }
2047   ldrw(rscratch1, dst);
2048   incrementw(rscratch1, value);
2049   strw(rscratch1, dst);
2050 }
2051 
2052 void MacroAssembler::increment(Address dst, int value)
2053 {
2054   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2055   if (dst.getMode() == Address::literal) {
2056     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2057     lea(rscratch2, dst);
2058     dst = Address(rscratch2);
2059   }
2060   ldr(rscratch1, dst);
2061   increment(rscratch1, value);
2062   str(rscratch1, dst);
2063 }
2064 
2065 
2066 void MacroAssembler::pusha() {
2067   push(0x7fffffff, sp);
2068 }
2069 
2070 void MacroAssembler::popa() {
2071   pop(0x7fffffff, sp);
2072 }
2073 
2074 // Push lots of registers in the bit set supplied.  Don't push sp.
2075 // Return the number of words pushed
2076 int MacroAssembler::push(unsigned int bitset, Register stack) {
2077   int words_pushed = 0;
2078 
2079   // Scan bitset to accumulate register pairs
2080   unsigned char regs[32];
2081   int count = 0;
2082   for (int reg = 0; reg <= 30; reg++) {
2083     if (1 & bitset)
2084       regs[count++] = reg;
2085     bitset >>= 1;
2086   }
2087   regs[count++] = zr->encoding_nocheck();
2088   count &= ~1;  // Only push an even number of regs
2089 
2090   if (count) {
2091     stp(as_Register(regs[0]), as_Register(regs[1]),
2092        Address(pre(stack, -count * wordSize)));
2093     words_pushed += 2;
2094   }
2095   for (int i = 2; i < count; i += 2) {
2096     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2097        Address(stack, i * wordSize));
2098     words_pushed += 2;
2099   }
2100 
2101   assert(words_pushed == count, "oops, pushed != count");
2102 
2103   return count;
2104 }
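     // Worked example (illustrative): push(0x7, sp) collects r0, r1 and r2,
     // pads the list with zr to an even length, and emits
     //   stp x0, x1, [sp, #-32]!
     //   stp x2, xzr, [sp, #16]
     // returning 4, the number of words pushed (including the zr padding slot).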
2105 
2106 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2107   int words_pushed = 0;
2108 
2109   // Scan bitset to accumulate register pairs
2110   unsigned char regs[32];
2111   int count = 0;
2112   for (int reg = 0; reg <= 30; reg++) {
2113     if (1 & bitset)
2114       regs[count++] = reg;
2115     bitset >>= 1;
2116   }
2117   regs[count++] = zr->encoding_nocheck();
2118   count &= ~1;
2119 
2120   for (int i = 2; i < count; i += 2) {
2121     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2122        Address(stack, i * wordSize));
2123     words_pushed += 2;
2124   }
2125   if (count) {
2126     ldp(as_Register(regs[0]), as_Register(regs[1]),
2127        Address(post(stack, count * wordSize)));
2128     words_pushed += 2;
2129   }
2130 
2131   assert(words_pushed == count, "oops, pushed != count");
2132 
2133   return count;
2134 }
2135 
2136 // Push lots of registers in the bit set supplied.  Don't push sp.
2137 // Return the number of words pushed
2138 int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
2139   int words_pushed = 0;
2140 
2141   // Scan bitset to accumulate register pairs
2142   unsigned char regs[32];
2143   int count = 0;
2144   for (int reg = 0; reg <= 31; reg++) {
2145     if (1 & bitset)
2146       regs[count++] = reg;
2147     bitset >>= 1;
2148   }
2149   regs[count++] = zr->encoding_nocheck();
2150   count &= ~1;  // Only push an even number of regs
2151 
2152   // Always pushing full 128 bit registers.
2153   if (count) {
2154     stpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(pre(stack, -count * wordSize * 2)));
2155     words_pushed += 2;
2156   }
2157   for (int i = 2; i < count; i += 2) {
2158     stpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2));
2159     words_pushed += 2;
2160   }
2161 
2162   assert(words_pushed == count, "oops, pushed != count");
2163   return count;
2164 }
2165 
2166 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
2167   int words_pushed = 0;
2168 
2169   // Scan bitset to accumulate register pairs
2170   unsigned char regs[32];
2171   int count = 0;
2172   for (int reg = 0; reg <= 31; reg++) {
2173     if (1 & bitset)
2174       regs[count++] = reg;
2175     bitset >>= 1;
2176   }
2177   regs[count++] = zr->encoding_nocheck();
2178   count &= ~1;
2179 
2180   for (int i = 2; i < count; i += 2) {
2181     ldpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2));
2182     words_pushed += 2;
2183   }
2184   if (count) {
2185     ldpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(post(stack, count * wordSize * 2)));
2186     words_pushed += 2;
2187   }
2188 
2189   assert(words_pushed == count, "oops, pushed != count");
2190 
2191   return count;
2192 }
2193 
2194 #ifdef ASSERT
2195 void MacroAssembler::verify_heapbase(const char* msg) {
2196 #if 0
2197   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2198   assert (Universe::heap() != NULL, "java heap should be initialized");
2199   if (CheckCompressedOops) {
2200     Label ok;
2201     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2202     cmpptr(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
2203     br(Assembler::EQ, ok);
2204     stop(msg);
2205     bind(ok);
2206     pop(1 << rscratch1->encoding(), sp);
2207   }
2208 #endif
2209 }
2210 #endif
2211 
2212 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2213   Label done, not_weak;
2214   cbz(value, done);           // Use NULL as-is.
2215 
2216   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
2217   tbz(value, 0, not_weak);    // Test for jweak tag.
2218 
2219   // Resolve jweak.
2220   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2221                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2222   verify_oop(value);
2223   b(done);
2224 
2225   bind(not_weak);
2226   // Resolve (untagged) jobject.
2227   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2228   verify_oop(value);
2229   bind(done);
2230 }
2231 
2232 void MacroAssembler::stop(const char* msg) {
2233   address ip = pc();
2234   pusha();
2235   mov(c_rarg0, (address)msg);
2236   mov(c_rarg1, (address)ip);
2237   mov(c_rarg2, sp);
2238   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2239   blr(c_rarg3);
2240   hlt(0);
2241 }
2242 
2243 void MacroAssembler::warn(const char* msg) {
2244   pusha();
2245   mov(c_rarg0, (address)msg);
2246   mov(lr, CAST_FROM_FN_PTR(address, warning));
2247   blr(lr);
2248   popa();
2249 }
2250 
2251 void MacroAssembler::unimplemented(const char* what) {
2252   const char* buf = NULL;
2253   {
2254     ResourceMark rm;
2255     stringStream ss;
2256     ss.print("unimplemented: %s", what);
2257     buf = code_string(ss.as_string());
2258   }
2259   stop(buf);
2260 }
2261 
2262 // If a constant does not fit in an immediate field, generate some
2263 // number of MOV instructions and then perform the operation.
2264 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2265                                            add_sub_imm_insn insn1,
2266                                            add_sub_reg_insn insn2) {
2267   assert(Rd != zr, "Rd = zr and not setting flags?");
2268   if (operand_valid_for_add_sub_immediate((int)imm)) {
2269     (this->*insn1)(Rd, Rn, imm);
2270   } else {
2271     if (uabs(imm) < (1 << 24)) {
2272        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2273        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2274     } else {
2275        assert_different_registers(Rd, Rn);
2276        mov(Rd, (uint64_t)imm);
2277        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2278     }
2279   }
2280 }
2281 
2282 // Separate version which sets the flags. Optimisations are more restricted
2283 // because we must set the flags correctly.
2284 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2285                                            add_sub_imm_insn insn1,
2286                                            add_sub_reg_insn insn2) {
2287   if (operand_valid_for_add_sub_immediate((int)imm)) {
2288     (this->*insn1)(Rd, Rn, imm);
2289   } else {
2290     assert_different_registers(Rd, Rn);
2291     assert(Rd != zr, "overflow in immediate operand");
2292     mov(Rd, (uint64_t)imm);
2293     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2294   }
2295 }
2296 
2297 
2298 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2299   if (increment.is_register()) {
2300     add(Rd, Rn, increment.as_register());
2301   } else {
2302     add(Rd, Rn, increment.as_constant());
2303   }
2304 }
2305 
2306 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2307   if (increment.is_register()) {
2308     addw(Rd, Rn, increment.as_register());
2309   } else {
2310     addw(Rd, Rn, increment.as_constant());
2311   }
2312 }
2313 
2314 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2315   if (decrement.is_register()) {
2316     sub(Rd, Rn, decrement.as_register());
2317   } else {
2318     sub(Rd, Rn, decrement.as_constant());
2319   }
2320 }
2321 
2322 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2323   if (decrement.is_register()) {
2324     subw(Rd, Rn, decrement.as_register());
2325   } else {
2326     subw(Rd, Rn, decrement.as_constant());
2327   }
2328 }
2329 
2330 void MacroAssembler::reinit_heapbase()
2331 {
2332   if (UseCompressedOops) {
2333     if (Universe::is_fully_initialized()) {
2334       mov(rheapbase, CompressedOops::ptrs_base());
2335     } else {
2336       lea(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
2337       ldr(rheapbase, Address(rheapbase));
2338     }
2339   }
2340 }
2341 
2342 // This simulates the behaviour of the x86 cmpxchg instruction using a
2343 // load linked/store conditional pair. We use the acquire/release
2344 // versions of these instructions so that we flush pending writes as
2345 // per Java semantics.
2346 
2347 // N.B. the x86 version assumes the old value to be compared against is
2348 // in rax and updates rax with the value located in memory if the
2349 // cmpxchg fails. We supply a register for the old value explicitly.
2350 
2351 // The AArch64 load linked/store conditional instructions do not
2352 // accept an offset, so, unlike x86, we must provide a plain register
2353 // to identify the memory word to be compared/exchanged rather than a
2354 // register+offset Address.
2355 
2356 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2357                                 Label &succeed, Label *fail) {
2358   // oldv holds comparison value
2359   // newv holds value to write in exchange
2360   // addr identifies memory word to compare against/update
2361   if (UseLSE) {
2362     mov(tmp, oldv);
2363     casal(Assembler::xword, oldv, newv, addr);
2364     cmp(tmp, oldv);
2365     br(Assembler::EQ, succeed);
2366     membar(AnyAny);
2367   } else {
2368     Label retry_load, nope;
2369     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2370       prfm(Address(addr), PSTL1STRM);
2371     bind(retry_load);
2372     // flush and load exclusive from the memory location
2373     // and fail if it is not what we expect
2374     ldaxr(tmp, addr);
2375     cmp(tmp, oldv);
2376     br(Assembler::NE, nope);
2377     // if we store+flush with no intervening write tmp will be zero
2378     stlxr(tmp, newv, addr);
2379     cbzw(tmp, succeed);
2380     // retry so we only ever return after a load fails to compare
2381     // ensures we don't return a stale value after a failed write.
2382     b(retry_load);
2383     // if the memory word differs we return it in oldv and signal a fail
2384     bind(nope);
2385     membar(AnyAny);
2386     mov(oldv, tmp);
2387   }
2388   if (fail)
2389     b(*fail);
2390 }
2391 
2392 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2393                                         Label &succeed, Label *fail) {
2394   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2395   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2396 }
2397 
2398 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2399                                 Label &succeed, Label *fail) {
2400   // oldv holds comparison value
2401   // newv holds value to write in exchange
2402   // addr identifies memory word to compare against/update
2403   // tmp returns 0/1 for success/failure
2404   if (UseLSE) {
2405     mov(tmp, oldv);
2406     casal(Assembler::word, oldv, newv, addr);
2407     cmp(tmp, oldv);
2408     br(Assembler::EQ, succeed);
2409     membar(AnyAny);
2410   } else {
2411     Label retry_load, nope;
2412     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2413       prfm(Address(addr), PSTL1STRM);
2414     bind(retry_load);
2415     // flush and load exclusive from the memory location
2416     // and fail if it is not what we expect
2417     ldaxrw(tmp, addr);
2418     cmp(tmp, oldv);
2419     br(Assembler::NE, nope);
2420     // if we store+flush with no intervening write tmp will be zero
2421     stlxrw(tmp, newv, addr);
2422     cbzw(tmp, succeed);
2423     // retry so we only ever return after a load fails to compare
2424     // ensures we don't return a stale value after a failed write.
2425     b(retry_load);
2426     // if the memory word differs we return it in oldv and signal a fail
2427     bind(nope);
2428     membar(AnyAny);
2429     mov(oldv, tmp);
2430   }
2431   if (fail)
2432     b(*fail);
2433 }
2434 
2435 // A generic CAS; success or failure is in the EQ flag.  A weak CAS
2436 // doesn't retry and may fail spuriously.  If the oldval is wanted,
2437 // pass a register for the result, otherwise pass noreg.
2438 
2439 // Clobbers rscratch1
2440 void MacroAssembler::cmpxchg(Register addr, Register expected,
2441                              Register new_val,
2442                              enum operand_size size,
2443                              bool acquire, bool release,
2444                              bool weak,
2445                              Register result) {
2446   if (result == noreg)  result = rscratch1;
2447   BLOCK_COMMENT("cmpxchg {");
2448   if (UseLSE) {
2449     mov(result, expected);
2450     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2451     compare_eq(result, expected, size);
2452   } else {
2453     Label retry_load, done;
2454     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2455       prfm(Address(addr), PSTL1STRM);
2456     bind(retry_load);
2457     load_exclusive(result, addr, size, acquire);
2458     compare_eq(result, expected, size);
2459     br(Assembler::NE, done);
2460     store_exclusive(rscratch1, new_val, addr, size, release);
2461     if (weak) {
2462       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2463     } else {
2464       cbnzw(rscratch1, retry_load);
2465     }
2466     bind(done);
2467   }
2468   BLOCK_COMMENT("} cmpxchg");
2469 }
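     // Typical use (an illustrative sketch; L_ok is a hypothetical label): a
     // strong, sequentially consistent word-sized CAS whose old value is not
     // needed; EQ afterwards means the CAS succeeded.
     //   cmpxchg(addr, expected, new_val, Assembler::word,
     //           /*acquire*/ true, /*release*/ true, /*weak*/ false, noreg);
     //   br(Assembler::EQ, L_ok);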
2470 
2471 // A generic comparison. Only compares for equality, clobbers rscratch1.
2472 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2473   if (size == xword) {
2474     cmp(rm, rn);
2475   } else if (size == word) {
2476     cmpw(rm, rn);
2477   } else if (size == halfword) {
2478     eorw(rscratch1, rm, rn);
2479     ands(zr, rscratch1, 0xffff);
2480   } else if (size == byte) {
2481     eorw(rscratch1, rm, rn);
2482     ands(zr, rscratch1, 0xff);
2483   } else {
2484     ShouldNotReachHere();
2485   }
2486 }
2487 
2488 
2489 static bool different(Register a, RegisterOrConstant b, Register c) {
2490   if (b.is_constant())
2491     return a != c;
2492   else
2493     return a != b.as_register() && a != c && b.as_register() != c;
2494 }
2495 
2496 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2497 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2498   if (UseLSE) {                                                         \
2499     prev = prev->is_valid() ? prev : zr;                                \
2500     if (incr.is_register()) {                                           \
2501       AOP(sz, incr.as_register(), prev, addr);                          \
2502     } else {                                                            \
2503       mov(rscratch2, incr.as_constant());                               \
2504       AOP(sz, rscratch2, prev, addr);                                   \
2505     }                                                                   \
2506     return;                                                             \
2507   }                                                                     \
2508   Register result = rscratch2;                                          \
2509   if (prev->is_valid())                                                 \
2510     result = different(prev, incr, addr) ? prev : rscratch2;            \
2511                                                                         \
2512   Label retry_load;                                                     \
2513   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2514     prfm(Address(addr), PSTL1STRM);                                     \
2515   bind(retry_load);                                                     \
2516   LDXR(result, addr);                                                   \
2517   OP(rscratch1, result, incr);                                          \
2518   STXR(rscratch2, rscratch1, addr);                                     \
2519   cbnzw(rscratch2, retry_load);                                         \
2520   if (prev->is_valid() && prev != result) {                             \
2521     IOP(prev, rscratch1, incr);                                         \
2522   }                                                                     \
2523 }
2524 
2525 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2526 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2527 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2528 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2529 
2530 #undef ATOMIC_OP
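     // Illustrative expansion (a sketch): without LSE, atomic_addal(prev, incr, addr)
     // becomes the classic LL/SC loop
     //   retry:
     //     ldaxr  result, [addr]            // result is prev if safe, else rscratch2
     //     add    rscratch1, result, incr
     //     stlxr  status, rscratch1, [addr] // status == w(rscratch2)
     //     cbnz   status, retry
     // and, when prev aliases incr or addr, the old value is recovered afterwards
     // with the inverse op: sub(prev, rscratch1, incr). With UseLSE the whole
     // loop collapses into a single ldaddal.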
2531 
2532 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2533 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2534   if (UseLSE) {                                                         \
2535     prev = prev->is_valid() ? prev : zr;                                \
2536     AOP(sz, newv, prev, addr);                                          \
2537     return;                                                             \
2538   }                                                                     \
2539   Register result = rscratch2;                                          \
2540   if (prev->is_valid())                                                 \
2541     result = different(prev, newv, addr) ? prev : rscratch2;            \
2542                                                                         \
2543   Label retry_load;                                                     \
2544   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2545     prfm(Address(addr), PSTL1STRM);                                     \
2546   bind(retry_load);                                                     \
2547   LDXR(result, addr);                                                   \
2548   STXR(rscratch1, newv, addr);                                          \
2549   cbnzw(rscratch1, retry_load);                                         \
2550   if (prev->is_valid() && prev != result)                               \
2551     mov(prev, result);                                                  \
2552 }
2553 
2554 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2555 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2556 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2557 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2558 
2559 #undef ATOMIC_XCHG
2560 
2561 #ifndef PRODUCT
2562 extern "C" void findpc(intptr_t x);
2563 #endif
2564 
2565 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2566 {
2567   // In order to get locks to work, we need to fake an in_VM state
2568   if (ShowMessageBoxOnError) {
2569     JavaThread* thread = JavaThread::current();
2570     JavaThreadState saved_state = thread->thread_state();
2571     thread->set_thread_state(_thread_in_vm);
2572 #ifndef PRODUCT
2573     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2574       ttyLocker ttyl;
2575       BytecodeCounter::print();
2576     }
2577 #endif
2578     if (os::message_box(msg, "Execution stopped, print registers?")) {
2579       ttyLocker ttyl;
2580       tty->print_cr(" pc = 0x%016lx", pc);
2581 #ifndef PRODUCT
2582       tty->cr();
2583       findpc(pc);
2584       tty->cr();
2585 #endif
2586       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2587       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2588       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2589       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2590       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2591       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2592       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2593       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2594       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2595       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2596       tty->print_cr("r10 = 0x%016lx", regs[10]);
2597       tty->print_cr("r11 = 0x%016lx", regs[11]);
2598       tty->print_cr("r12 = 0x%016lx", regs[12]);
2599       tty->print_cr("r13 = 0x%016lx", regs[13]);
2600       tty->print_cr("r14 = 0x%016lx", regs[14]);
2601       tty->print_cr("r15 = 0x%016lx", regs[15]);
2602       tty->print_cr("r16 = 0x%016lx", regs[16]);
2603       tty->print_cr("r17 = 0x%016lx", regs[17]);
2604       tty->print_cr("r18 = 0x%016lx", regs[18]);
2605       tty->print_cr("r19 = 0x%016lx", regs[19]);
2606       tty->print_cr("r20 = 0x%016lx", regs[20]);
2607       tty->print_cr("r21 = 0x%016lx", regs[21]);
2608       tty->print_cr("r22 = 0x%016lx", regs[22]);
2609       tty->print_cr("r23 = 0x%016lx", regs[23]);
2610       tty->print_cr("r24 = 0x%016lx", regs[24]);
2611       tty->print_cr("r25 = 0x%016lx", regs[25]);
2612       tty->print_cr("r26 = 0x%016lx", regs[26]);
2613       tty->print_cr("r27 = 0x%016lx", regs[27]);
2614       tty->print_cr("r28 = 0x%016lx", regs[28]);
2615       tty->print_cr("r30 = 0x%016lx", regs[30]);
2616       tty->print_cr("r31 = 0x%016lx", regs[31]);
2617       BREAKPOINT;
2618     }
2619   }
2620   fatal("DEBUG MESSAGE: %s", msg);
2621 }
2622 
2623 void MacroAssembler::push_call_clobbered_registers() {
2624   int step = 4 * wordSize;
2625   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2626   sub(sp, sp, step);
2627   mov(rscratch1, -step);
2628   // Push v0-v7, v16-v31.
2629   for (int i = 31; i>= 4; i -= 4) {
2630     if (i <= v7->encoding() || i >= v16->encoding())
2631       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2632           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2633   }
2634   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2635       as_FloatRegister(3), T1D, Address(sp));
2636 }
2637 
2638 void MacroAssembler::pop_call_clobbered_registers() {
2639   for (int i = 0; i < 32; i += 4) {
2640     if (i <= v7->encoding() || i >= v16->encoding())
2641       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2642           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2643   }
2644 
2645   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2646 }
2647 
2648 void MacroAssembler::push_CPU_state(bool save_vectors) {
2649   int step = (save_vectors ? 8 : 4) * wordSize;
2650   push(0x3fffffff, sp);         // integer registers except lr & sp
2651   mov(rscratch1, -step);
2652   sub(sp, sp, step);
2653   for (int i = 28; i >= 4; i -= 4) {
2654     st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2655         as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2656   }
2657   st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2658 }
2659 
2660 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2661   int step = (restore_vectors ? 8 : 4) * wordSize;
2662   for (int i = 0; i <= 28; i += 4)
2663     ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2664         as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2665   pop(0x3fffffff, sp);         // integer registers except lr & sp
2666 }
2667 
2668 /**
2669  * Helpers for multiply_to_len().
2670  */
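     // Computes (final_dest_hi : dest_lo) = (dest_hi : dest_lo) + src1 + src2;
     // each adds/adc pair below propagates the 64-bit carry into the high word.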
2671 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2672                                      Register src1, Register src2) {
2673   adds(dest_lo, dest_lo, src1);
2674   adc(dest_hi, dest_hi, zr);
2675   adds(dest_lo, dest_lo, src2);
2676   adc(final_dest_hi, dest_hi, zr);
2677 }
2678 
2679 // Generate an address from (r + r1 extend offset).  "size" is the
2680 // size of the operand.  The result may be in rscratch2.
2681 Address MacroAssembler::offsetted_address(Register r, Register r1,
2682                                           Address::extend ext, int offset, int size) {
2683   if (offset || (ext.shift() % size != 0)) {
2684     lea(rscratch2, Address(r, r1, ext));
2685     return Address(rscratch2, offset);
2686   } else {
2687     return Address(r, r1, ext);
2688   }
2689 }
2690 
2691 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2692 {
2693   assert(offset >= 0, "spill to negative address?");
2694   // Offset reachable ?
2695   //   Not aligned - 9 bits signed offset
2696   //   Aligned - 12 bits unsigned offset shifted
2697   Register base = sp;
2698   if ((offset & (size-1)) && offset >= (1<<8)) {
2699     add(tmp, base, offset & ((1<<12)-1));
2700     base = tmp;
2701     offset &= -1u<<12;
2702   }
2703 
2704   if (offset >= (1<<12) * size) {
2705     add(tmp, base, offset & (((1<<12)-1)<<12));
2706     base = tmp;
2707     offset &= ~(((1<<12)-1)<<12);
2708   }
2709 
2710   return Address(base, offset);
2711 }
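     // Worked example (illustrative): spill_address(8, 0x21008, rscratch1).
     // The offset is 8-byte aligned but beyond the 12-bit scaled range
     // ((1 << 12) * 8 == 0x8000), so the 4K-aligned part is added up front:
     //   add tmp, sp, #0x21000
     // and Address(tmp, 8) is returned.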
2712 
2713 // Checks whether offset is aligned.
2714 // Returns true if it is, else false.
2715 bool MacroAssembler::merge_alignment_check(Register base,
2716                                            size_t size,
2717                                            long cur_offset,
2718                                            long prev_offset) const {
2719   if (AvoidUnalignedAccesses) {
2720     if (base == sp) {
2721       // Checks whether the low offset is aligned to a pair of registers.
2722       long pair_mask = size * 2 - 1;
2723       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2724       return (offset & pair_mask) == 0;
2725     } else { // If base is not sp, we can't guarantee the access is aligned.
2726       return false;
2727     }
2728   } else {
2729     long mask = size - 1;
2730     // Load/store pair instruction only supports element size aligned offset.
2731     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2732   }
2733 }
2734 
2735 // Checks whether current and previous loads/stores can be merged.
2736 // Returns true if it can be merged, else false.
2737 bool MacroAssembler::ldst_can_merge(Register rt,
2738                                     const Address &adr,
2739                                     size_t cur_size_in_bytes,
2740                                     bool is_store) const {
2741   address prev = pc() - NativeInstruction::instruction_size;
2742   address last = code()->last_insn();
2743 
2744   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2745     return false;
2746   }
2747 
2748   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2749     return false;
2750   }
2751 
2752   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2753   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2754 
2755   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2756   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2757 
2758   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2759     return false;
2760   }
2761 
2762   long max_offset = 63 * prev_size_in_bytes;
2763   long min_offset = -64 * prev_size_in_bytes;
2764 
2765   assert(prev_ldst->is_not_pre_post_index(), "merging pre-index or post-index accesses is not supported.");
2766 
2767   // Only same base can be merged.
2768   if (adr.base() != prev_ldst->base()) {
2769     return false;
2770   }
2771 
2772   long cur_offset = adr.offset();
2773   long prev_offset = prev_ldst->offset();
2774   size_t diff = abs(cur_offset - prev_offset);
2775   if (diff != prev_size_in_bytes) {
2776     return false;
2777   }
2778 
  // The following cases cannot be merged:
  // ldr x2, [x2, #8]
  // ldr x3, [x2, #16]
  // or:
  // ldr x2, [x3, #8]
  // ldr x2, [x3, #16]
  // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get a SIGILL.
2786   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2787     return false;
2788   }
2789 
2790   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2791   // Offset range must be in ldp/stp instruction's range.
2792   if (low_offset > max_offset || low_offset < min_offset) {
2793     return false;
2794   }
2795 
2796   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2797     return true;
2798   }
2799 
2800   return false;
2801 }
2802 
2803 // Merge current load/store with previous load/store into ldp/stp.
2804 void MacroAssembler::merge_ldst(Register rt,
2805                                 const Address &adr,
2806                                 size_t cur_size_in_bytes,
2807                                 bool is_store) {
2808 
  assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "cur and prev must be able to be merged.");
2810 
2811   Register rt_low, rt_high;
2812   address prev = pc() - NativeInstruction::instruction_size;
2813   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2814 
2815   long offset;
2816 
2817   if (adr.offset() < prev_ldst->offset()) {
2818     offset = adr.offset();
2819     rt_low = rt;
2820     rt_high = prev_ldst->target();
2821   } else {
2822     offset = prev_ldst->offset();
2823     rt_low = prev_ldst->target();
2824     rt_high = rt;
2825   }
2826 
2827   Address adr_p = Address(prev_ldst->base(), offset);
2828   // Overwrite previous generated binary.
2829   code_section()->set_end(prev);
2830 
2831   const int sz = prev_ldst->size_in_bytes();
2832   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
2833   if (!is_store) {
2834     BLOCK_COMMENT("merged ldr pair");
2835     if (sz == 8) {
2836       ldp(rt_low, rt_high, adr_p);
2837     } else {
2838       ldpw(rt_low, rt_high, adr_p);
2839     }
2840   } else {
2841     BLOCK_COMMENT("merged str pair");
2842     if (sz == 8) {
2843       stp(rt_low, rt_high, adr_p);
2844     } else {
2845       stpw(rt_low, rt_high, adr_p);
2846     }
2847   }
2848 }
2849 
2850 /**
 * Multiply 64-bit by 64-bit, first loop.
2852  */
2853 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2854                                            Register y, Register y_idx, Register z,
2855                                            Register carry, Register product,
2856                                            Register idx, Register kdx) {
2857   //
2858   //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2860   //    huge_128 product = y[idx] * x[xstart] + carry;
2861   //    z[kdx] = (jlong)product;
2862   //    carry  = (jlong)(product >>> 64);
2863   //  }
2864   //  z[xstart] = carry;
2865   //
2866 
2867   Label L_first_loop, L_first_loop_exit;
2868   Label L_one_x, L_one_y, L_multiply;
2869 
2870   subsw(xstart, xstart, 1);
2871   br(Assembler::MI, L_one_x);
2872 
2873   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2874   ldr(x_xstart, Address(rscratch1));
2875   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2876 
2877   bind(L_first_loop);
2878   subsw(idx, idx, 1);
2879   br(Assembler::MI, L_first_loop_exit);
2880   subsw(idx, idx, 1);
2881   br(Assembler::MI, L_one_y);
2882   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2883   ldr(y_idx, Address(rscratch1));
2884   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2885   bind(L_multiply);
2886 
2887   // AArch64 has a multiply-accumulate instruction that we can't use
2888   // here because it has no way to process carries, so we have to use
2889   // separate add and adc instructions.  Bah.
2890   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2891   mul(product, x_xstart, y_idx);
2892   adds(product, product, carry);
2893   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2894 
2895   subw(kdx, kdx, 2);
2896   ror(product, product, 32); // back to big-endian
2897   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2898 
2899   b(L_first_loop);
2900 
2901   bind(L_one_y);
2902   ldrw(y_idx, Address(y,  0));
2903   b(L_multiply);
2904 
2905   bind(L_one_x);
2906   ldrw(x_xstart, Address(x,  0));
2907   b(L_first_loop);
2908 
2909   bind(L_first_loop_exit);
2910 }
2911 
2912 /**
 * Multiply 128-bit by 128-bit. Unrolled inner loop.
2914  *
2915  */
2916 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2917                                              Register carry, Register carry2,
2918                                              Register idx, Register jdx,
2919                                              Register yz_idx1, Register yz_idx2,
2920                                              Register tmp, Register tmp3, Register tmp4,
2921                                              Register tmp6, Register product_hi) {
2922 
2923   //   jlong carry, x[], y[], z[];
2924   //   int kdx = ystart+1;
2925   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2926   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2927   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2928   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2929   //     carry  = (jlong)(tmp4 >>> 64);
2930   //     z[kdx+idx+1] = (jlong)tmp3;
2931   //     z[kdx+idx] = (jlong)tmp4;
2932   //   }
2933   //   idx += 2;
2934   //   if (idx > 0) {
2935   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2936   //     z[kdx+idx] = (jlong)yz_idx1;
2937   //     carry  = (jlong)(yz_idx1 >>> 64);
2938   //   }
2939   //
2940 
2941   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2942 
2943   lsrw(jdx, idx, 2);
2944 
2945   bind(L_third_loop);
2946 
2947   subsw(jdx, jdx, 1);
2948   br(Assembler::MI, L_third_loop_exit);
2949   subw(idx, idx, 4);
2950 
2951   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2952 
2953   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2954 
2955   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2956 
2957   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2958   ror(yz_idx2, yz_idx2, 32);
2959 
2960   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2961 
2962   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2963   umulh(tmp4, product_hi, yz_idx1);
2964 
2965   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2966   ror(rscratch2, rscratch2, 32);
2967 
2968   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2969   umulh(carry2, product_hi, yz_idx2);
2970 
2971   // propagate sum of both multiplications into carry:tmp4:tmp3
2972   adds(tmp3, tmp3, carry);
2973   adc(tmp4, tmp4, zr);
2974   adds(tmp3, tmp3, rscratch1);
2975   adcs(tmp4, tmp4, tmp);
2976   adc(carry, carry2, zr);
2977   adds(tmp4, tmp4, rscratch2);
2978   adc(carry, carry, zr);
2979 
2980   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2981   ror(tmp4, tmp4, 32);
2982   stp(tmp4, tmp3, Address(tmp6, 0));
2983 
2984   b(L_third_loop);
2985   bind (L_third_loop_exit);
2986 
2987   andw (idx, idx, 0x3);
2988   cbz(idx, L_post_third_loop_done);
2989 
2990   Label L_check_1;
2991   subsw(idx, idx, 2);
2992   br(Assembler::MI, L_check_1);
2993 
2994   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2995   ldr(yz_idx1, Address(rscratch1, 0));
2996   ror(yz_idx1, yz_idx1, 32);
2997   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2998   umulh(tmp4, product_hi, yz_idx1);
2999   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3000   ldr(yz_idx2, Address(rscratch1, 0));
3001   ror(yz_idx2, yz_idx2, 32);
3002 
3003   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
3004 
3005   ror(tmp3, tmp3, 32);
3006   str(tmp3, Address(rscratch1, 0));
3007 
3008   bind (L_check_1);
3009 
3010   andw (idx, idx, 0x1);
3011   subsw(idx, idx, 1);
3012   br(Assembler::MI, L_post_third_loop_done);
3013   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3014   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
3015   umulh(carry2, tmp4, product_hi);
3016   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3017 
3018   add2_with_carry(carry2, tmp3, tmp4, carry);
3019 
3020   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3021   extr(carry, carry2, tmp3, 32);
3022 
3023   bind(L_post_third_loop_done);
3024 }
3025 
3026 /**
 * Code for BigInteger::multiplyToLen() intrinsic.
3028  *
3029  * r0: x
3030  * r1: xlen
3031  * r2: y
3032  * r3: ylen
 * r4: z
3034  * r5: zlen
3035  * r10: tmp1
3036  * r11: tmp2
3037  * r12: tmp3
3038  * r13: tmp4
3039  * r14: tmp5
3040  * r15: tmp6
3041  * r16: tmp7
3042  *
3043  */
3044 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3045                                      Register z, Register zlen,
3046                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3047                                      Register tmp5, Register tmp6, Register product_hi) {
3048 
3049   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3050 
3051   const Register idx = tmp1;
3052   const Register kdx = tmp2;
3053   const Register xstart = tmp3;
3054 
3055   const Register y_idx = tmp4;
3056   const Register carry = tmp5;
3057   const Register product  = xlen;
3058   const Register x_xstart = zlen;  // reuse register
3059 
3060   // First Loop.
3061   //
3062   //  final static long LONG_MASK = 0xffffffffL;
3063   //  int xstart = xlen - 1;
3064   //  int ystart = ylen - 1;
3065   //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3067   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3068   //    z[kdx] = (int)product;
3069   //    carry = product >>> 32;
3070   //  }
3071   //  z[xstart] = (int)carry;
3072   //
3073 
3074   movw(idx, ylen);      // idx = ylen;
3075   movw(kdx, zlen);      // kdx = xlen+ylen;
3076   mov(carry, zr);       // carry = 0;
3077 
3078   Label L_done;
3079 
3080   movw(xstart, xlen);
3081   subsw(xstart, xstart, 1);
3082   br(Assembler::MI, L_done);
3083 
3084   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3085 
3086   Label L_second_loop;
3087   cbzw(kdx, L_second_loop);
3088 
3089   Label L_carry;
3090   subw(kdx, kdx, 1);
3091   cbzw(kdx, L_carry);
3092 
3093   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3094   lsr(carry, carry, 32);
3095   subw(kdx, kdx, 1);
3096 
3097   bind(L_carry);
3098   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3099 
3100   // Second and third (nested) loops.
3101   //
3102   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3103   //   carry = 0;
3104   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3105   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3106   //                    (z[k] & LONG_MASK) + carry;
3107   //     z[k] = (int)product;
3108   //     carry = product >>> 32;
3109   //   }
3110   //   z[i] = (int)carry;
3111   // }
3112   //
3113   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3114 
3115   const Register jdx = tmp1;
3116 
3117   bind(L_second_loop);
3118   mov(carry, zr);                // carry = 0;
3119   movw(jdx, ylen);               // j = ystart+1
3120 
3121   subsw(xstart, xstart, 1);      // i = xstart-1;
3122   br(Assembler::MI, L_done);
3123 
3124   str(z, Address(pre(sp, -4 * wordSize)));
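  // Spill-area layout (4 words, for reference):
  //   [sp, #0]  z       [sp, #8]  ylen
  //   [sp, #16] x       [sp, #24] xstart
  // z is saved now; ylen, x and xstart are stored just before the third
  // loop below, and all four words are popped again after it.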
3125 
3126   Label L_last_x;
3127   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3128   subsw(xstart, xstart, 1);       // i = xstart-1;
3129   br(Assembler::MI, L_last_x);
3130 
3131   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3132   ldr(product_hi, Address(rscratch1));
3133   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3134 
3135   Label L_third_loop_prologue;
3136   bind(L_third_loop_prologue);
3137 
3138   str(ylen, Address(sp, wordSize));
3139   stp(x, xstart, Address(sp, 2 * wordSize));
3140   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3141                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3142   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3143   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3144 
3145   addw(tmp3, xlen, 1);
3146   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3147   subsw(tmp3, tmp3, 1);
3148   br(Assembler::MI, L_done);
3149 
3150   lsr(carry, carry, 32);
3151   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3152   b(L_second_loop);
3153 
  // The following infrequently executed code is placed outside the loops.
3155   bind(L_last_x);
3156   ldrw(product_hi, Address(x,  0));
3157   b(L_third_loop_prologue);
3158 
3159   bind(L_done);
3160 }
3161 
// Code for BigInteger::mulAdd intrinsic
3163 // out     = r0
3164 // in      = r1
3165 // offset  = r2  (already out.length-offset)
3166 // len     = r3
3167 // k       = r4
3168 //
3169 // pseudo code from java implementation:
3170 // carry = 0;
3171 // offset = out.length-offset - 1;
3172 // for (int j=len-1; j >= 0; j--) {
3173 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3174 //     out[offset--] = (int)product;
3175 //     carry = product >>> 32;
3176 // }
3177 // return (int)carry;
3178 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3179       Register len, Register k) {
3180     Label LOOP, END;
3181     // pre-loop
    cmp(len, zr); // cmp, not cbz/cbnz: we use the condition twice => fewer branches
3183     csel(out, zr, out, Assembler::EQ);
3184     br(Assembler::EQ, END);
3185     add(in, in, len, LSL, 2); // in[j+1] address
3186     add(offset, out, offset, LSL, 2); // out[offset + 1] address
    mov(out, zr); // out now holds the carry
3188     BIND(LOOP);
3189     ldrw(rscratch1, Address(pre(in, -4)));
3190     madd(rscratch1, rscratch1, k, out);
3191     ldrw(rscratch2, Address(pre(offset, -4)));
3192     add(rscratch1, rscratch1, rscratch2);
3193     strw(rscratch1, Address(offset));
3194     lsr(out, rscratch1, 32);
3195     subs(len, len, 1);
3196     br(Assembler::NE, LOOP);
3197     BIND(END);
3198 }
3199 
3200 /**
3201  * Emits code to update CRC-32 with a byte value according to constants in table
3202  *
3203  * @param [in,out]crc   Register containing the crc.
3204  * @param [in]val       Register containing the byte to fold into the CRC.
3205  * @param [in]table     Register containing the table of crc constants.
3206  *
3207  * uint32_t crc;
3208  * val = crc_table[(val ^ crc) & 0xFF];
3209  * crc = val ^ (crc >> 8);
3210  *
3211  */
3212 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3213   eor(val, val, crc);
3214   andr(val, val, 0xff);
3215   ldrw(val, Address(table, val, Address::lsl(2)));
3216   eor(crc, val, crc, Assembler::LSR, 8);
3217 }
3218 
3219 /**
3220  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3221  *
3222  * @param [in,out]crc   Register containing the crc.
 * @param [in]v         Register containing the 32-bit value to fold into the CRC.
3224  * @param [in]table0    Register containing table 0 of crc constants.
3225  * @param [in]table1    Register containing table 1 of crc constants.
3226  * @param [in]table2    Register containing table 2 of crc constants.
3227  * @param [in]table3    Register containing table 3 of crc constants.
3228  *
3229  * uint32_t crc;
3230  *   v = crc ^ v
3231  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3232  *
3233  */
3234 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3235         Register table0, Register table1, Register table2, Register table3,
3236         bool upper) {
3237   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3238   uxtb(tmp, v);
3239   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3240   ubfx(tmp, v, 8, 8);
3241   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3242   eor(crc, crc, tmp);
3243   ubfx(tmp, v, 16, 8);
3244   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3245   eor(crc, crc, tmp);
3246   ubfx(tmp, v, 24, 8);
3247   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3248   eor(crc, crc, tmp);
3249 }
3250 
3251 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3252         Register len, Register tmp0, Register tmp1, Register tmp2,
3253         Register tmp3) {
3254     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3255     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3256 
3257     mvnw(crc, crc);
3258 
3259     subs(len, len, 128);
3260     br(Assembler::GE, CRC_by64_pre);
3261   BIND(CRC_less64);
3262     adds(len, len, 128-32);
3263     br(Assembler::GE, CRC_by32_loop);
3264   BIND(CRC_less32);
3265     adds(len, len, 32-4);
3266     br(Assembler::GE, CRC_by4_loop);
3267     adds(len, len, 4);
3268     br(Assembler::GT, CRC_by1_loop);
3269     b(L_exit);
3270 
3271   BIND(CRC_by32_loop);
3272     ldp(tmp0, tmp1, Address(post(buf, 16)));
3273     subs(len, len, 32);
3274     crc32x(crc, crc, tmp0);
3275     ldr(tmp2, Address(post(buf, 8)));
3276     crc32x(crc, crc, tmp1);
3277     ldr(tmp3, Address(post(buf, 8)));
3278     crc32x(crc, crc, tmp2);
3279     crc32x(crc, crc, tmp3);
3280     br(Assembler::GE, CRC_by32_loop);
3281     cmn(len, 32);
3282     br(Assembler::NE, CRC_less32);
3283     b(L_exit);
3284 
3285   BIND(CRC_by4_loop);
3286     ldrw(tmp0, Address(post(buf, 4)));
3287     subs(len, len, 4);
3288     crc32w(crc, crc, tmp0);
3289     br(Assembler::GE, CRC_by4_loop);
3290     adds(len, len, 4);
3291     br(Assembler::LE, L_exit);
3292   BIND(CRC_by1_loop);
3293     ldrb(tmp0, Address(post(buf, 1)));
3294     subs(len, len, 1);
3295     crc32b(crc, crc, tmp0);
3296     br(Assembler::GT, CRC_by1_loop);
3297     b(L_exit);
3298 
3299   BIND(CRC_by64_pre);
3300     sub(buf, buf, 8);
3301     ldp(tmp0, tmp1, Address(buf, 8));
3302     crc32x(crc, crc, tmp0);
3303     ldr(tmp2, Address(buf, 24));
3304     crc32x(crc, crc, tmp1);
3305     ldr(tmp3, Address(buf, 32));
3306     crc32x(crc, crc, tmp2);
3307     ldr(tmp0, Address(buf, 40));
3308     crc32x(crc, crc, tmp3);
3309     ldr(tmp1, Address(buf, 48));
3310     crc32x(crc, crc, tmp0);
3311     ldr(tmp2, Address(buf, 56));
3312     crc32x(crc, crc, tmp1);
3313     ldr(tmp3, Address(pre(buf, 64)));
3314 
3315     b(CRC_by64_loop);
3316 
3317     align(CodeEntryAlignment);
3318   BIND(CRC_by64_loop);
3319     subs(len, len, 64);
3320     crc32x(crc, crc, tmp2);
3321     ldr(tmp0, Address(buf, 8));
3322     crc32x(crc, crc, tmp3);
3323     ldr(tmp1, Address(buf, 16));
3324     crc32x(crc, crc, tmp0);
3325     ldr(tmp2, Address(buf, 24));
3326     crc32x(crc, crc, tmp1);
3327     ldr(tmp3, Address(buf, 32));
3328     crc32x(crc, crc, tmp2);
3329     ldr(tmp0, Address(buf, 40));
3330     crc32x(crc, crc, tmp3);
3331     ldr(tmp1, Address(buf, 48));
3332     crc32x(crc, crc, tmp0);
3333     ldr(tmp2, Address(buf, 56));
3334     crc32x(crc, crc, tmp1);
3335     ldr(tmp3, Address(pre(buf, 64)));
3336     br(Assembler::GE, CRC_by64_loop);
3337 
3338     // post-loop
3339     crc32x(crc, crc, tmp2);
3340     crc32x(crc, crc, tmp3);
3341 
3342     sub(len, len, 64);
3343     add(buf, buf, 8);
3344     cmn(len, 128);
3345     br(Assembler::NE, CRC_less64);
3346   BIND(L_exit);
3347     mvnw(crc, crc);
3348 }
3349 
3350 /**
3351  * @param crc   register containing existing CRC (32-bit)
3352  * @param buf   register pointing to input byte buffer (byte*)
3353  * @param len   register containing number of bytes
3354  * @param table register that will contain address of CRC table
3355  * @param tmp   scratch register
3356  */
3357 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3358         Register table0, Register table1, Register table2, Register table3,
3359         Register tmp, Register tmp2, Register tmp3) {
3360   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3361   unsigned long offset;
3362 
3363   if (UseCRC32) {
3364       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3365       return;
3366   }
3367 
3368     mvnw(crc, crc);
3369 
3370     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3371     if (offset) add(table0, table0, offset);
3372     add(table1, table0, 1*256*sizeof(juint));
3373     add(table2, table0, 2*256*sizeof(juint));
3374     add(table3, table0, 3*256*sizeof(juint));
3375 
3376   if (UseNeon) {
3377       cmp(len, (u1)64);
3378       br(Assembler::LT, L_by16);
3379       eor(v16, T16B, v16, v16);
3380 
3381     Label L_fold;
3382 
3383       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3384 
3385       ld1(v0, v1, T2D, post(buf, 32));
3386       ld1r(v4, T2D, post(tmp, 8));
3387       ld1r(v5, T2D, post(tmp, 8));
3388       ld1r(v6, T2D, post(tmp, 8));
3389       ld1r(v7, T2D, post(tmp, 8));
3390       mov(v16, T4S, 0, crc);
3391 
3392       eor(v0, T16B, v0, v16);
3393       sub(len, len, 64);
3394 
3395     BIND(L_fold);
3396       pmull(v22, T8H, v0, v5, T8B);
3397       pmull(v20, T8H, v0, v7, T8B);
3398       pmull(v23, T8H, v0, v4, T8B);
3399       pmull(v21, T8H, v0, v6, T8B);
3400 
3401       pmull2(v18, T8H, v0, v5, T16B);
3402       pmull2(v16, T8H, v0, v7, T16B);
3403       pmull2(v19, T8H, v0, v4, T16B);
3404       pmull2(v17, T8H, v0, v6, T16B);
3405 
3406       uzp1(v24, T8H, v20, v22);
3407       uzp2(v25, T8H, v20, v22);
3408       eor(v20, T16B, v24, v25);
3409 
3410       uzp1(v26, T8H, v16, v18);
3411       uzp2(v27, T8H, v16, v18);
3412       eor(v16, T16B, v26, v27);
3413 
3414       ushll2(v22, T4S, v20, T8H, 8);
3415       ushll(v20, T4S, v20, T4H, 8);
3416 
3417       ushll2(v18, T4S, v16, T8H, 8);
3418       ushll(v16, T4S, v16, T4H, 8);
3419 
3420       eor(v22, T16B, v23, v22);
3421       eor(v18, T16B, v19, v18);
3422       eor(v20, T16B, v21, v20);
3423       eor(v16, T16B, v17, v16);
3424 
3425       uzp1(v17, T2D, v16, v20);
3426       uzp2(v21, T2D, v16, v20);
3427       eor(v17, T16B, v17, v21);
3428 
3429       ushll2(v20, T2D, v17, T4S, 16);
3430       ushll(v16, T2D, v17, T2S, 16);
3431 
3432       eor(v20, T16B, v20, v22);
3433       eor(v16, T16B, v16, v18);
3434 
3435       uzp1(v17, T2D, v20, v16);
3436       uzp2(v21, T2D, v20, v16);
3437       eor(v28, T16B, v17, v21);
3438 
3439       pmull(v22, T8H, v1, v5, T8B);
3440       pmull(v20, T8H, v1, v7, T8B);
3441       pmull(v23, T8H, v1, v4, T8B);
3442       pmull(v21, T8H, v1, v6, T8B);
3443 
3444       pmull2(v18, T8H, v1, v5, T16B);
3445       pmull2(v16, T8H, v1, v7, T16B);
3446       pmull2(v19, T8H, v1, v4, T16B);
3447       pmull2(v17, T8H, v1, v6, T16B);
3448 
3449       ld1(v0, v1, T2D, post(buf, 32));
3450 
3451       uzp1(v24, T8H, v20, v22);
3452       uzp2(v25, T8H, v20, v22);
3453       eor(v20, T16B, v24, v25);
3454 
3455       uzp1(v26, T8H, v16, v18);
3456       uzp2(v27, T8H, v16, v18);
3457       eor(v16, T16B, v26, v27);
3458 
3459       ushll2(v22, T4S, v20, T8H, 8);
3460       ushll(v20, T4S, v20, T4H, 8);
3461 
3462       ushll2(v18, T4S, v16, T8H, 8);
3463       ushll(v16, T4S, v16, T4H, 8);
3464 
3465       eor(v22, T16B, v23, v22);
3466       eor(v18, T16B, v19, v18);
3467       eor(v20, T16B, v21, v20);
3468       eor(v16, T16B, v17, v16);
3469 
3470       uzp1(v17, T2D, v16, v20);
3471       uzp2(v21, T2D, v16, v20);
3472       eor(v16, T16B, v17, v21);
3473 
3474       ushll2(v20, T2D, v16, T4S, 16);
3475       ushll(v16, T2D, v16, T2S, 16);
3476 
3477       eor(v20, T16B, v22, v20);
3478       eor(v16, T16B, v16, v18);
3479 
3480       uzp1(v17, T2D, v20, v16);
3481       uzp2(v21, T2D, v20, v16);
3482       eor(v20, T16B, v17, v21);
3483 
3484       shl(v16, T2D, v28, 1);
3485       shl(v17, T2D, v20, 1);
3486 
3487       eor(v0, T16B, v0, v16);
3488       eor(v1, T16B, v1, v17);
3489 
3490       subs(len, len, 32);
3491       br(Assembler::GE, L_fold);
3492 
3493       mov(crc, 0);
3494       mov(tmp, v0, T1D, 0);
3495       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3496       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3497       mov(tmp, v0, T1D, 1);
3498       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3499       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3500       mov(tmp, v1, T1D, 0);
3501       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3502       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3503       mov(tmp, v1, T1D, 1);
3504       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3505       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3506 
3507       add(len, len, 32);
3508   }
3509 
3510   BIND(L_by16);
3511     subs(len, len, 16);
3512     br(Assembler::GE, L_by16_loop);
3513     adds(len, len, 16-4);
3514     br(Assembler::GE, L_by4_loop);
3515     adds(len, len, 4);
3516     br(Assembler::GT, L_by1_loop);
3517     b(L_exit);
3518 
3519   BIND(L_by4_loop);
3520     ldrw(tmp, Address(post(buf, 4)));
3521     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3522     subs(len, len, 4);
3523     br(Assembler::GE, L_by4_loop);
3524     adds(len, len, 4);
3525     br(Assembler::LE, L_exit);
3526   BIND(L_by1_loop);
3527     subs(len, len, 1);
3528     ldrb(tmp, Address(post(buf, 1)));
3529     update_byte_crc32(crc, tmp, table0);
3530     br(Assembler::GT, L_by1_loop);
3531     b(L_exit);
3532 
3533     align(CodeEntryAlignment);
3534   BIND(L_by16_loop);
3535     subs(len, len, 16);
3536     ldp(tmp, tmp3, Address(post(buf, 16)));
3537     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3538     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3539     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3540     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3541     br(Assembler::GE, L_by16_loop);
3542     adds(len, len, 16-4);
3543     br(Assembler::GE, L_by4_loop);
3544     adds(len, len, 4);
3545     br(Assembler::GT, L_by1_loop);
3546   BIND(L_exit);
3547     mvnw(crc, crc);
3548 }
3549 
3550 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3551         Register len, Register tmp0, Register tmp1, Register tmp2,
3552         Register tmp3) {
3553     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3554     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3555 
3556     subs(len, len, 128);
3557     br(Assembler::GE, CRC_by64_pre);
3558   BIND(CRC_less64);
3559     adds(len, len, 128-32);
3560     br(Assembler::GE, CRC_by32_loop);
3561   BIND(CRC_less32);
3562     adds(len, len, 32-4);
3563     br(Assembler::GE, CRC_by4_loop);
3564     adds(len, len, 4);
3565     br(Assembler::GT, CRC_by1_loop);
3566     b(L_exit);
3567 
3568   BIND(CRC_by32_loop);
3569     ldp(tmp0, tmp1, Address(post(buf, 16)));
3570     subs(len, len, 32);
3571     crc32cx(crc, crc, tmp0);
3572     ldr(tmp2, Address(post(buf, 8)));
3573     crc32cx(crc, crc, tmp1);
3574     ldr(tmp3, Address(post(buf, 8)));
3575     crc32cx(crc, crc, tmp2);
3576     crc32cx(crc, crc, tmp3);
3577     br(Assembler::GE, CRC_by32_loop);
3578     cmn(len, 32);
3579     br(Assembler::NE, CRC_less32);
3580     b(L_exit);
3581 
3582   BIND(CRC_by4_loop);
3583     ldrw(tmp0, Address(post(buf, 4)));
3584     subs(len, len, 4);
3585     crc32cw(crc, crc, tmp0);
3586     br(Assembler::GE, CRC_by4_loop);
3587     adds(len, len, 4);
3588     br(Assembler::LE, L_exit);
3589   BIND(CRC_by1_loop);
3590     ldrb(tmp0, Address(post(buf, 1)));
3591     subs(len, len, 1);
3592     crc32cb(crc, crc, tmp0);
3593     br(Assembler::GT, CRC_by1_loop);
3594     b(L_exit);
3595 
3596   BIND(CRC_by64_pre);
3597     sub(buf, buf, 8);
3598     ldp(tmp0, tmp1, Address(buf, 8));
3599     crc32cx(crc, crc, tmp0);
3600     ldr(tmp2, Address(buf, 24));
3601     crc32cx(crc, crc, tmp1);
3602     ldr(tmp3, Address(buf, 32));
3603     crc32cx(crc, crc, tmp2);
3604     ldr(tmp0, Address(buf, 40));
3605     crc32cx(crc, crc, tmp3);
3606     ldr(tmp1, Address(buf, 48));
3607     crc32cx(crc, crc, tmp0);
3608     ldr(tmp2, Address(buf, 56));
3609     crc32cx(crc, crc, tmp1);
3610     ldr(tmp3, Address(pre(buf, 64)));
3611 
3612     b(CRC_by64_loop);
3613 
3614     align(CodeEntryAlignment);
3615   BIND(CRC_by64_loop);
3616     subs(len, len, 64);
3617     crc32cx(crc, crc, tmp2);
3618     ldr(tmp0, Address(buf, 8));
3619     crc32cx(crc, crc, tmp3);
3620     ldr(tmp1, Address(buf, 16));
3621     crc32cx(crc, crc, tmp0);
3622     ldr(tmp2, Address(buf, 24));
3623     crc32cx(crc, crc, tmp1);
3624     ldr(tmp3, Address(buf, 32));
3625     crc32cx(crc, crc, tmp2);
3626     ldr(tmp0, Address(buf, 40));
3627     crc32cx(crc, crc, tmp3);
3628     ldr(tmp1, Address(buf, 48));
3629     crc32cx(crc, crc, tmp0);
3630     ldr(tmp2, Address(buf, 56));
3631     crc32cx(crc, crc, tmp1);
3632     ldr(tmp3, Address(pre(buf, 64)));
3633     br(Assembler::GE, CRC_by64_loop);
3634 
3635     // post-loop
3636     crc32cx(crc, crc, tmp2);
3637     crc32cx(crc, crc, tmp3);
3638 
3639     sub(len, len, 64);
3640     add(buf, buf, 8);
3641     cmn(len, 128);
3642     br(Assembler::NE, CRC_less64);
3643   BIND(L_exit);
3644 }
3645 
3646 /**
3647  * @param crc   register containing existing CRC (32-bit)
3648  * @param buf   register pointing to input byte buffer (byte*)
3649  * @param len   register containing number of bytes
3650  * @param table register that will contain address of CRC table
3651  * @param tmp   scratch register
3652  */
3653 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3654         Register table0, Register table1, Register table2, Register table3,
3655         Register tmp, Register tmp2, Register tmp3) {
3656   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3657 }
3658 
3659 
3660 SkipIfEqual::SkipIfEqual(
3661     MacroAssembler* masm, const bool* flag_addr, bool value) {
3662   _masm = masm;
3663   unsigned long offset;
3664   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3665   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3666   _masm->cbzw(rscratch1, _label);
3667 }
3668 
3669 SkipIfEqual::~SkipIfEqual() {
3670   _masm->bind(_label);
3671 }
3672 
3673 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3674   Address adr;
3675   switch(dst.getMode()) {
3676   case Address::base_plus_offset:
3677     // This is the expected mode, although we allow all the other
3678     // forms below.
3679     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3680     break;
3681   default:
3682     lea(rscratch2, dst);
3683     adr = Address(rscratch2);
3684     break;
3685   }
3686   ldr(rscratch1, adr);
3687   add(rscratch1, rscratch1, src);
3688   str(rscratch1, adr);
3689 }
3690 
3691 void MacroAssembler::cmpptr(Register src1, Address src2) {
3692   unsigned long offset;
3693   adrp(rscratch1, src2, offset);
3694   ldr(rscratch1, Address(rscratch1, offset));
3695   cmp(src1, rscratch1);
3696 }
3697 
3698 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3699   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3700   bs->obj_equals(this, obj1, obj2);
3701 }
3702 
3703 void MacroAssembler::load_method_holder(Register holder, Register method) {
3704   ldr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
3705   ldr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
3706   ldr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
3707 }
3708 
3709 void MacroAssembler::load_klass(Register dst, Register src) {
3710   if (UseCompressedClassPointers) {
3711     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3712     decode_klass_not_null(dst);
3713   } else {
3714     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3715   }
3716 }
3717 
3718 // ((OopHandle)result).resolve();
3719 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3720   // OopHandle::resolve is an indirection.
3721   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3722 }
3723 
3724 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3725   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3726   ldr(dst, Address(rmethod, Method::const_offset()));
3727   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3728   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3729   ldr(dst, Address(dst, mirror_offset));
3730   resolve_oop_handle(dst, tmp);
3731 }
3732 
3733 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3734   if (UseCompressedClassPointers) {
3735     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3736     if (CompressedKlassPointers::base() == NULL) {
3737       cmp(trial_klass, tmp, LSL, CompressedKlassPointers::shift());
3738       return;
3739     } else if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
3740                && CompressedKlassPointers::shift() == 0) {
3741       // Only the bottom 32 bits matter
3742       cmpw(trial_klass, tmp);
3743       return;
3744     }
3745     decode_klass_not_null(tmp);
3746   } else {
3747     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3748   }
3749   cmp(trial_klass, tmp);
3750 }
3751 
3752 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3753   load_klass(dst, src);
3754   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3755 }
3756 
3757 void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store release?  Concurrent GCs assume the
  // klass length is valid if the klass field is not null.
3760   if (UseCompressedClassPointers) {
3761     encode_klass_not_null(src);
3762     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3763   } else {
3764     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3765   }
3766 }
3767 
3768 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3769   if (UseCompressedClassPointers) {
3770     // Store to klass gap in destination
3771     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3772   }
3773 }
3774 
3775 // Algorithm must match CompressedOops::encode.
3776 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3777 #ifdef ASSERT
3778   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3779 #endif
3780   verify_oop(s, "broken oop in encode_heap_oop");
3781   if (CompressedOops::base() == NULL) {
3782     if (CompressedOops::shift() != 0) {
3783       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3784       lsr(d, s, LogMinObjAlignmentInBytes);
3785     } else {
3786       mov(d, s);
3787     }
3788   } else {
3789     subs(d, s, rheapbase);
3790     csel(d, d, zr, Assembler::HS);
3791     lsr(d, d, LogMinObjAlignmentInBytes);
3792 
3793     /*  Old algorithm: is this any worse?
3794     Label nonnull;
3795     cbnz(r, nonnull);
3796     sub(r, r, rheapbase);
3797     bind(nonnull);
3798     lsr(r, r, LogMinObjAlignmentInBytes);
3799     */
3800   }
3801 }
3802 
3803 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3804 #ifdef ASSERT
3805   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3806   if (CheckCompressedOops) {
3807     Label ok;
3808     cbnz(r, ok);
3809     stop("null oop passed to encode_heap_oop_not_null");
3810     bind(ok);
3811   }
3812 #endif
3813   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3814   if (CompressedOops::base() != NULL) {
3815     sub(r, r, rheapbase);
3816   }
3817   if (CompressedOops::shift() != 0) {
3818     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3819     lsr(r, r, LogMinObjAlignmentInBytes);
3820   }
3821 }
3822 
3823 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3824 #ifdef ASSERT
3825   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3826   if (CheckCompressedOops) {
3827     Label ok;
3828     cbnz(src, ok);
3829     stop("null oop passed to encode_heap_oop_not_null2");
3830     bind(ok);
3831   }
3832 #endif
3833   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3834 
3835   Register data = src;
3836   if (CompressedOops::base() != NULL) {
3837     sub(dst, src, rheapbase);
3838     data = dst;
3839   }
3840   if (CompressedOops::shift() != 0) {
3841     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3842     lsr(dst, data, LogMinObjAlignmentInBytes);
3843     data = dst;
3844   }
3845   if (data == src)
3846     mov(dst, src);
3847 }
3848 
3849 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3850 #ifdef ASSERT
3851   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3852 #endif
3853   if (CompressedOops::base() == NULL) {
3854     if (CompressedOops::shift() != 0 || d != s) {
3855       lsl(d, s, CompressedOops::shift());
3856     }
3857   } else {
3858     Label done;
3859     if (d != s)
3860       mov(d, s);
3861     cbz(s, done);
3862     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3863     bind(done);
3864   }
3865   verify_oop(d, "broken oop in decode_heap_oop");
3866 }
3867 
3868 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3869   assert (UseCompressedOops, "should only be used for compressed headers");
3870   assert (Universe::heap() != NULL, "java heap should be initialized");
3871   // Cannot assert, unverified entry point counts instructions (see .ad file)
3872   // vtableStubs also counts instructions in pd_code_size_limit.
3873   // Also do not verify_oop as this is called by verify_oop.
3874   if (CompressedOops::shift() != 0) {
3875     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3876     if (CompressedOops::base() != NULL) {
3877       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3878     } else {
3879       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3880     }
3881   } else {
3882     assert (CompressedOops::base() == NULL, "sanity");
3883   }
3884 }
3885 
3886 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3887   assert (UseCompressedOops, "should only be used for compressed headers");
3888   assert (Universe::heap() != NULL, "java heap should be initialized");
3889   // Cannot assert, unverified entry point counts instructions (see .ad file)
3890   // vtableStubs also counts instructions in pd_code_size_limit.
3891   // Also do not verify_oop as this is called by verify_oop.
3892   if (CompressedOops::shift() != 0) {
3893     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3894     if (CompressedOops::base() != NULL) {
3895       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3896     } else {
3897       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3898     }
3899   } else {
3900     assert (CompressedOops::base() == NULL, "sanity");
3901     if (dst != src) {
3902       mov(dst, src);
3903     }
3904   }
3905 }
3906 
3907 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3908   if (CompressedKlassPointers::base() == NULL) {
3909     if (CompressedKlassPointers::shift() != 0) {
3910       assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
3911       lsr(dst, src, LogKlassAlignmentInBytes);
3912     } else {
3913       if (dst != src) mov(dst, src);
3914     }
3915     return;
3916   }
3917 
3918   if (use_XOR_for_compressed_class_base) {
3919     if (CompressedKlassPointers::shift() != 0) {
3920       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
3921       lsr(dst, dst, LogKlassAlignmentInBytes);
3922     } else {
3923       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
3924     }
3925     return;
3926   }
3927 
3928   if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
3929       && CompressedKlassPointers::shift() == 0) {
3930     movw(dst, src);
3931     return;
3932   }
3933 
3934 #ifdef ASSERT
3935   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3936 #endif
3937 
3938   Register rbase = dst;
3939   if (dst == src) rbase = rheapbase;
3940   mov(rbase, (uint64_t)CompressedKlassPointers::base());
3941   sub(dst, src, rbase);
3942   if (CompressedKlassPointers::shift() != 0) {
3943     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
3944     lsr(dst, dst, LogKlassAlignmentInBytes);
3945   }
3946   if (dst == src) reinit_heapbase();
3947 }
3948 
3949 void MacroAssembler::encode_klass_not_null(Register r) {
3950   encode_klass_not_null(r, r);
3951 }
3952 
3953 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3954   Register rbase = dst;
3955   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3956 
3957   if (CompressedKlassPointers::base() == NULL) {
3958     if (CompressedKlassPointers::shift() != 0) {
3959       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
3960       lsl(dst, src, LogKlassAlignmentInBytes);
3961     } else {
3962       if (dst != src) mov(dst, src);
3963     }
3964     return;
3965   }
3966 
3967   if (use_XOR_for_compressed_class_base) {
3968     if (CompressedKlassPointers::shift() != 0) {
3969       lsl(dst, src, LogKlassAlignmentInBytes);
3970       eor(dst, dst, (uint64_t)CompressedKlassPointers::base());
3971     } else {
3972       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
3973     }
3974     return;
3975   }
3976 
3977   if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
3978       && CompressedKlassPointers::shift() == 0) {
3979     if (dst != src)
3980       movw(dst, src);
3981     movk(dst, (uint64_t)CompressedKlassPointers::base() >> 32, 32);
3982     return;
3983   }
3984 
3985   // Cannot assert, unverified entry point counts instructions (see .ad file)
3986   // vtableStubs also counts instructions in pd_code_size_limit.
3987   // Also do not verify_oop as this is called by verify_oop.
3988   if (dst == src) rbase = rheapbase;
3989   mov(rbase, (uint64_t)CompressedKlassPointers::base());
3990   if (CompressedKlassPointers::shift() != 0) {
3991     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
3992     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
3993   } else {
3994     add(dst, rbase, src);
3995   }
3996   if (dst == src) reinit_heapbase();
3997 }
3998 
3999 void  MacroAssembler::decode_klass_not_null(Register r) {
4000   decode_klass_not_null(r, r);
4001 }
4002 
4003 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4004 #ifdef ASSERT
4005   {
4006     ThreadInVMfromUnknown tiv;
4007     assert (UseCompressedOops, "should only be used for compressed oops");
4008     assert (Universe::heap() != NULL, "java heap should be initialized");
4009     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4010     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4011   }
4012 #endif
4013   int oop_index = oop_recorder()->find_index(obj);
4014   InstructionMark im(this);
4015   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4016   code_section()->relocate(inst_mark(), rspec);
4017   movz(dst, 0xDEAD, 16);
4018   movk(dst, 0xBEEF);
4019 }
4020 
4021 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4022   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4023   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4024   int index = oop_recorder()->find_index(k);
4025   assert(! Universe::heap()->is_in(k), "should not be an oop");
4026 
4027   InstructionMark im(this);
4028   RelocationHolder rspec = metadata_Relocation::spec(index);
4029   code_section()->relocate(inst_mark(), rspec);
4030   narrowKlass nk = CompressedKlassPointers::encode(k);
4031   movz(dst, (nk >> 16), 16);
4032   movk(dst, nk & 0xffff);
4033 }
4034 
4035 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4036                                     Register dst, Address src,
4037                                     Register tmp1, Register thread_tmp) {
4038   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4039   decorators = AccessInternal::decorator_fixup(decorators);
4040   bool as_raw = (decorators & AS_RAW) != 0;
4041   if (as_raw) {
4042     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4043   } else {
4044     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4045   }
4046 }
4047 
4048 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4049                                      Address dst, Register src,
4050                                      Register tmp1, Register thread_tmp) {
4051   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4052   decorators = AccessInternal::decorator_fixup(decorators);
4053   bool as_raw = (decorators & AS_RAW) != 0;
4054   if (as_raw) {
4055     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4056   } else {
4057     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4058   }
4059 }
4060 
4061 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
4062   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
4063   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
4064     decorators |= ACCESS_READ | ACCESS_WRITE;
4065   }
4066   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4067   return bs->resolve(this, decorators, obj);
4068 }
4069 
4070 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4071                                    Register thread_tmp, DecoratorSet decorators) {
4072   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4073 }
4074 
4075 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4076                                             Register thread_tmp, DecoratorSet decorators) {
4077   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4078 }
4079 
4080 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4081                                     Register thread_tmp, DecoratorSet decorators) {
4082   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4083 }
4084 
4085 // Used for storing NULLs.
4086 void MacroAssembler::store_heap_oop_null(Address dst) {
4087   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4088 }
4089 
4090 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4091   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4092   int index = oop_recorder()->allocate_metadata_index(obj);
4093   RelocationHolder rspec = metadata_Relocation::spec(index);
4094   return Address((address)obj, rspec);
4095 }
4096 
// Move an oop into a register.  immediate is true if we want immediate
// instructions, i.e. we are not going to patch this instruction while the
// code is being executed by another thread.  In that case we can use move
// immediates rather than the constant pool.
4101 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4102   int oop_index;
4103   if (obj == NULL) {
4104     oop_index = oop_recorder()->allocate_oop_index(obj);
4105   } else {
4106 #ifdef ASSERT
4107     {
4108       ThreadInVMfromUnknown tiv;
4109       assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4110     }
4111 #endif
4112     oop_index = oop_recorder()->find_index(obj);
4113   }
4114   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4115   if (! immediate) {
4116     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4117     ldr_constant(dst, Address(dummy, rspec));
4118   } else
4119     mov(dst, Address((address)obj, rspec));
4120 }
4121 
4122 // Move a metadata address into a register.
4123 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4124   int oop_index;
4125   if (obj == NULL) {
4126     oop_index = oop_recorder()->allocate_metadata_index(obj);
4127   } else {
4128     oop_index = oop_recorder()->find_index(obj);
4129   }
4130   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4131   mov(dst, Address((address)obj, rspec));
4132 }
4133 
4134 Address MacroAssembler::constant_oop_address(jobject obj) {
4135 #ifdef ASSERT
4136   {
4137     ThreadInVMfromUnknown tiv;
4138     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4139     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "not an oop");
4140   }
4141 #endif
4142   int oop_index = oop_recorder()->find_index(obj);
4143   return Address((address)obj, oop_Relocation::spec(oop_index));
4144 }
4145 
4146 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4147 void MacroAssembler::tlab_allocate(Register obj,
4148                                    Register var_size_in_bytes,
4149                                    int con_size_in_bytes,
4150                                    Register t1,
4151                                    Register t2,
4152                                    Label& slow_case) {
4153   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4154   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4155 }
4156 
4157 // Defines obj, preserves var_size_in_bytes
4158 void MacroAssembler::eden_allocate(Register obj,
4159                                    Register var_size_in_bytes,
4160                                    int con_size_in_bytes,
4161                                    Register t1,
4162                                    Label& slow_case) {
4163   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4164   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4165 }
4166 
4167 // Zero words; len is in bytes
4168 // Destroys all registers except addr
4169 // len must be a nonzero multiple of wordSize
4170 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4171   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4172 
4173 #ifdef ASSERT
4174   { Label L;
4175     tst(len, BytesPerWord - 1);
4176     br(Assembler::EQ, L);
4177     stop("len is not a multiple of BytesPerWord");
4178     bind(L);
4179   }
4180 #endif
4181 
4182 #ifndef PRODUCT
4183   block_comment("zero memory");
4184 #endif
4185 
4186   Label loop;
4187   Label entry;
4188 
4189 //  Algorithm:
4190 //
4191 //    scratch1 = cnt & 7;
4192 //    cnt -= scratch1;
4193 //    p += scratch1;
4194 //    switch (scratch1) {
4195 //      do {
4196 //        cnt -= 8;
4197 //          p[-8] = 0;
4198 //        case 7:
4199 //          p[-7] = 0;
4200 //        case 6:
4201 //          p[-6] = 0;
4202 //          // ...
4203 //        case 1:
4204 //          p[-1] = 0;
4205 //        case 0:
4206 //          p += 8;
4207 //      } while (cnt);
4208 //    }
4209 
4210   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4211 
4212   lsr(len, len, LogBytesPerWord);
4213   andr(rscratch1, len, unroll - 1);  // tmp1 = cnt % unroll
4214   sub(len, len, rscratch1);      // cnt -= unroll
4215   // t1 always points to the end of the region we're about to zero
4216   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
4217   adr(rscratch2, entry);
4218   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4219   br(rscratch2);
4220   bind(loop);
4221   sub(len, len, unroll);
4222   for (int i = -unroll; i < 0; i++)
4223     Assembler::str(zr, Address(t1, i * wordSize));
4224   bind(entry);
4225   add(t1, t1, unroll * wordSize);
4226   cbnz(len, loop);
4227 }
4228 
4229 void MacroAssembler::verify_tlab() {
4230 #ifdef ASSERT
4231   if (UseTLAB && VerifyOops) {
4232     Label next, ok;
4233 
4234     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4235 
4236     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4237     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4238     cmp(rscratch2, rscratch1);
4239     br(Assembler::HS, next);
4240     STOP("assert(top >= start)");
4241     should_not_reach_here();
4242 
4243     bind(next);
4244     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4245     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4246     cmp(rscratch2, rscratch1);
4247     br(Assembler::HS, ok);
4248     STOP("assert(top <= end)");
4249     should_not_reach_here();
4250 
4251     bind(ok);
4252     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4253   }
4254 #endif
4255 }
4256 
4257 // Writes to stack successive pages until offset reached to check for
4258 // stack overflow + shadow pages.  This clobbers tmp.
4259 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4260   assert_different_registers(tmp, size, rscratch1);
4261   mov(tmp, sp);
4262   // Bang stack for total size given plus shadow page size.
4263   // Bang one page at a time because large size can bang beyond yellow and
4264   // red zones.
4265   Label loop;
4266   mov(rscratch1, os::vm_page_size());
4267   bind(loop);
4268   lea(tmp, Address(tmp, -os::vm_page_size()));
4269   subsw(size, size, rscratch1);
4270   str(size, Address(tmp));
4271   br(Assembler::GT, loop);
4272 
4273   // Bang down shadow pages too.
4274   // At this point, (tmp-0) is the last address touched, so don't
4275   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4276   // was post-decremented.)  Skip this address by starting at i=1, and
4277   // touch a few more pages below.  N.B.  It is important to touch all
4278   // the way down to and including i=StackShadowPages.
4279   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
    // This could be any sized move, but since it can serve as a debugging
    // crumb, the bigger the better.
4282     lea(tmp, Address(tmp, -os::vm_page_size()));
4283     str(size, Address(tmp));
4284   }
4285 }
4286 
4287 
4288 // Move the address of the polling page into dest.
4289 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4290   if (SafepointMechanism::uses_thread_local_poll()) {
4291     ldr(dest, Address(rthread, Thread::polling_page_offset()));
4292   } else {
4293     unsigned long off;
4294     adrp(dest, Address(page, rtype), off);
4295     assert(off == 0, "polling page must be page aligned");
4296   }
4297 }
4298 
4299 // Move the address of the polling page into r, then read the polling
4300 // page.
4301 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4302   get_polling_page(r, page, rtype);
4303   return read_polling_page(r, rtype);
4304 }
4305 
4306 // Read the polling page.  The address of the polling page must
4307 // already be in r.
4308 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4309   InstructionMark im(this);
4310   code_section()->relocate(inst_mark(), rtype);
4311   ldrw(zr, Address(r, 0));
4312   return inst_mark();
4313 }
4314 
4315 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4316   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4317   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4318   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4319   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4320   long offset_low = dest_page - low_page;
4321   long offset_high = dest_page - high_page;
4322 
4323   assert(is_valid_AArch64_address(dest.target()), "bad address");
4324   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4325 
4326   InstructionMark im(this);
4327   code_section()->relocate(inst_mark(), dest.rspec());
4328   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4329   // the code cache so that if it is relocated we know it will still reach
4330   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4331     _adrp(reg1, dest.target());
4332   } else {
4333     unsigned long target = (unsigned long)dest.target();
4334     unsigned long adrp_target
4335       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4336 
4337     _adrp(reg1, (address)adrp_target);
4338     movk(reg1, target >> 32, 32);
4339   }
4340   byte_offset = (unsigned long)dest.target() & 0xfff;
4341 }
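
// Conceptually the fix-up above does the following (a C sketch; addresses
// are assumed to fit in 48 bits, as asserted in pd_patch_instruction_size):
//
//    if (all of the code cache is within +/-4GB of dest) {
//      adrp(reg, dest);                   // pc-relative page, always reaches
//    } else {
//      // keep dest's low 32 bits but borrow the pc's bits 32-47 so the
//      // adrp offset stays in range, then overwrite bits 32-47 with movk
//      uint64_t fake = (dest & 0xffffffffUL) | (pc & 0xffff00000000UL);
//      adrp(reg, fake);
//      movk(reg, dest >> 32, /*lsl*/ 32);
//    }
//    byte_offset = dest & 0xfff;          // low 12 bits for the caller's add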
4342 
4343 void MacroAssembler::load_byte_map_base(Register reg) {
4344   CardTable::CardValue* byte_map_base =
4345     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4346 
4347   if (is_valid_AArch64_address((address)byte_map_base)) {
4348     // Strictly speaking the byte_map_base isn't an address at all,
4349     // and it might even be negative.
4350     unsigned long offset;
4351     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4352     // We expect offset to be zero with most collectors.
4353     if (offset != 0) {
4354       add(reg, reg, offset);
4355     }
4356   } else {
4357     mov(reg, (uint64_t)byte_map_base);
4358   }
4359 }
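
// For reference, the card-marking code that consumes this value does, in
// effect (a sketch; card_shift is CardTable::card_shift, 9 with the
// default 512-byte cards):
//
//    byte_map_base[oop_addr >> card_shift] = dirty_card;
//
// so byte_map_base itself need never be a mapped (or even positive)
// address; only byte_map_base + (heap_start >> card_shift) and above are
// ever dereferenced.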
4360 
4361 void MacroAssembler::build_frame(int framesize) {
4362   assert(framesize > 0, "framesize must be > 0");
4363   if (framesize < ((1 << 9) + 2 * wordSize)) {
4364     sub(sp, sp, framesize);
4365     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4366     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4367   } else {
4368     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4369     if (PreserveFramePointer) mov(rfp, sp);
4370     if (framesize < ((1 << 12) + 2 * wordSize))
4371       sub(sp, sp, framesize - 2 * wordSize);
4372     else {
4373       mov(rscratch1, framesize - 2 * wordSize);
4374       sub(sp, sp, rscratch1);
4375     }
4376   }
4377 }
4378 
4379 void MacroAssembler::remove_frame(int framesize) {
4380   assert(framesize > 0, "framesize must be > 0");
4381   if (framesize < ((1 << 9) + 2 * wordSize)) {
4382     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4383     add(sp, sp, framesize);
4384   } else {
4385     if (framesize < ((1 << 12) + 2 * wordSize))
4386       add(sp, sp, framesize - 2 * wordSize);
4387     else {
4388       mov(rscratch1, framesize - 2 * wordSize);
4389       add(sp, sp, rscratch1);
4390     }
4391     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4392   }
4393 }
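
// The two functions above agree on the frame layout; in pseudocode:
//
//    build:   sp -= framesize;
//             sp[framesize - 2*wordSize] = {rfp, lr};  // fp/lr at the top
//    remove:  {rfp, lr} = sp[framesize - 2*wordSize];
//             sp += framesize;
//
// The split paths for large frames exist only because the immediate
// ranges of stp/ldp and sub/add are limited (hence the 1 << 9 and
// 1 << 12 checks above).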
4394 
4395 #ifdef COMPILER2
4396 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4397 
4398 // Search for str1 in str2 and return index or -1
4399 void MacroAssembler::string_indexof(Register str2, Register str1,
4400                                     Register cnt2, Register cnt1,
4401                                     Register tmp1, Register tmp2,
4402                                     Register tmp3, Register tmp4,
4403                                     Register tmp5, Register tmp6,
4404                                     int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on the specific method version
4406   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4407 
4408   Register ch1 = rscratch1;
4409   Register ch2 = rscratch2;
4410   Register cnt1tmp = tmp1;
4411   Register cnt2tmp = tmp2;
4412   Register cnt1_neg = cnt1;
4413   Register cnt2_neg = cnt2;
4414   Register result_tmp = tmp4;
4415 
4416   bool isL = ae == StrIntrinsicNode::LL;
4417 
4418   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4419   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4420   int str1_chr_shift = str1_isL ? 0:1;
4421   int str2_chr_shift = str2_isL ? 0:1;
4422   int str1_chr_size = str1_isL ? 1:2;
4423   int str2_chr_size = str2_isL ? 1:2;
4424   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4425                                       (chr_insn)&MacroAssembler::ldrh;
4426   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4427                                       (chr_insn)&MacroAssembler::ldrh;
4428   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4429   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4430 
4431   // Note, inline_string_indexOf() generates checks:
4432   // if (substr.count > string.count) return -1;
4433   // if (substr.count == 0) return 0;
4434 
4435   // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
4437 
  // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
  // With a small pattern and source we use a linear scan.
4440 
4441   if (icnt1 == -1) {
4442     sub(result_tmp, cnt2, cnt1);
4443     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4444     br(LT, LINEARSEARCH);
4445     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
4446     subs(zr, cnt1, 256);
4447     lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // Source must be at least 4 * pattern length for BM
4449     br(GE, LINEARSTUB);
4450   }
4451 
// The Boyer-Moore algorithm is based on the description here:-
4453 //
4454 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4455 //
// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
4457 // and the 'Good Suffix' rule.
4458 //
4459 // These rules are essentially heuristics for how far we can shift the
4460 // pattern along the search string.
4461 //
4462 // The implementation here uses the 'Bad Character' rule only because of the
4463 // complexity of initialisation for the 'Good Suffix' rule.
4464 //
4465 // This is also known as the Boyer-Moore-Horspool algorithm:-
4466 //
4467 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4468 //
// This particular implementation has a few Java-specific optimizations.
4470 //
4471 // #define ASIZE 256
4472 //
4473 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4474 //       int i, j;
4475 //       unsigned c;
4476 //       unsigned char bc[ASIZE];
4477 //
4478 //       /* Preprocessing */
4479 //       for (i = 0; i < ASIZE; ++i)
4480 //          bc[i] = m;
4481 //       for (i = 0; i < m - 1; ) {
4482 //          c = x[i];
4483 //          ++i;
4484 //          // c < 256 for Latin1 string, so, no need for branch
4485 //          #ifdef PATTERN_STRING_IS_LATIN1
4486 //          bc[c] = m - i;
4487 //          #else
4488 //          if (c < ASIZE) bc[c] = m - i;
4489 //          #endif
4490 //       }
4491 //
4492 //       /* Searching */
4493 //       j = 0;
4494 //       while (j <= n - m) {
//          c = y[j + m - 1];
//          if (x[m-1] == c) {
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//            if (i < 0) return j;
//          }
4499 //          // c < 256 for Latin1 string, so, no need for branch
4500 //          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) always true. Remove branch
4502 //          j += bc[y[j+m-1]];
4503 //          #endif
4504 //          #ifndef PATTERN_STRING_IS_UTF
4505 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
4506 //          if (c < ASIZE)
4507 //            j += bc[y[j+m-1]];
4508 //          else
//            j += 1;
4510 //          #endif
4511 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
4512 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
4513 //          if (c < ASIZE)
4514 //            j += bc[y[j+m-1]];
4515 //          else
//            j += m;
4517 //          #endif
4518 //       }
4519 //    }
4520 
4521   if (icnt1 == -1) {
4522     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4523         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4524     Register cnt1end = tmp2;
4525     Register str2end = cnt2;
4526     Register skipch = tmp2;
4527 
    // str1 length is >= 8, so we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU) and half a register
    // for the UL case. We'll re-read the last character in the inner pre-loop
    // code to have a single outer pre-loop load
4532     const int firstStep = isL ? 7 : 3;
4533 
4534     const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
4536     sub(sp, sp, ASIZE);
4537     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4538     mov(ch1, sp);
4539     BIND(BM_INIT_LOOP);
4540       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4541       subs(tmp5, tmp5, 1);
4542       br(GT, BM_INIT_LOOP);
4543 
4544       sub(cnt1tmp, cnt1, 1);
4545       mov(tmp5, str2);
4546       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4547       sub(ch2, cnt1, 1);
4548       mov(tmp3, str1);
4549     BIND(BCLOOP);
4550       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4551       if (!str1_isL) {
4552         subs(zr, ch1, ASIZE);
4553         br(HS, BCSKIP);
4554       }
4555       strb(ch2, Address(sp, ch1));
4556     BIND(BCSKIP);
4557       subs(ch2, ch2, 1);
4558       br(GT, BCLOOP);
4559 
4560       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4561       if (str1_isL == str2_isL) {
4562         // load last 8 bytes (8LL/4UU symbols)
4563         ldr(tmp6, Address(tmp6, -wordSize));
4564       } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
        // convert Latin1 to UTF. We'll have to wait until the load completes,
        // but it's still faster than per-character loads+checks
4568         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4569         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4570         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4571         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4572         orr(ch2, ch1, ch2, LSL, 16);
4573         orr(tmp6, tmp6, tmp3, LSL, 48);
4574         orr(tmp6, tmp6, ch2, LSL, 16);
4575       }
4576     BIND(BMLOOPSTR2);
4577       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4578       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4579       if (str1_isL == str2_isL) {
        // re-init tmp3. It's free because it executes in parallel with the
        // load above. The alternative is to initialize it before the loop,
        // but that would hurt performance on in-order systems with 2 or
        // more ld/st pipelines
4583         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4584       }
4585       if (!isL) { // UU/UL case
4586         lsl(ch2, cnt1tmp, 1); // offset in bytes
4587       }
4588       cmp(tmp3, skipch);
4589       br(NE, BMSKIP);
4590       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4591       mov(ch1, tmp6);
4592       if (isL) {
4593         b(BMLOOPSTR1_AFTER_LOAD);
4594       } else {
4595         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4596         b(BMLOOPSTR1_CMP);
4597       }
4598     BIND(BMLOOPSTR1);
4599       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4600       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4601     BIND(BMLOOPSTR1_AFTER_LOAD);
4602       subs(cnt1tmp, cnt1tmp, 1);
4603       br(LT, BMLOOPSTR1_LASTCMP);
4604     BIND(BMLOOPSTR1_CMP);
4605       cmp(ch1, ch2);
4606       br(EQ, BMLOOPSTR1);
4607     BIND(BMSKIP);
4608       if (!isL) {
        // if we've met a UTF symbol while searching for a Latin1 pattern,
        // then we can skip cnt1 symbols
4611         if (str1_isL != str2_isL) {
4612           mov(result_tmp, cnt1);
4613         } else {
4614           mov(result_tmp, 1);
4615         }
4616         subs(zr, skipch, ASIZE);
4617         br(HS, BMADV);
4618       }
4619       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4620     BIND(BMADV);
4621       sub(cnt1tmp, cnt1, 1);
4622       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4623       cmp(str2, str2end);
4624       br(LE, BMLOOPSTR2);
4625       add(sp, sp, ASIZE);
4626       b(NOMATCH);
4627     BIND(BMLOOPSTR1_LASTCMP);
4628       cmp(ch1, ch2);
4629       br(NE, BMSKIP);
4630     BIND(BMMATCH);
4631       sub(result, str2, tmp5);
4632       if (!str2_isL) lsr(result, result, 1);
4633       add(sp, sp, ASIZE);
4634       b(DONE);
4635 
4636     BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
4638     br(LT, LINEAR_MEDIUM);
4639     mov(result, zr);
4640     RuntimeAddress stub = NULL;
4641     if (isL) {
4642       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4643       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4644     } else if (str1_isL) {
4645       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4647     } else {
4648       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4649       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4650     }
4651     trampoline_call(stub);
4652     b(DONE);
4653   }
4654 
4655   BIND(LINEARSEARCH);
4656   {
4657     Label DO1, DO2, DO3;
4658 
4659     Register str2tmp = tmp2;
4660     Register first = tmp3;
4661 
4662     if (icnt1 == -1)
4663     {
4664         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4665 
4666         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
4667         br(LT, DOSHORT);
4668       BIND(LINEAR_MEDIUM);
4669         (this->*str1_load_1chr)(first, Address(str1));
4670         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4671         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4672         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4673         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4674 
4675       BIND(FIRST_LOOP);
4676         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4677         cmp(first, ch2);
4678         br(EQ, STR1_LOOP);
4679       BIND(STR2_NEXT);
4680         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4681         br(LE, FIRST_LOOP);
4682         b(NOMATCH);
4683 
4684       BIND(STR1_LOOP);
4685         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4686         add(cnt2tmp, cnt2_neg, str2_chr_size);
4687         br(GE, MATCH);
4688 
4689       BIND(STR1_NEXT);
4690         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4691         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4692         cmp(ch1, ch2);
4693         br(NE, STR2_NEXT);
4694         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4695         add(cnt2tmp, cnt2tmp, str2_chr_size);
4696         br(LT, STR1_NEXT);
4697         b(MATCH);
4698 
4699       BIND(DOSHORT);
4700       if (str1_isL == str2_isL) {
4701         cmp(cnt1, (u1)2);
4702         br(LT, DO1);
4703         br(GT, DO3);
4704       }
4705     }
4706 
4707     if (icnt1 == 4) {
4708       Label CH1_LOOP;
4709 
4710         (this->*load_4chr)(ch1, str1);
4711         sub(result_tmp, cnt2, 4);
4712         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4713         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4714 
4715       BIND(CH1_LOOP);
4716         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4717         cmp(ch1, ch2);
4718         br(EQ, MATCH);
4719         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4720         br(LE, CH1_LOOP);
4721         b(NOMATCH);
4722       }
4723 
4724     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4725       Label CH1_LOOP;
4726 
4727       BIND(DO2);
4728         (this->*load_2chr)(ch1, str1);
4729         if (icnt1 == 2) {
4730           sub(result_tmp, cnt2, 2);
4731         }
4732         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4733         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4734       BIND(CH1_LOOP);
4735         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4736         cmp(ch1, ch2);
4737         br(EQ, MATCH);
4738         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4739         br(LE, CH1_LOOP);
4740         b(NOMATCH);
4741     }
4742 
4743     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4744       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4745 
4746       BIND(DO3);
4747         (this->*load_2chr)(first, str1);
4748         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4749         if (icnt1 == 3) {
4750           sub(result_tmp, cnt2, 3);
4751         }
4752         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4753         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4754       BIND(FIRST_LOOP);
4755         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4756         cmpw(first, ch2);
4757         br(EQ, STR1_LOOP);
4758       BIND(STR2_NEXT);
4759         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4760         br(LE, FIRST_LOOP);
4761         b(NOMATCH);
4762 
4763       BIND(STR1_LOOP);
4764         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4765         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4766         cmp(ch1, ch2);
4767         br(NE, STR2_NEXT);
4768         b(MATCH);
4769     }
4770 
4771     if (icnt1 == -1 || icnt1 == 1) {
4772       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4773 
4774       BIND(DO1);
4775         (this->*str1_load_1chr)(ch1, str1);
4776         cmp(cnt2, (u1)8);
4777         br(LT, DO1_SHORT);
4778 
4779         sub(result_tmp, cnt2, 8/str2_chr_size);
4780         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4781         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4782         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4783 
4784         if (str2_isL) {
4785           orr(ch1, ch1, ch1, LSL, 8);
4786         }
4787         orr(ch1, ch1, ch1, LSL, 16);
4788         orr(ch1, ch1, ch1, LSL, 32);
4789       BIND(CH1_LOOP);
4790         ldr(ch2, Address(str2, cnt2_neg));
4791         eor(ch2, ch1, ch2);
4792         sub(tmp1, ch2, tmp3);
4793         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4794         bics(tmp1, tmp1, tmp2);
4795         br(NE, HAS_ZERO);
4796         adds(cnt2_neg, cnt2_neg, 8);
4797         br(LT, CH1_LOOP);
4798 
4799         cmp(cnt2_neg, (u1)8);
4800         mov(cnt2_neg, 0);
4801         br(LT, CH1_LOOP);
4802         b(NOMATCH);
4803 
4804       BIND(HAS_ZERO);
4805         rev(tmp1, tmp1);
4806         clz(tmp1, tmp1);
4807         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4808         b(MATCH);
4809 
4810       BIND(DO1_SHORT);
4811         mov(result_tmp, cnt2);
4812         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4813         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4814       BIND(DO1_LOOP);
4815         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4816         cmpw(ch1, ch2);
4817         br(EQ, MATCH);
4818         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4819         br(LT, DO1_LOOP);
4820     }
4821   }
4822   BIND(NOMATCH);
4823     mov(result, -1);
4824     b(DONE);
4825   BIND(MATCH);
4826     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4827   BIND(DONE);
4828 }
4829 
4830 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4831 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4832 
4833 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4834                                          Register ch, Register result,
4835                                          Register tmp1, Register tmp2, Register tmp3)
4836 {
4837   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4838   Register cnt1_neg = cnt1;
4839   Register ch1 = rscratch1;
4840   Register result_tmp = rscratch2;
4841 
4842   cmp(cnt1, (u1)4);
4843   br(LT, DO1_SHORT);
4844 
4845   orr(ch, ch, ch, LSL, 16);
4846   orr(ch, ch, ch, LSL, 32);
4847 
4848   sub(cnt1, cnt1, 4);
4849   mov(result_tmp, cnt1);
4850   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4851   sub(cnt1_neg, zr, cnt1, LSL, 1);
4852 
4853   mov(tmp3, 0x0001000100010001);
4854 
4855   BIND(CH1_LOOP);
4856     ldr(ch1, Address(str1, cnt1_neg));
4857     eor(ch1, ch, ch1);
4858     sub(tmp1, ch1, tmp3);
4859     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4860     bics(tmp1, tmp1, tmp2);
4861     br(NE, HAS_ZERO);
4862     adds(cnt1_neg, cnt1_neg, 8);
4863     br(LT, CH1_LOOP);
4864 
4865     cmp(cnt1_neg, (u1)8);
4866     mov(cnt1_neg, 0);
4867     br(LT, CH1_LOOP);
4868     b(NOMATCH);
4869 
4870   BIND(HAS_ZERO);
4871     rev(tmp1, tmp1);
4872     clz(tmp1, tmp1);
4873     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4874     b(MATCH);
4875 
4876   BIND(DO1_SHORT);
4877     mov(result_tmp, cnt1);
4878     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4879     sub(cnt1_neg, zr, cnt1, LSL, 1);
4880   BIND(DO1_LOOP);
4881     ldrh(ch1, Address(str1, cnt1_neg));
4882     cmpw(ch, ch1);
4883     br(EQ, MATCH);
4884     adds(cnt1_neg, cnt1_neg, 2);
4885     br(LT, DO1_LOOP);
4886   BIND(NOMATCH);
4887     mov(result, -1);
4888     b(DONE);
4889   BIND(MATCH);
4890     add(result, result_tmp, cnt1_neg, ASR, 1);
4891   BIND(DONE);
4892 }
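
// The word-at-a-time loop above relies on the classic SWAR "find a
// matching lane" trick. A C sketch, for 4 chars per 64-bit word:
//
//    uint64_t x = w ^ pattern4;             // lane == 0 iff lane matches
//    uint64_t t = (x - 0x0001000100010001)  // subtraction borrows into
//                 & ~x & 0x8000800080008000;//   a zero lane's top bit
//    if (t != 0)                            // some lane matched;
//      first = clz(byte_reverse(t)) >> 3;   //   byte offset of the match
//
// The code computes ~x & 0x8000... as ~(x | 0x7fff7fff7fff7fff) via bics.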
4893 
4894 // Compare strings.
4895 void MacroAssembler::string_compare(Register str1, Register str2,
4896     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4897     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4898   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4899       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4900       SHORT_LOOP_START, TAIL_CHECK;
4901 
4902   const u1 STUB_THRESHOLD = 64 + 8;
4903   bool isLL = ae == StrIntrinsicNode::LL;
4904   bool isLU = ae == StrIntrinsicNode::LU;
4905   bool isUL = ae == StrIntrinsicNode::UL;
4906 
4907   bool str1_isL = isLL || isLU;
4908   bool str2_isL = isLL || isUL;
4909 
4910   int str1_chr_shift = str1_isL ? 0 : 1;
4911   int str2_chr_shift = str2_isL ? 0 : 1;
4912   int str1_chr_size = str1_isL ? 1 : 2;
4913   int str2_chr_size = str2_isL ? 1 : 2;
4914   int minCharsInWord = isLL ? wordSize : wordSize/2;
4915 
4916   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4917   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4918                                       (chr_insn)&MacroAssembler::ldrh;
4919   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4920                                       (chr_insn)&MacroAssembler::ldrh;
4921   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4922                             (uxt_insn)&MacroAssembler::uxthw;
4923 
4924   BLOCK_COMMENT("string_compare {");
4925 
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
4928   if (!str1_isL) asrw(cnt1, cnt1, 1);
4929   if (!str2_isL) asrw(cnt2, cnt2, 1);
4930 
4931   // Compute the minimum of the string lengths and save the difference.
4932   subsw(result, cnt1, cnt2);
4933   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4934 
4935   // A very short string
4936   cmpw(cnt2, minCharsInWord);
4937   br(Assembler::LE, SHORT_STRING);
4938 
4939   // Compare longwords
4940   // load first parts of strings and finish initialization while loading
4941   {
4942     if (str1_isL == str2_isL) { // LL or UU
4943       ldr(tmp1, Address(str1));
4944       cmp(str1, str2);
4945       br(Assembler::EQ, DONE);
4946       ldr(tmp2, Address(str2));
4947       cmp(cnt2, STUB_THRESHOLD);
4948       br(GE, STUB);
4949       subsw(cnt2, cnt2, minCharsInWord);
4950       br(EQ, TAIL_CHECK);
4951       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4952       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4953       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4954     } else if (isLU) {
4955       ldrs(vtmp, Address(str1));
4956       cmp(str1, str2);
4957       br(Assembler::EQ, DONE);
4958       ldr(tmp2, Address(str2));
4959       cmp(cnt2, STUB_THRESHOLD);
4960       br(GE, STUB);
4961       subw(cnt2, cnt2, 4);
4962       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4963       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4964       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4965       zip1(vtmp, T8B, vtmp, vtmpZ);
4966       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4967       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4968       add(cnt1, cnt1, 4);
4969       fmovd(tmp1, vtmp);
4970     } else { // UL case
4971       ldr(tmp1, Address(str1));
4972       cmp(str1, str2);
4973       br(Assembler::EQ, DONE);
4974       ldrs(vtmp, Address(str2));
4975       cmp(cnt2, STUB_THRESHOLD);
4976       br(GE, STUB);
4977       subw(cnt2, cnt2, 4);
4978       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4979       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4980       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4981       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4982       zip1(vtmp, T8B, vtmp, vtmpZ);
4983       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4984       add(cnt1, cnt1, 8);
4985       fmovd(tmp2, vtmp);
4986     }
4987     adds(cnt2, cnt2, isUL ? 4 : 8);
4988     br(GE, TAIL);
4989     eor(rscratch2, tmp1, tmp2);
4990     cbnz(rscratch2, DIFFERENCE);
4991     // main loop
4992     bind(NEXT_WORD);
4993     if (str1_isL == str2_isL) {
4994       ldr(tmp1, Address(str1, cnt2));
4995       ldr(tmp2, Address(str2, cnt2));
4996       adds(cnt2, cnt2, 8);
4997     } else if (isLU) {
4998       ldrs(vtmp, Address(str1, cnt1));
4999       ldr(tmp2, Address(str2, cnt2));
5000       add(cnt1, cnt1, 4);
5001       zip1(vtmp, T8B, vtmp, vtmpZ);
5002       fmovd(tmp1, vtmp);
5003       adds(cnt2, cnt2, 8);
5004     } else { // UL
5005       ldrs(vtmp, Address(str2, cnt2));
5006       ldr(tmp1, Address(str1, cnt1));
5007       zip1(vtmp, T8B, vtmp, vtmpZ);
5008       add(cnt1, cnt1, 8);
5009       fmovd(tmp2, vtmp);
5010       adds(cnt2, cnt2, 4);
5011     }
5012     br(GE, TAIL);
5013 
5014     eor(rscratch2, tmp1, tmp2);
5015     cbz(rscratch2, NEXT_WORD);
5016     b(DIFFERENCE);
5017     bind(TAIL);
5018     eor(rscratch2, tmp1, tmp2);
5019     cbnz(rscratch2, DIFFERENCE);
5020     // Last longword.  In the case where length == 4 we compare the
5021     // same longword twice, but that's still faster than another
5022     // conditional branch.
5023     if (str1_isL == str2_isL) {
5024       ldr(tmp1, Address(str1));
5025       ldr(tmp2, Address(str2));
5026     } else if (isLU) {
5027       ldrs(vtmp, Address(str1));
5028       ldr(tmp2, Address(str2));
5029       zip1(vtmp, T8B, vtmp, vtmpZ);
5030       fmovd(tmp1, vtmp);
5031     } else { // UL
5032       ldrs(vtmp, Address(str2));
5033       ldr(tmp1, Address(str1));
5034       zip1(vtmp, T8B, vtmp, vtmpZ);
5035       fmovd(tmp2, vtmp);
5036     }
5037     bind(TAIL_CHECK);
5038     eor(rscratch2, tmp1, tmp2);
5039     cbz(rscratch2, DONE);
5040 
5041     // Find the first different characters in the longwords and
5042     // compute their difference.
5043     bind(DIFFERENCE);
5044     rev(rscratch2, rscratch2);
5045     clz(rscratch2, rscratch2);
5046     andr(rscratch2, rscratch2, isLL ? -8 : -16);
5047     lsrv(tmp1, tmp1, rscratch2);
5048     (this->*ext_chr)(tmp1, tmp1);
5049     lsrv(tmp2, tmp2, rscratch2);
5050     (this->*ext_chr)(tmp2, tmp2);
5051     subw(result, tmp1, tmp2);
5052     b(DONE);
5053   }
5054 
5055   bind(STUB);
5056     RuntimeAddress stub = NULL;
5057     switch(ae) {
5058       case StrIntrinsicNode::LL:
5059         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
5060         break;
5061       case StrIntrinsicNode::UU:
5062         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
5063         break;
5064       case StrIntrinsicNode::LU:
5065         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
5066         break;
5067       case StrIntrinsicNode::UL:
5068         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
5069         break;
5070       default:
5071         ShouldNotReachHere();
5072      }
5073     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
5074     trampoline_call(stub);
5075     b(DONE);
5076 
5077   bind(SHORT_STRING);
5078   // Is the minimum length zero?
5079   cbz(cnt2, DONE);
  // Arrange the code so that most branches happen while loading, and so
  // that the next characters load while the previous ones are compared
5082   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5083   subs(cnt2, cnt2, 1);
5084   br(EQ, SHORT_LAST_INIT);
5085   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5086   b(SHORT_LOOP_START);
5087   bind(SHORT_LOOP);
5088   subs(cnt2, cnt2, 1);
5089   br(EQ, SHORT_LAST);
5090   bind(SHORT_LOOP_START);
5091   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
5092   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
5093   cmp(tmp1, cnt1);
5094   br(NE, SHORT_LOOP_TAIL);
5095   subs(cnt2, cnt2, 1);
5096   br(EQ, SHORT_LAST2);
5097   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5098   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5099   cmp(tmp2, rscratch1);
5100   br(EQ, SHORT_LOOP);
5101   sub(result, tmp2, rscratch1);
5102   b(DONE);
5103   bind(SHORT_LOOP_TAIL);
5104   sub(result, tmp1, cnt1);
5105   b(DONE);
5106   bind(SHORT_LAST2);
5107   cmp(tmp2, rscratch1);
5108   br(EQ, DONE);
5109   sub(result, tmp2, rscratch1);
5110 
5111   b(DONE);
5112   bind(SHORT_LAST_INIT);
5113   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5114   bind(SHORT_LAST);
5115   cmp(tmp1, cnt1);
5116   br(EQ, DONE);
5117   sub(result, tmp1, cnt1);
5118 
5119   bind(DONE);
5120 
5121   BLOCK_COMMENT("} string_compare");
5122 }
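
// The DIFFERENCE block above extracts the first mismatching character
// from the XOR of two longwords; a C sketch of the LL flavour:
//
//    uint64_t x = w1 ^ w2;                 // nonzero when we get here
//    int bit = clz(byte_reverse(x)) & -8;  // bit offset of the first
//                                          //   differing byte
//    result = (int)((w1 >> bit) & 0xff) - (int)((w2 >> bit) & 0xff);
//
// The U flavours mask with -16 and extract 16-bit lanes instead.
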
5123 #endif // COMPILER2
5124 
// This method checks whether the provided byte array contains a byte
// with the highest bit set.
5126 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
    // The simple and most common case, a small aligned array that is not
    // at the end of a memory page, is handled here. All other cases are
    // handled in the stub.
5129     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
    const uint64_t UPPER_BIT_MASK = 0x8080808080808080;
5131     assert_different_registers(ary1, len, result);
5132 
5133     cmpw(len, 0);
5134     br(LE, SET_RESULT);
5135     cmpw(len, 4 * wordSize);
    br(GE, STUB_LONG); // if size > 32 then go to the stub
5137 
5138     int shift = 64 - exact_log2(os::vm_page_size());
5139     lsl(rscratch1, ary1, shift);
5140     mov(rscratch2, (size_t)(4 * wordSize) << shift);
5141     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
    br(CS, STUB); // if at the end of a page then go to the stub
5143     subs(len, len, wordSize);
5144     br(LT, END);
5145 
5146   BIND(LOOP);
5147     ldr(rscratch1, Address(post(ary1, wordSize)));
5148     tst(rscratch1, UPPER_BIT_MASK);
5149     br(NE, SET_RESULT);
5150     subs(len, len, wordSize);
5151     br(GE, LOOP);
5152     cmpw(len, -wordSize);
5153     br(EQ, SET_RESULT);
5154 
5155   BIND(END);
5156     ldr(result, Address(ary1));
5157     sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5158     lslv(result, result, len);
5159     tst(result, UPPER_BIT_MASK);
5160     b(SET_RESULT);
5161 
5162   BIND(STUB);
5163     RuntimeAddress has_neg =  RuntimeAddress(StubRoutines::aarch64::has_negatives());
5164     assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5165     trampoline_call(has_neg);
5166     b(DONE);
5167 
5168   BIND(STUB_LONG);
5169     RuntimeAddress has_neg_long =  RuntimeAddress(
5170             StubRoutines::aarch64::has_negatives_long());
5171     assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
5172     trampoline_call(has_neg_long);
5173     b(DONE);
5174 
5175   BIND(SET_RESULT);
5176     cset(result, NE); // set true or false
5177 
5178   BIND(DONE);
5179 }
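
// The page-crossing test above shifts the page offset into the top bits
// so that a carry out means "a 32-byte read would cross into the next
// page". A C sketch:
//
//    int shift = 64 - exact_log2(page_size);
//    uint64_t a = (uint64_t)ary1 << shift;            // page offset on top
//    bool crosses = a + ((uint64_t)32 << shift) < a;  // unsigned overflow?
//    if (crosses) goto stub;       // can't safely over-read such an array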
5180 
5181 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5182                                    Register tmp4, Register tmp5, Register result,
5183                                    Register cnt1, int elem_size) {
5184   Label DONE, SAME;
5185   Register tmp1 = rscratch1;
5186   Register tmp2 = rscratch2;
5187   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5188   int elem_per_word = wordSize/elem_size;
5189   int log_elem_size = exact_log2(elem_size);
5190   int length_offset = arrayOopDesc::length_offset_in_bytes();
5191   int base_offset
5192     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5193   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5194 
5195   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5196   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5197 
5198 #ifndef PRODUCT
5199   {
5200     const char kind = (elem_size == 2) ? 'U' : 'L';
5201     char comment[64];
5202     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5203     BLOCK_COMMENT(comment);
5204   }
5205 #endif
5206 
5207   // if (a1 == a2)
5208   //     return true;
5209   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5210   br(EQ, SAME);
5211 
5212   if (UseSimpleArrayEquals) {
5213     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5214     // if (a1 == null || a2 == null)
5215     //     return false;
    // (a1 & a2) == 0 means that either some pointer is null or the two
    // pointers share no set bits (very rare, probably impossible values),
    // so we can save one branch in most cases
5219     tst(a1, a2);
5220     mov(result, false);
5221     br(EQ, A_MIGHT_BE_NULL);
5222     // if (a1.length != a2.length)
5223     //      return false;
5224     bind(A_IS_NOT_NULL);
5225     ldrw(cnt1, Address(a1, length_offset));
5226     ldrw(cnt2, Address(a2, length_offset));
5227     eorw(tmp5, cnt1, cnt2);
5228     cbnzw(tmp5, DONE);
5229     lea(a1, Address(a1, base_offset));
5230     lea(a2, Address(a2, base_offset));
5231     // Check for short strings, i.e. smaller than wordSize.
5232     subs(cnt1, cnt1, elem_per_word);
5233     br(Assembler::LT, SHORT);
5234     // Main 8 byte comparison loop.
5235     bind(NEXT_WORD); {
5236       ldr(tmp1, Address(post(a1, wordSize)));
5237       ldr(tmp2, Address(post(a2, wordSize)));
5238       subs(cnt1, cnt1, elem_per_word);
5239       eor(tmp5, tmp1, tmp2);
5240       cbnz(tmp5, DONE);
5241     } br(GT, NEXT_WORD);
5242     // Last longword.  In the case where length == 4 we compare the
5243     // same longword twice, but that's still faster than another
5244     // conditional branch.
5245     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5246     // length == 4.
5247     if (log_elem_size > 0)
5248       lsl(cnt1, cnt1, log_elem_size);
5249     ldr(tmp3, Address(a1, cnt1));
5250     ldr(tmp4, Address(a2, cnt1));
5251     eor(tmp5, tmp3, tmp4);
5252     cbnz(tmp5, DONE);
5253     b(SAME);
5254     bind(A_MIGHT_BE_NULL);
5255     // in case both a1 and a2 are not-null, proceed with loads
5256     cbz(a1, DONE);
5257     cbz(a2, DONE);
5258     b(A_IS_NOT_NULL);
5259     bind(SHORT);
5260 
5261     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5262     {
5263       ldrw(tmp1, Address(post(a1, 4)));
5264       ldrw(tmp2, Address(post(a2, 4)));
5265       eorw(tmp5, tmp1, tmp2);
5266       cbnzw(tmp5, DONE);
5267     }
5268     bind(TAIL03);
5269     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5270     {
5271       ldrh(tmp3, Address(post(a1, 2)));
5272       ldrh(tmp4, Address(post(a2, 2)));
5273       eorw(tmp5, tmp3, tmp4);
5274       cbnzw(tmp5, DONE);
5275     }
5276     bind(TAIL01);
5277     if (elem_size == 1) { // Only needed when comparing byte arrays.
5278       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5279       {
5280         ldrb(tmp1, a1);
5281         ldrb(tmp2, a2);
5282         eorw(tmp5, tmp1, tmp2);
5283         cbnzw(tmp5, DONE);
5284       }
5285     }
5286   } else {
5287     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
5288         CSET_EQ, LAST_CHECK;
5289     mov(result, false);
5290     cbz(a1, DONE);
5291     ldrw(cnt1, Address(a1, length_offset));
5292     cbz(a2, DONE);
5293     ldrw(cnt2, Address(a2, length_offset));
    // on most CPUs a2 is still "locked" (surprisingly) by the ldrw above, so
    // it's faster to perform another branch before comparing a1 and a2
5296     cmp(cnt1, (u1)elem_per_word);
5297     br(LE, SHORT); // short or same
5298     ldr(tmp3, Address(pre(a1, base_offset)));
5299     subs(zr, cnt1, stubBytesThreshold);
5300     br(GE, STUB);
5301     ldr(tmp4, Address(pre(a2, base_offset)));
5302     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5303     cmp(cnt2, cnt1);
5304     br(NE, DONE);
5305 
5306     // Main 16 byte comparison loop with 2 exits
5307     bind(NEXT_DWORD); {
5308       ldr(tmp1, Address(pre(a1, wordSize)));
5309       ldr(tmp2, Address(pre(a2, wordSize)));
5310       subs(cnt1, cnt1, 2 * elem_per_word);
5311       br(LE, TAIL);
5312       eor(tmp4, tmp3, tmp4);
5313       cbnz(tmp4, DONE);
5314       ldr(tmp3, Address(pre(a1, wordSize)));
5315       ldr(tmp4, Address(pre(a2, wordSize)));
5316       cmp(cnt1, (u1)elem_per_word);
5317       br(LE, TAIL2);
5318       cmp(tmp1, tmp2);
5319     } br(EQ, NEXT_DWORD);
5320     b(DONE);
5321 
5322     bind(TAIL);
5323     eor(tmp4, tmp3, tmp4);
5324     eor(tmp2, tmp1, tmp2);
5325     lslv(tmp2, tmp2, tmp5);
5326     orr(tmp5, tmp4, tmp2);
5327     cmp(tmp5, zr);
5328     b(CSET_EQ);
5329 
5330     bind(TAIL2);
5331     eor(tmp2, tmp1, tmp2);
5332     cbnz(tmp2, DONE);
5333     b(LAST_CHECK);
5334 
5335     bind(STUB);
5336     ldr(tmp4, Address(pre(a2, base_offset)));
5337     cmp(cnt2, cnt1);
5338     br(NE, DONE);
5339     if (elem_size == 2) { // convert to byte counter
5340       lsl(cnt1, cnt1, 1);
5341     }
5342     eor(tmp5, tmp3, tmp4);
5343     cbnz(tmp5, DONE);
5344     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5345     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5346     trampoline_call(stub);
5347     b(DONE);
5348 
5349     bind(EARLY_OUT);
    // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
    // so if a2 == null we return false (0); otherwise we return true, and
    // can therefore simply return a2
5352     mov(result, a2);
5353     b(DONE);
5354     bind(SHORT);
5355     cmp(cnt2, cnt1);
5356     br(NE, DONE);
5357     cbz(cnt1, SAME);
5358     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5359     ldr(tmp3, Address(a1, base_offset));
5360     ldr(tmp4, Address(a2, base_offset));
5361     bind(LAST_CHECK);
5362     eor(tmp4, tmp3, tmp4);
5363     lslv(tmp5, tmp4, tmp5);
5364     cmp(tmp5, zr);
5365     bind(CSET_EQ);
5366     cset(result, EQ);
5367     b(DONE);
5368   }
5369 
5370   bind(SAME);
5371   mov(result, true);
5372   // That's it.
5373   bind(DONE);
5374 
5375   BLOCK_COMMENT("} array_equals");
5376 }
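
// The lslv-based tails above avoid a branch by shifting out the bytes
// that lie beyond the array length before comparing. A C sketch, for the
// case where at most one word of data remains:
//
//    int shift = (-(int)cnt1 << (3 + log_elem_size)) & 63; // 64 - valid bits
//    uint64_t diff = (w1 ^ w2) << shift;   // little-endian: valid data is
//    result = (diff == 0);                 //   in the low bytes, so the
//                                          //   garbage is shifted out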
5377 
5378 // Compare Strings
5379 
5380 // For Strings we're passed the address of the first characters in a1
5381 // and a2 and the length in cnt1.
5382 // elem_size is the element size in bytes: either 1 or 2.
5383 // There are two implementations.  For arrays >= 8 bytes, all
5384 // comparisons (including the final one, which may overlap) are
5385 // performed 8 bytes at a time.  For strings < 8 bytes, we compare a
5386 // halfword, then a short, and then a byte.
5387 
5388 void MacroAssembler::string_equals(Register a1, Register a2,
5389                                    Register result, Register cnt1, int elem_size)
5390 {
5391   Label SAME, DONE, SHORT, NEXT_WORD;
5392   Register tmp1 = rscratch1;
5393   Register tmp2 = rscratch2;
5394   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5395 
  assert(elem_size == 1 || elem_size == 2, "must be 1 or 2 bytes");
5397   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5398 
5399 #ifndef PRODUCT
5400   {
5401     const char kind = (elem_size == 2) ? 'U' : 'L';
5402     char comment[64];
5403     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5404     BLOCK_COMMENT(comment);
5405   }
5406 #endif
5407 
5408   mov(result, false);
5409 
5410   // Check for short strings, i.e. smaller than wordSize.
5411   subs(cnt1, cnt1, wordSize);
5412   br(Assembler::LT, SHORT);
5413   // Main 8 byte comparison loop.
5414   bind(NEXT_WORD); {
5415     ldr(tmp1, Address(post(a1, wordSize)));
5416     ldr(tmp2, Address(post(a2, wordSize)));
5417     subs(cnt1, cnt1, wordSize);
5418     eor(tmp1, tmp1, tmp2);
5419     cbnz(tmp1, DONE);
5420   } br(GT, NEXT_WORD);
5421   // Last longword.  In the case where length == 4 we compare the
5422   // same longword twice, but that's still faster than another
5423   // conditional branch.
5424   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5425   // length == 4.
5426   ldr(tmp1, Address(a1, cnt1));
5427   ldr(tmp2, Address(a2, cnt1));
5428   eor(tmp2, tmp1, tmp2);
5429   cbnz(tmp2, DONE);
5430   b(SAME);
5431 
5432   bind(SHORT);
5433   Label TAIL03, TAIL01;
5434 
5435   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5436   {
5437     ldrw(tmp1, Address(post(a1, 4)));
5438     ldrw(tmp2, Address(post(a2, 4)));
5439     eorw(tmp1, tmp1, tmp2);
5440     cbnzw(tmp1, DONE);
5441   }
5442   bind(TAIL03);
5443   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5444   {
5445     ldrh(tmp1, Address(post(a1, 2)));
5446     ldrh(tmp2, Address(post(a2, 2)));
5447     eorw(tmp1, tmp1, tmp2);
5448     cbnzw(tmp1, DONE);
5449   }
5450   bind(TAIL01);
5451   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5452     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5453     {
5454       ldrb(tmp1, a1);
5455       ldrb(tmp2, a2);
5456       eorw(tmp1, tmp1, tmp2);
5457       cbnzw(tmp1, DONE);
5458     }
5459   }
5460   // Arrays are equal.
5461   bind(SAME);
5462   mov(result, true);
5463 
5464   // That's it.
5465   bind(DONE);
5466   BLOCK_COMMENT("} string_equals");
5467 }
5468 
5469 
5470 // The size of the blocks erased by the zero_blocks stub.  We must
5471 // handle anything smaller than this ourselves in zero_words().
5472 const int MacroAssembler::zero_words_block_size = 8;
5473 
5474 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5475 // possible, handling small word counts locally and delegating
5476 // anything larger to the zero_blocks stub.  It is expanded many times
5477 // in compiled code, so it is important to keep it short.
5478 
5479 // ptr:   Address of a buffer to be zeroed.
5480 // cnt:   Count in HeapWords.
5481 //
5482 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5483 void MacroAssembler::zero_words(Register ptr, Register cnt)
5484 {
5485   assert(is_power_of_2(zero_words_block_size), "adjust this");
5486   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5487 
5488   BLOCK_COMMENT("zero_words {");
5489   cmp(cnt, (u1)zero_words_block_size);
5490   Label around;
5491   br(LO, around);
5492   {
5493     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5494     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5495     if (StubRoutines::aarch64::complete()) {
5496       trampoline_call(zero_blocks);
5497     } else {
5498       bl(zero_blocks);
5499     }
5500   }
5501   bind(around);
5502   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5503     Label l;
5504     tbz(cnt, exact_log2(i), l);
5505     for (int j = 0; j < i; j += 2) {
5506       stp(zr, zr, post(ptr, 16));
5507     }
5508     bind(l);
5509   }
5510   {
5511     Label l;
5512     tbz(cnt, 0, l);
5513     str(zr, Address(ptr));
5514     bind(l);
5515   }
5516   BLOCK_COMMENT("} zero_words");
5517 }
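
// The tbz chain above clears the remaining 0..7 words by testing the low
// bits of cnt, largest group first; in C terms (for block size 8):
//
//    if (cnt & 4) { p[0] = p[1] = p[2] = p[3] = 0; p += 4; }
//    if (cnt & 2) { p[0] = p[1] = 0;               p += 2; }
//    if (cnt & 1) { p[0] = 0; }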
5518 
5519 // base:         Address of a buffer to be zeroed, 8 bytes aligned.
5520 // cnt:          Immediate count in HeapWords.
5521 #define SmallArraySize (18 * BytesPerLong)
5522 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
5523 {
5524   BLOCK_COMMENT("zero_words {");
5525   int i = cnt & 1;  // store any odd word to start
5526   if (i) str(zr, Address(base));
5527 
5528   if (cnt <= SmallArraySize / BytesPerLong) {
5529     for (; i < (int)cnt; i += 2)
5530       stp(zr, zr, Address(base, i * wordSize));
5531   } else {
5532     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
5533     int remainder = cnt % (2 * unroll);
5534     for (; i < remainder; i += 2)
5535       stp(zr, zr, Address(base, i * wordSize));
5536 
5537     Label loop;
5538     Register cnt_reg = rscratch1;
5539     Register loop_base = rscratch2;
5540     cnt = cnt - remainder;
5541     mov(cnt_reg, cnt);
5542     // adjust base and prebias by -2 * wordSize so we can pre-increment
5543     add(loop_base, base, (remainder - 2) * wordSize);
5544     bind(loop);
5545     sub(cnt_reg, cnt_reg, 2 * unroll);
5546     for (i = 1; i < unroll; i++)
5547       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
5548     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
5549     cbnz(cnt_reg, loop);
5550   }
5551   BLOCK_COMMENT("} zero_words");
5552 }
5553 
5554 // Zero blocks of memory by using DC ZVA.
5555 //
// Aligns the base address first sufficiently for DC ZVA, then uses
5557 // DC ZVA repeatedly for every full block.  cnt is the size to be
5558 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5559 // in cnt.
5560 //
5561 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5562 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
5563 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5564   Register tmp = rscratch1;
5565   Register tmp2 = rscratch2;
5566   int zva_length = VM_Version::zva_length();
5567   Label initial_table_end, loop_zva;
5568   Label fini;
5569 
  // Base must be 16-byte aligned. If not, just return and let the caller handle it
5571   tst(base, 0x0f);
5572   br(Assembler::NE, fini);
5573   // Align base with ZVA length.
5574   neg(tmp, base);
5575   andr(tmp, tmp, zva_length - 1);
5576 
5577   // tmp: the number of bytes to be filled to align the base with ZVA length.
5578   add(base, base, tmp);
5579   sub(cnt, cnt, tmp, Assembler::ASR, 3);
5580   adr(tmp2, initial_table_end);
5581   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5582   br(tmp2);
5583 
5584   for (int i = -zva_length + 16; i < 0; i += 16)
5585     stp(zr, zr, Address(base, i));
5586   bind(initial_table_end);
5587 
5588   sub(cnt, cnt, zva_length >> 3);
5589   bind(loop_zva);
5590   dc(Assembler::ZVA, base);
5591   subs(cnt, cnt, zva_length >> 3);
5592   add(base, base, zva_length);
5593   br(Assembler::GE, loop_zva);
5594   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5595   bind(fini);
5596 }
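
// In C terms the set-up above is (a sketch; zva_length is a power of two
// and base is already 16-byte aligned when we reach the alignment code):
//
//    size_t fill = -(uintptr_t)base & (zva_length - 1);  // bytes to align
//    // (the computed branch into the stp table zeroes those bytes)
//    base += fill;  cnt -= fill >> 3;       // words consumed by aligning
//    while (cnt >= (zva_length >> 3)) {     // whole blocks with DC ZVA
//      dc_zva(base);  base += zva_length;  cnt -= zva_length >> 3;
//    }
//    return cnt;                            // words left for the caller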
5597 
5598 // base:   Address of a buffer to be filled, 8 bytes aligned.
5599 // cnt:    Count in 8-byte unit.
5600 // value:  Value to be filled with.
5601 // base will point to the end of the buffer after filling.
5602 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5603 {
5604 //  Algorithm:
5605 //
5606 //    scratch1 = cnt & 7;
5607 //    cnt -= scratch1;
5608 //    p += scratch1;
5609 //    switch (scratch1) {
5610 //      do {
5611 //        cnt -= 8;
5612 //          p[-8] = v;
5613 //        case 7:
5614 //          p[-7] = v;
5615 //        case 6:
5616 //          p[-6] = v;
5617 //          // ...
5618 //        case 1:
5619 //          p[-1] = v;
5620 //        case 0:
5621 //          p += 8;
5622 //      } while (cnt);
5623 //    }
5624 
5625   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5626 
5627   Label fini, skip, entry, loop;
5628   const int unroll = 8; // Number of stp instructions we'll unroll
5629 
5630   cbz(cnt, fini);
5631   tbz(base, 3, skip);
5632   str(value, Address(post(base, 8)));
5633   sub(cnt, cnt, 1);
5634   bind(skip);
5635 
5636   andr(rscratch1, cnt, (unroll-1) * 2);
5637   sub(cnt, cnt, rscratch1);
5638   add(base, base, rscratch1, Assembler::LSL, 3);
5639   adr(rscratch2, entry);
5640   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5641   br(rscratch2);
5642 
5643   bind(loop);
5644   add(base, base, unroll * 16);
5645   for (int i = -unroll; i < 0; i++)
5646     stp(value, value, Address(base, i * 16));
5647   bind(entry);
5648   subs(cnt, cnt, unroll * 2);
5649   br(Assembler::GE, loop);
5650 
5651   tbz(cnt, 0, fini);
5652   str(value, Address(post(base, 8)));
5653   bind(fini);
5654 }
5655 
5656 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5657 // java/lang/StringUTF16.compress.
5658 void MacroAssembler::encode_iso_array(Register src, Register dst,
5659                       Register len, Register result,
5660                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5661                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5662 {
5663     Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
5664         NEXT_32_START, NEXT_32_PRFM_START;
5665     Register tmp1 = rscratch1, tmp2 = rscratch2;
5666 
5667       mov(result, len); // Save initial len
5668 
5669       cmp(len, (u1)8); // handle shortest strings first
5670       br(LT, LOOP_1);
5671       cmp(len, (u1)32);
5672       br(LT, NEXT_8);
5673       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
5674       // to convert chars to bytes
5675       if (SoftwarePrefetchHintDistance >= 0) {
5676         ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5677         subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5678         br(LE, NEXT_32_START);
5679         b(NEXT_32_PRFM_START);
5680         BIND(NEXT_32_PRFM);
5681           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5682         BIND(NEXT_32_PRFM_START);
5683           prfm(Address(src, SoftwarePrefetchHintDistance));
5684           orr(v4, T16B, Vtmp1, Vtmp2);
5685           orr(v5, T16B, Vtmp3, Vtmp4);
5686           uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
5687           uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
5688           uzp2(v5, T16B, v4, v5); // high bytes
5689           umov(tmp2, v5, D, 1);
5690           fmovd(tmp1, v5);
5691           orr(tmp1, tmp1, tmp2);
5692           cbnz(tmp1, LOOP_8);
5693           stpq(Vtmp1, Vtmp3, dst);
5694           sub(len, len, 32);
5695           add(dst, dst, 32);
5696           add(src, src, 64);
5697           subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5698           br(GE, NEXT_32_PRFM);
5699           cmp(len, (u1)32);
5700           br(LT, LOOP_8);
5701         BIND(NEXT_32);
5702           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5703         BIND(NEXT_32_START);
5704       } else {
5705         BIND(NEXT_32);
5706           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5707       }
5708       prfm(Address(src, SoftwarePrefetchHintDistance));
5709       uzp1(v4, T16B, Vtmp1, Vtmp2);
5710       uzp1(v5, T16B, Vtmp3, Vtmp4);
5711       orr(Vtmp1, T16B, Vtmp1, Vtmp2);
5712       orr(Vtmp3, T16B, Vtmp3, Vtmp4);
5713       uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
5714       umov(tmp2, Vtmp1, D, 1);
5715       fmovd(tmp1, Vtmp1);
5716       orr(tmp1, tmp1, tmp2);
5717       cbnz(tmp1, LOOP_8);
5718       stpq(v4, v5, dst);
5719       sub(len, len, 32);
5720       add(dst, dst, 32);
5721       add(src, src, 64);
5722       cmp(len, (u1)32);
5723       br(GE, NEXT_32);
5724       cbz(len, DONE);
5725 
5726     BIND(LOOP_8);
5727       cmp(len, (u1)8);
5728       br(LT, LOOP_1);
5729     BIND(NEXT_8);
5730       ld1(Vtmp1, T8H, src);
5731       uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
5732       uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
5733       fmovd(tmp1, Vtmp3);
5734       cbnz(tmp1, NEXT_1);
5735       strd(Vtmp2, dst);
5736 
5737       sub(len, len, 8);
5738       add(dst, dst, 8);
5739       add(src, src, 16);
5740       cmp(len, (u1)8);
5741       br(GE, NEXT_8);
5742 
5743     BIND(LOOP_1);
5744 
5745     cbz(len, DONE);
5746     BIND(NEXT_1);
5747       ldrh(tmp1, Address(post(src, 2)));
5748       tst(tmp1, 0xff00);
5749       br(NE, SET_RESULT);
5750       strb(tmp1, Address(post(dst, 1)));
5751       subs(len, len, 1);
5752       br(GT, NEXT_1);
5753 
5754     BIND(SET_RESULT);
      sub(result, result, len); // Return the index where we stopped;
                                // len == 0 means we processed all the
                                // characters
5758     BIND(DONE);
5759 }
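
// In effect the intrinsic above implements the following (a scalar C
// sketch of the LOOP_1 path; the SIMD paths handle 8 or 32 chars per
// iteration with the same semantics):
//
//    int encode_iso(const jchar *src, jbyte *dst, int len) {
//      int i = 0;
//      for (; i < len && src[i] < 0x100; i++)
//        dst[i] = (jbyte)src[i];
//      return i;     // == len iff every char was Latin-1
//    }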
5760 
5761 
5762 // Inflate byte[] array to char[].
5763 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5764                                         FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
5765                                         Register tmp4) {
5766   Label big, done, after_init, to_stub;
5767 
5768   assert_different_registers(src, dst, len, tmp4, rscratch1);
5769 
5770   fmovd(vtmp1, zr);
5771   lsrw(tmp4, len, 3);
5772   bind(after_init);
5773   cbnzw(tmp4, big);
5774   // Short string: less than 8 bytes.
5775   {
5776     Label loop, tiny;
5777 
5778     cmpw(len, 4);
5779     br(LT, tiny);
5780     // Use SIMD to do 4 bytes.
5781     ldrs(vtmp2, post(src, 4));
5782     zip1(vtmp3, T8B, vtmp2, vtmp1);
5783     subw(len, len, 4);
5784     strd(vtmp3, post(dst, 8));
5785 
5786     cbzw(len, done);
5787 
    // Do the remaining bytes one at a time.
5789     bind(loop);
5790     ldrb(tmp4, post(src, 1));
5791     strh(tmp4, post(dst, 2));
5792     subw(len, len, 1);
5793 
5794     bind(tiny);
5795     cbnz(len, loop);
5796 
5797     b(done);
5798   }
5799 
5800   if (SoftwarePrefetchHintDistance >= 0) {
5801     bind(to_stub);
5802       RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5803       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5804       trampoline_call(stub);
5805       b(after_init);
5806   }
5807 
5808   // Unpack the bytes 8 at a time.
5809   bind(big);
5810   {
5811     Label loop, around, loop_last, loop_start;
5812 
5813     if (SoftwarePrefetchHintDistance >= 0) {
5814       const int large_loop_threshold = (64 + 16)/8;
5815       ldrd(vtmp2, post(src, 8));
5816       andw(len, len, 7);
5817       cmp(tmp4, (u1)large_loop_threshold);
5818       br(GE, to_stub);
5819       b(loop_start);
5820 
5821       bind(loop);
5822       ldrd(vtmp2, post(src, 8));
5823       bind(loop_start);
5824       subs(tmp4, tmp4, 1);
5825       br(EQ, loop_last);
5826       zip1(vtmp2, T16B, vtmp2, vtmp1);
5827       ldrd(vtmp3, post(src, 8));
5828       st1(vtmp2, T8H, post(dst, 16));
5829       subs(tmp4, tmp4, 1);
5830       zip1(vtmp3, T16B, vtmp3, vtmp1);
5831       st1(vtmp3, T8H, post(dst, 16));
5832       br(NE, loop);
5833       b(around);
5834       bind(loop_last);
5835       zip1(vtmp2, T16B, vtmp2, vtmp1);
5836       st1(vtmp2, T8H, post(dst, 16));
5837       bind(around);
5838       cbz(len, done);
5839     } else {
5840       andw(len, len, 7);
5841       bind(loop);
5842       ldrd(vtmp2, post(src, 8));
5843       sub(tmp4, tmp4, 1);
5844       zip1(vtmp3, T16B, vtmp2, vtmp1);
5845       st1(vtmp3, T8H, post(dst, 16));
5846       cbnz(tmp4, loop);
5847     }
5848   }
5849 
5850   // Do the tail of up to 8 bytes.
5851   add(src, src, len);
5852   ldrd(vtmp3, Address(src, -8));
5853   add(dst, dst, len, ext::uxtw, 1);
5854   zip1(vtmp3, T16B, vtmp3, vtmp1);
5855   strq(vtmp3, Address(dst, -16));
5856 
5857   bind(done);
5858 }
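
// The zip1-with-zero trick above is the whole inflation step: with vtmp1
// holding zero, zip1 interleaves the source bytes with zero bytes, and
// the result, read as little-endian 16-bit lanes, is exactly the
// zero-extended chars. In C terms, 8 bytes at a time:
//
//    for (int i = 0; i < 8; i++)
//      dst[i] = (jchar)(src[i] & 0xff);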
5859 
5860 // Compress char[] array to byte[].
5861 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5862                                          FloatRegister tmp1Reg, FloatRegister tmp2Reg,
5863                                          FloatRegister tmp3Reg, FloatRegister tmp4Reg,
5864                                          Register result) {
5865   encode_iso_array(src, dst, len, result,
5866                    tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
5867   cmp(len, zr);
5868   csel(result, result, zr, EQ);
5869 }
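
// encode_iso_array() leaves the number of unprocessed chars in len and
// the stop index in result; the csel above converts that into the
// contract of StringUTF16.compress, conceptually:
//
//    result = (len == 0) ? result : 0;  // 0 signals a non-Latin-1 char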
5870 
5871 // get_thread() can be called anywhere inside generated code so we
5872 // need to save whatever non-callee save context might get clobbered
5873 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5874 // the call setup code.
5875 //
5876 // aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5877 //
5878 void MacroAssembler::get_thread(Register dst) {
5879   RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
5880   push(saved_regs, sp);
5881 
5882   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5883   blr(lr);
5884   if (dst != c_rarg0) {
5885     mov(dst, c_rarg0);
5886   }
5887 
5888   pop(saved_regs, sp);
5889 }
5890 
5891 void MacroAssembler::cache_wb(Address line) {
5892   assert(line.getMode() == Address::base_plus_offset, "mode should be base_plus_offset");
5893   assert(line.index() == noreg, "index should be noreg");
5894   assert(line.offset() == 0, "offset should be 0");
5895   // would like to assert this
5896   // assert(line._ext.shift == 0, "shift should be zero");
5897   if (VM_Version::supports_dcpop()) {
    // writeback using clean virtual address to point of persistence
5899     dc(Assembler::CVAP, line.base());
5900   } else {
5901     // no need to generate anything as Unsafe.writebackMemory should
5902     // never invoke this stub
5903   }
5904 }
5905 
5906 void MacroAssembler::cache_wbsync(bool is_pre) {
5907   // we only need a barrier post sync
5908   if (!is_pre) {
5909     membar(Assembler::AnyAny);
5910   }
5911 }