/*
 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"
#ifdef COMPILER1
#include "c1/c1_LIRAssembler.hpp"
#endif
#ifdef COMPILER2
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target-branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                    21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                   Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110  &&
               Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp & sp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
      str(last_java_pc, Address(rthread,
                                JavaThread::frame_anchor_offset()
                                + JavaFrameAnchor::last_Java_pc_offset()));
    }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  assert(last_java_pc != NULL, "must provide a valid PC");

  adr(scratch, last_java_pc);
  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 2Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

void MacroAssembler::reserved_stack_check() {
    // testing if reserved zone needs to be enabled
    Label no_reserved_zone_enabling;

    ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
    cmp(sp, rscratch1);
    br(Assembler::LO, no_reserved_zone_enabling);

    enter();   // LR and FP are live.
    lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
    mov(c_rarg0, rthread);
    blr(rscratch1);
    leave();

    // We have already removed our own frame.
    // throw_delayed_StackOverflowError will think that it's been
    // called by our caller.
    lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
    br(rscratch1);
    should_not_reach_here();

    bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}

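// Helpers that move a call argument into the corresponding C calling-convention
// register (c_rarg0..c_rarg3), skipping the move when the value is already there.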
static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg ) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg ) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg ) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg ) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                   + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

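// As get_vm_result above, but for the metadata slot (vm_result_2); the result
// is a Metadata* (Method* or Klass*), so no oop verification is performed.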
void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

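// Emit nops until the current code offset is a multiple of modulus.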
void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


void MacroAssembler:: notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler:: notify(type);
    // reset_last_Java_frame(true);
  }
  else
    Assembler:: notify(type);
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                         bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
     "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                           Register super_klass,
                           Register temp_reg,
                           Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                        RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// scans count 4 byte words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  mov(r0, reg);
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    lea(r0, addr);
    ldr(r0, Address(r0, 4 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1360 
1361   BLOCK_COMMENT("} verify_oop_addr");
1362 }
1363 
1364 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1365                                          int extra_slot_offset) {
1366   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1367   int stackElementSize = Interpreter::stackElementSize;
1368   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1369 #ifdef ASSERT
1370   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1371   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1372 #endif
1373   if (arg_slot.is_constant()) {
1374     return Address(esp, arg_slot.as_constant() * stackElementSize
1375                    + offset);
1376   } else {
1377     add(rscratch1, esp, arg_slot.as_register(),
1378         ext::uxtx, exact_log2(stackElementSize));
1379     return Address(rscratch1, offset);
1380   }
1381 }
1382 
1383 void MacroAssembler::call_VM_leaf_base(address entry_point,
1384                                        int number_of_arguments,
1385                                        Label *retaddr) {
1386   call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
1387 }
1388 
1389 void MacroAssembler::call_VM_leaf_base1(address entry_point,
1390                                         int number_of_gp_arguments,
1391                                         int number_of_fp_arguments,
1392                                         ret_type type,
1393                                         Label *retaddr) {
1394   Label E, L;
1395 
1396   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1397 
1398   // We add 1 to number_of_gp_arguments because the thread in arg0 is
1399   // not counted
1400   mov(rscratch1, entry_point);
1401   blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1402   if (retaddr)
1403     bind(*retaddr);
1404 
1405   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1406   maybe_isb();
1407 }
1408 
1409 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1410   call_VM_leaf_base(entry_point, number_of_arguments);
1411 }
1412 
1413 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1414   pass_arg0(this, arg_0);
1415   call_VM_leaf_base(entry_point, 1);
1416 }
1417 
1418 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1419   pass_arg0(this, arg_0);
1420   pass_arg1(this, arg_1);
1421   call_VM_leaf_base(entry_point, 2);
1422 }
1423 
1424 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1425                                   Register arg_1, Register arg_2) {
1426   pass_arg0(this, arg_0);
1427   pass_arg1(this, arg_1);
1428   pass_arg2(this, arg_2);
1429   call_VM_leaf_base(entry_point, 3);
1430 }
1431 
1432 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1433   pass_arg0(this, arg_0);
1434   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1435 }
1436 
1437 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1438 
1439   assert(arg_0 != c_rarg1, "smashed arg");
1440   pass_arg1(this, arg_1);
1441   pass_arg0(this, arg_0);
1442   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1443 }
1444 
1445 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1446   assert(arg_0 != c_rarg2, "smashed arg");
1447   assert(arg_1 != c_rarg2, "smashed arg");
1448   pass_arg2(this, arg_2);
1449   assert(arg_0 != c_rarg1, "smashed arg");
1450   pass_arg1(this, arg_1);
1451   pass_arg0(this, arg_0);
1452   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1453 }
1454 
1455 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1456   assert(arg_0 != c_rarg3, "smashed arg");
1457   assert(arg_1 != c_rarg3, "smashed arg");
1458   assert(arg_2 != c_rarg3, "smashed arg");
1459   pass_arg3(this, arg_3);
1460   assert(arg_0 != c_rarg2, "smashed arg");
1461   assert(arg_1 != c_rarg2, "smashed arg");
1462   pass_arg2(this, arg_2);
1463   assert(arg_0 != c_rarg1, "smashed arg");
1464   pass_arg1(this, arg_1);
1465   pass_arg0(this, arg_0);
1466   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1467 }
1468 
1469 void MacroAssembler::null_check(Register reg, int offset) {
1470   if (needs_explicit_null_check(offset)) {
1471     // provoke OS NULL exception if reg = NULL by
1472     // accessing M[reg] w/o changing any registers
1473     // NOTE: this is plenty to provoke a segv
1474     ldr(zr, Address(reg));
1475   } else {
1476     // nothing to do, (later) access of M[reg + offset]
1477     // will provoke OS NULL exception if reg = NULL
1478   }
1479 }
1480 
1481 // MacroAssembler protected routines needed to implement
1482 // public methods
1483 
1484 void MacroAssembler::mov(Register r, Address dest) {
1485   code_section()->relocate(pc(), dest.rspec());
1486   u_int64_t imm64 = (u_int64_t)dest.target();
1487   movptr(r, imm64);
1488 }
1489 
1490 // Move a constant pointer into r.  In AArch64 mode the virtual
1491 // address space is 48 bits in size, so we only need three
1492 // instructions to create a patchable instruction sequence that can
1493 // reach anywhere.
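// For example (a sketch, assuming imm64 == 0x0000123456789abc), the emitted
// sequence is:
//   movz r, #0x9abc
//   movk r, #0x5678, lsl #16
//   movk r, #0x1234, lsl #32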
1494 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1495 #ifndef PRODUCT
1496   {
1497     char buffer[64];
1498     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1499     block_comment(buffer);
1500   }
1501 #endif
1502   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1503   movz(r, imm64 & 0xffff);
1504   imm64 >>= 16;
1505   movk(r, imm64 & 0xffff, 16);
1506   imm64 >>= 16;
1507   movk(r, imm64 & 0xffff, 32);
1508 }
1509 
1510 // Macro to mov replicated immediate to vector register.
1511 //  Vd will get the following values for different arrangements in T
1512 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1513 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1514 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1515 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1516 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1517 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1518 //   T1D/T2D: invalid
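//  For example (a sketch): with T == T4S and imm32 == 0x00ab00cd, fewer bytes
//  of imm32 are non-zero than of ~imm32, so the movi path is taken:
//    movi(Vd, T4S, 0xcd, 0);
//    orri(Vd, T4S, 0xab, 16);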
1519 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1520   assert(T != T1D && T != T2D, "invalid arrangement");
1521   if (T == T8B || T == T16B) {
1522     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1523     movi(Vd, T, imm32 & 0xff, 0);
1524     return;
1525   }
1526   u_int32_t nimm32 = ~imm32;
1527   if (T == T4H || T == T8H) {
1528     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1529     imm32 &= 0xffff;
1530     nimm32 &= 0xffff;
1531   }
1532   u_int32_t x = imm32;
1533   int movi_cnt = 0;
1534   int movn_cnt = 0;
1535   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1536   x = nimm32;
1537   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1538   if (movn_cnt < movi_cnt) imm32 = nimm32;
1539   unsigned lsl = 0;
1540   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1541   if (movn_cnt < movi_cnt)
1542     mvni(Vd, T, imm32 & 0xff, lsl);
1543   else
1544     movi(Vd, T, imm32 & 0xff, lsl);
1545   imm32 >>= 8; lsl += 8;
1546   while (imm32) {
1547     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1548     if (movn_cnt < movi_cnt)
1549       bici(Vd, T, imm32 & 0xff, lsl);
1550     else
1551       orri(Vd, T, imm32 & 0xff, lsl);
1552     lsl += 8; imm32 >>= 8;
1553   }
1554 }
1555 
1556 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1557 {
1558 #ifndef PRODUCT
1559   {
1560     char buffer[64];
1561     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1562     block_comment(buffer);
1563   }
1564 #endif
1565   if (operand_valid_for_logical_immediate(false, imm64)) {
1566     orr(dst, zr, imm64);
1567   } else {
1568     // we can use a combination of MOVZ or MOVN with
1569     // MOVK to build up the constant
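    // For example (a sketch): imm64 == 0x00000000dead0000 has three zero
    // halfwords, so a single movz(dst, 0xdead, 16) suffices, while its
    // all-ones analogue 0xffffffff1234ffff needs only
    // movn(dst, 0x1234 ^ 0xffff, 16).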
1570     u_int64_t imm_h[4];
1571     int zero_count = 0;
1572     int neg_count = 0;
1573     int i;
1574     for (i = 0; i < 4; i++) {
1575       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1576       if (imm_h[i] == 0) {
1577         zero_count++;
1578       } else if (imm_h[i] == 0xffffL) {
1579         neg_count++;
1580       }
1581     }
1582     if (zero_count == 4) {
1583       // one MOVZ will do
1584       movz(dst, 0);
1585     } else if (neg_count == 4) {
1586       // one MOVN will do
1587       movn(dst, 0);
1588     } else if (zero_count == 3) {
1589       for (i = 0; i < 4; i++) {
1590         if (imm_h[i] != 0L) {
1591           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1592           break;
1593         }
1594       }
1595     } else if (neg_count == 3) {
1596       // one MOVN will do
1597       for (int i = 0; i < 4; i++) {
1598         if (imm_h[i] != 0xffffL) {
1599           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1600           break;
1601         }
1602       }
1603     } else if (zero_count == 2) {
1604       // one MOVZ and one MOVK will do
1605       for (i = 0; i < 3; i++) {
1606         if (imm_h[i] != 0L) {
1607           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1608           i++;
1609           break;
1610         }
1611       }
1612       for (;i < 4; i++) {
1613         if (imm_h[i] != 0L) {
1614           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1615         }
1616       }
1617     } else if (neg_count == 2) {
1618       // one MOVN and one MOVK will do
1619       for (i = 0; i < 4; i++) {
1620         if (imm_h[i] != 0xffffL) {
1621           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1622           i++;
1623           break;
1624         }
1625       }
1626       for (;i < 4; i++) {
1627         if (imm_h[i] != 0xffffL) {
1628           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1629         }
1630       }
1631     } else if (zero_count == 1) {
1632       // one MOVZ and two MOVKs will do
1633       for (i = 0; i < 4; i++) {
1634         if (imm_h[i] != 0L) {
1635           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1636           i++;
1637           break;
1638         }
1639       }
1640       for (;i < 4; i++) {
1641         if (imm_h[i] != 0x0L) {
1642           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1643         }
1644       }
1645     } else if (neg_count == 1) {
1646       // one MOVN and two MOVKs will do
1647       for (i = 0; i < 4; i++) {
1648         if (imm_h[i] != 0xffffL) {
1649           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1650           i++;
1651           break;
1652         }
1653       }
1654       for (;i < 4; i++) {
1655         if (imm_h[i] != 0xffffL) {
1656           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1657         }
1658       }
1659     } else {
1660       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1661       movz(dst, (u_int32_t)imm_h[0], 0);
1662       for (i = 1; i < 4; i++) {
1663         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1664       }
1665     }
1666   }
1667 }
1668 
1669 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1670 {
1671 #ifndef PRODUCT
1672   {
1673     char buffer[64];
1674     snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1675     block_comment(buffer);
1676   }
1677 #endif
1678   if (operand_valid_for_logical_immediate(true, imm32)) {
1679     orrw(dst, zr, imm32);
1680   } else {
1681     // we can use MOVZ or MOVN alone, or a MOVZ followed by a MOVK,
1682     // to build up the constant
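    // For example (a sketch): imm32 == 0x12340000 has a zero low halfword,
    // so a single movzw(dst, 0x1234, 16) is emitted.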
1683     u_int32_t imm_h[2];
1684     imm_h[0] = imm32 & 0xffff;
1685     imm_h[1] = ((imm32 >> 16) & 0xffff);
1686     if (imm_h[0] == 0) {
1687       movzw(dst, imm_h[1], 16);
1688     } else if (imm_h[0] == 0xffff) {
1689       movnw(dst, imm_h[1] ^ 0xffff, 16);
1690     } else if (imm_h[1] == 0) {
1691       movzw(dst, imm_h[0], 0);
1692     } else if (imm_h[1] == 0xffff) {
1693       movnw(dst, imm_h[0] ^ 0xffff, 0);
1694     } else {
1695       // use a MOVZ and MOVK (makes it easier to debug)
1696       movzw(dst, imm_h[0], 0);
1697       movkw(dst, imm_h[1], 16);
1698     }
1699   }
1700 }
1701 
1702 // Form an address from base + offset in Rd.  Rd may or may
1703 // not actually be used: you must use the Address that is returned.
1704 // It is up to you to ensure that the shift provided matches the size
1705 // of your data.
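// For example (a sketch): with shift == 3, byte_offset == 0x43210 does not
// fit a scaled 12-bit immediate, so this emits add(Rd, base, 0x40000) and
// returns Address(Rd, 0x3210).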
1706 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1707   if (Address::offset_ok_for_immed(byte_offset, shift))
1708     // It fits; no need for any heroics
1709     return Address(base, byte_offset);
1710 
1711   // Don't do anything clever with negative or misaligned offsets
1712   unsigned mask = (1 << shift) - 1;
1713   if (byte_offset < 0 || byte_offset & mask) {
1714     mov(Rd, byte_offset);
1715     add(Rd, base, Rd);
1716     return Address(Rd);
1717   }
1718 
1719   // See if we can do this with two 12-bit offsets
1720   {
1721     unsigned long word_offset = byte_offset >> shift;
1722     unsigned long masked_offset = word_offset & 0xfff000;
1723     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1724         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1725       add(Rd, base, masked_offset << shift);
1726       word_offset -= masked_offset;
1727       return Address(Rd, word_offset << shift);
1728     }
1729   }
1730 
1731   // Do it the hard way
1732   mov(Rd, byte_offset);
1733   add(Rd, base, Rd);
1734   return Address(Rd);
1735 }
1736 
1737 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1738   if (UseLSE) {
1739     mov(tmp, 1);
1740     ldadd(Assembler::word, tmp, zr, counter_addr);
1741     return;
1742   }
1743   Label retry_load;
1744   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1745     prfm(Address(counter_addr), PSTL1STRM);
1746   bind(retry_load);
1747   // flush and load exclusive from the memory location
1748   ldxrw(tmp, counter_addr);
1749   addw(tmp, tmp, 1);
1750   // if we store+flush with no intervening write tmp will be zero
1751   stxrw(tmp2, tmp, counter_addr);
1752   cbnzw(tmp2, retry_load);
1753 }
1754 
1755 
1756 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1757                                     bool want_remainder, Register scratch)
1758 {
1759   // Full implementation of Java idiv and irem.  The function
1760   // returns the (pc) offset of the div instruction - may be needed
1761   // for implicit exceptions.
1762   //
1763   // constraint : ra/rb =/= scratch
1764   //         normal case
1765   //
1766   // input : ra: dividend
1767   //         rb: divisor
1768   //
1769   // result: either
1770   //         quotient  (= ra idiv rb)
1771   //         remainder (= ra irem rb)
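  //
  // For example (a sketch): corrected_idivl(r0, r1, r2, /*want_remainder*/true, r3)
  // emits sdivw r3, r1, r2 followed by msubw r0, r3, r2, r1,
  // i.e. r0 = r1 - (r1 / r2) * r2.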
1772 
1773   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1774 
1775   int idivl_offset = offset();
1776   if (! want_remainder) {
1777     sdivw(result, ra, rb);
1778   } else {
1779     sdivw(scratch, ra, rb);
1780     Assembler::msubw(result, scratch, rb, ra);
1781   }
1782 
1783   return idivl_offset;
1784 }
1785 
1786 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1787                                     bool want_remainder, Register scratch)
1788 {
1789   // Full implementation of Java ldiv and lrem.  The function
1790   // returns the (pc) offset of the div instruction - may be needed
1791   // for implicit exceptions.
1792   //
1793   // constraint : ra/rb =/= scratch
1794   //         normal case
1795   //
1796   // input : ra: dividend
1797   //         rb: divisor
1798   //
1799   // result: either
1800   //         quotient  (= ra idiv rb)
1801   //         remainder (= ra irem rb)
1802 
1803   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1804 
1805   int idivq_offset = offset();
1806   if (! want_remainder) {
1807     sdiv(result, ra, rb);
1808   } else {
1809     sdiv(scratch, ra, rb);
1810     Assembler::msub(result, scratch, rb, ra);
1811   }
1812 
1813   return idivq_offset;
1814 }
1815 
1816 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1817   address prev = pc() - NativeMembar::instruction_size;
1818   address last = code()->last_insn();
1819   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1820     NativeMembar *bar = NativeMembar_at(prev);
1821     // We are merging two memory barrier instructions.  On AArch64 we
1822     // can do this simply by ORing them together.
1823     bar->set_kind(bar->get_kind() | order_constraint);
1824     BLOCK_COMMENT("merged membar");
1825   } else {
1826     code()->set_last_insn(pc());
1827     dmb(Assembler::barrier(order_constraint));
1828   }
1829 }
1830 
1831 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1832   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1833     merge_ldst(rt, adr, size_in_bytes, is_store);
1834     code()->clear_last_insn();
1835     return true;
1836   } else {
1837     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8-byte or 4-byte load/store is supported.");
1838     const unsigned mask = size_in_bytes - 1;
1839     if (adr.getMode() == Address::base_plus_offset &&
1840         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1841       code()->set_last_insn(pc());
1842     }
1843     return false;
1844   }
1845 }
1846 
1847 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1848   // We always try to merge two adjacent loads into one ldp.
1849   if (!try_merge_ldst(Rx, adr, 8, false)) {
1850     Assembler::ldr(Rx, adr);
1851   }
1852 }
1853 
1854 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1855   // We always try to merge two adjacent loads into one ldp.
1856   if (!try_merge_ldst(Rw, adr, 4, false)) {
1857     Assembler::ldrw(Rw, adr);
1858   }
1859 }
1860 
1861 void MacroAssembler::str(Register Rx, const Address &adr) {
1862   // We always try to merge two adjacent stores into one stp.
1863   if (!try_merge_ldst(Rx, adr, 8, true)) {
1864     Assembler::str(Rx, adr);
1865   }
1866 }
1867 
1868 void MacroAssembler::strw(Register Rw, const Address &adr) {
1869   // We always try to merge two adjacent stores into one stp.
1870   if (!try_merge_ldst(Rw, adr, 4, true)) {
1871     Assembler::strw(Rw, adr);
1872   }
1873 }
1874 
1875 // MacroAssembler routines found actually to be needed
1876 
1877 void MacroAssembler::push(Register src)
1878 {
1879   str(src, Address(pre(esp, -1 * wordSize)));
1880 }
1881 
1882 void MacroAssembler::pop(Register dst)
1883 {
1884   ldr(dst, Address(post(esp, 1 * wordSize)));
1885 }
1886 
1887 // Note: load_unsigned_short used to be called load_unsigned_word.
1888 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1889   int off = offset();
1890   ldrh(dst, src);
1891   return off;
1892 }
1893 
1894 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1895   int off = offset();
1896   ldrb(dst, src);
1897   return off;
1898 }
1899 
1900 int MacroAssembler::load_signed_short(Register dst, Address src) {
1901   int off = offset();
1902   ldrsh(dst, src);
1903   return off;
1904 }
1905 
1906 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1907   int off = offset();
1908   ldrsb(dst, src);
1909   return off;
1910 }
1911 
1912 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1913   int off = offset();
1914   ldrshw(dst, src);
1915   return off;
1916 }
1917 
1918 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1919   int off = offset();
1920   ldrsbw(dst, src);
1921   return off;
1922 }
1923 
1924 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1925   switch (size_in_bytes) {
1926   case  8:  ldr(dst, src); break;
1927   case  4:  ldrw(dst, src); break;
1928   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1929   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1930   default:  ShouldNotReachHere();
1931   }
1932 }
1933 
1934 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1935   switch (size_in_bytes) {
1936   case  8:  str(src, dst); break;
1937   case  4:  strw(src, dst); break;
1938   case  2:  strh(src, dst); break;
1939   case  1:  strb(src, dst); break;
1940   default:  ShouldNotReachHere();
1941   }
1942 }
1943 
1944 void MacroAssembler::decrementw(Register reg, int value)
1945 {
1946   if (value < 0)  { incrementw(reg, -value);      return; }
1947   if (value == 0) {                               return; }
1948   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1949   /* else */ {
1950     guarantee(reg != rscratch2, "invalid dst for register decrement");
1951     movw(rscratch2, (unsigned)value);
1952     subw(reg, reg, rscratch2);
1953   }
1954 }
1955 
1956 void MacroAssembler::decrement(Register reg, int value)
1957 {
1958   if (value < 0)  { increment(reg, -value);      return; }
1959   if (value == 0) {                              return; }
1960   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1961   /* else */ {
1962     assert(reg != rscratch2, "invalid dst for register decrement");
1963     mov(rscratch2, (unsigned long)value);
1964     sub(reg, reg, rscratch2);
1965   }
1966 }
1967 
1968 void MacroAssembler::decrementw(Address dst, int value)
1969 {
1970   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1971   if (dst.getMode() == Address::literal) {
1972     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1973     lea(rscratch2, dst);
1974     dst = Address(rscratch2);
1975   }
1976   ldrw(rscratch1, dst);
1977   decrementw(rscratch1, value);
1978   strw(rscratch1, dst);
1979 }
1980 
1981 void MacroAssembler::decrement(Address dst, int value)
1982 {
1983   assert(!dst.uses(rscratch1), "invalid address for decrement");
1984   if (dst.getMode() == Address::literal) {
1985     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1986     lea(rscratch2, dst);
1987     dst = Address(rscratch2);
1988   }
1989   ldr(rscratch1, dst);
1990   decrement(rscratch1, value);
1991   str(rscratch1, dst);
1992 }
1993 
1994 void MacroAssembler::incrementw(Register reg, int value)
1995 {
1996   if (value < 0)  { decrementw(reg, -value);      return; }
1997   if (value == 0) {                               return; }
1998   if (value < (1 << 12)) { addw(reg, reg, value); return; }
1999   /* else */ {
2000     assert(reg != rscratch2, "invalid dst for register increment");
2001     movw(rscratch2, (unsigned)value);
2002     addw(reg, reg, rscratch2);
2003   }
2004 }
2005 
2006 void MacroAssembler::increment(Register reg, int value)
2007 {
2008   if (value < 0)  { decrement(reg, -value);      return; }
2009   if (value == 0) {                              return; }
2010   if (value < (1 << 12)) { add(reg, reg, value); return; }
2011   /* else */ {
2012     assert(reg != rscratch2, "invalid dst for register increment");
2013     movw(rscratch2, (unsigned)value);
2014     add(reg, reg, rscratch2);
2015   }
2016 }
2017 
2018 void MacroAssembler::incrementw(Address dst, int value)
2019 {
2020   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2021   if (dst.getMode() == Address::literal) {
2022     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2023     lea(rscratch2, dst);
2024     dst = Address(rscratch2);
2025   }
2026   ldrw(rscratch1, dst);
2027   incrementw(rscratch1, value);
2028   strw(rscratch1, dst);
2029 }
2030 
2031 void MacroAssembler::increment(Address dst, int value)
2032 {
2033   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2034   if (dst.getMode() == Address::literal) {
2035     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2036     lea(rscratch2, dst);
2037     dst = Address(rscratch2);
2038   }
2039   ldr(rscratch1, dst);
2040   increment(rscratch1, value);
2041   str(rscratch1, dst);
2042 }
2043 
2044 
2045 void MacroAssembler::pusha() {
2046   push(0x7fffffff, sp);
2047 }
2048 
2049 void MacroAssembler::popa() {
2050   pop(0x7fffffff, sp);
2051 }
2052 
2053 // Push lots of registers in the bit set supplied.  Don't push sp.
2054 // Return the number of words pushed
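// For example (a sketch): a bitset naming r2, r3 and r5 is padded with zr to
// an even count, so the emitted sequence is
//   stp(r2, r3, Address(pre(stack, -4 * wordSize)));
//   stp(r5, zr, Address(stack, 2 * wordSize));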
2055 int MacroAssembler::push(unsigned int bitset, Register stack) {
2056   int words_pushed = 0;
2057 
2058   // Scan bitset to accumulate register pairs
2059   unsigned char regs[32];
2060   int count = 0;
2061   for (int reg = 0; reg <= 30; reg++) {
2062     if (1 & bitset)
2063       regs[count++] = reg;
2064     bitset >>= 1;
2065   }
2066   regs[count++] = zr->encoding_nocheck();
2067   count &= ~1;  // Only push an even number of regs
2068 
2069   if (count) {
2070     stp(as_Register(regs[0]), as_Register(regs[1]),
2071        Address(pre(stack, -count * wordSize)));
2072     words_pushed += 2;
2073   }
2074   for (int i = 2; i < count; i += 2) {
2075     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2076        Address(stack, i * wordSize));
2077     words_pushed += 2;
2078   }
2079 
2080   assert(words_pushed == count, "oops, pushed != count");
2081 
2082   return count;
2083 }
2084 
2085 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2086   int words_pushed = 0;
2087 
2088   // Scan bitset to accumulate register pairs
2089   unsigned char regs[32];
2090   int count = 0;
2091   for (int reg = 0; reg <= 30; reg++) {
2092     if (1 & bitset)
2093       regs[count++] = reg;
2094     bitset >>= 1;
2095   }
2096   regs[count++] = zr->encoding_nocheck();
2097   count &= ~1;
2098 
2099   for (int i = 2; i < count; i += 2) {
2100     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2101        Address(stack, i * wordSize));
2102     words_pushed += 2;
2103   }
2104   if (count) {
2105     ldp(as_Register(regs[0]), as_Register(regs[1]),
2106        Address(post(stack, count * wordSize)));
2107     words_pushed += 2;
2108   }
2109 
2110   assert(words_pushed == count, "oops, pushed != count");
2111 
2112   return count;
2113 }
2114 #ifdef ASSERT
2115 void MacroAssembler::verify_heapbase(const char* msg) {
2116 #if 0
2117   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2118   assert (Universe::heap() != NULL, "java heap should be initialized");
2119   if (CheckCompressedOops) {
2120     Label ok;
2121     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2122     cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2123     br(Assembler::EQ, ok);
2124     stop(msg);
2125     bind(ok);
2126     pop(1 << rscratch1->encoding(), sp);
2127   }
2128 #endif
2129 }
2130 #endif
2131 
2132 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2133   Label done, not_weak;
2134   cbz(value, done);           // Use NULL as-is.
2135 
2136   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
2137   tbz(r0, 0, not_weak);    // Test for jweak tag.
2138 
2139   // Resolve jweak.
2140   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2141                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2142   verify_oop(value);
2143   b(done);
2144 
2145   bind(not_weak);
2146   // Resolve (untagged) jobject.
2147   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2148   verify_oop(value);
2149   bind(done);
2150 }
2151 
2152 void MacroAssembler::stop(const char* msg) {
2153   address ip = pc();
2154   pusha();
2155   mov(c_rarg0, (address)msg);
2156   mov(c_rarg1, (address)ip);
2157   mov(c_rarg2, sp);
2158   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2159   // call(c_rarg3);
2160   blrt(c_rarg3, 3, 0, 1);
2161   hlt(0);
2162 }
2163 
2164 void MacroAssembler::warn(const char* msg) {
2165   pusha();
2166   mov(c_rarg0, (address)msg);
2167   mov(lr, CAST_FROM_FN_PTR(address, warning));
2168   blrt(lr, 1, 0, MacroAssembler::ret_type_void);
2169   popa();
2170 }
2171 
2172 void MacroAssembler::unimplemented(const char* what) {
2173   const char* buf = NULL;
2174   {
2175     ResourceMark rm;
2176     stringStream ss;
2177     ss.print("unimplemented: %s", what);
2178     buf = code_string(ss.as_string());
2179   }
2180   stop(buf);
2181 }
2182 
2183 // If a constant does not fit in an immediate field, generate some
2184 // number of MOV instructions and then perform the operation.
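// For example (a sketch): an add with imm == 0x123456 does not fit a 12-bit
// (optionally shifted) immediate, so it is split into
//   add(Rd, Rn, 0x123000); add(Rd, Rd, 0x456);
// whereas a constant too large for that is first materialized into Rd with
// mov and then combined as a register operand.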
2185 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2186                                            add_sub_imm_insn insn1,
2187                                            add_sub_reg_insn insn2) {
2188   assert(Rd != zr, "Rd = zr and not setting flags?");
2189   if (operand_valid_for_add_sub_immediate((int)imm)) {
2190     (this->*insn1)(Rd, Rn, imm);
2191   } else {
2192     if (uabs(imm) < (1 << 24)) {
2193        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2194        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2195     } else {
2196        assert_different_registers(Rd, Rn);
2197        mov(Rd, (uint64_t)imm);
2198        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2199     }
2200   }
2201 }
2202 
2203 // Separate version which sets the flags. Optimisations are more restricted
2204 // because we must set the flags correctly.
2205 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2206                                            add_sub_imm_insn insn1,
2207                                            add_sub_reg_insn insn2) {
2208   if (operand_valid_for_add_sub_immediate((int)imm)) {
2209     (this->*insn1)(Rd, Rn, imm);
2210   } else {
2211     assert_different_registers(Rd, Rn);
2212     assert(Rd != zr, "overflow in immediate operand");
2213     mov(Rd, (uint64_t)imm);
2214     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2215   }
2216 }
2217 
2218 
2219 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2220   if (increment.is_register()) {
2221     add(Rd, Rn, increment.as_register());
2222   } else {
2223     add(Rd, Rn, increment.as_constant());
2224   }
2225 }
2226 
2227 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2228   if (increment.is_register()) {
2229     addw(Rd, Rn, increment.as_register());
2230   } else {
2231     addw(Rd, Rn, increment.as_constant());
2232   }
2233 }
2234 
2235 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2236   if (decrement.is_register()) {
2237     sub(Rd, Rn, decrement.as_register());
2238   } else {
2239     sub(Rd, Rn, decrement.as_constant());
2240   }
2241 }
2242 
2243 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2244   if (decrement.is_register()) {
2245     subw(Rd, Rn, decrement.as_register());
2246   } else {
2247     subw(Rd, Rn, decrement.as_constant());
2248   }
2249 }
2250 
2251 void MacroAssembler::reinit_heapbase()
2252 {
2253   if (UseCompressedOops) {
2254     if (Universe::is_fully_initialized()) {
2255       mov(rheapbase, Universe::narrow_ptrs_base());
2256     } else {
2257       lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2258       ldr(rheapbase, Address(rheapbase));
2259     }
2260   }
2261 }
2262 
2263 // this simulates the behaviour of the x86 cmpxchg instruction using a
2264 // load linked/store conditional pair. we use the acquire/release
2265 // versions of these instructions so that we flush pending writes as
2266 // per Java semantics.
2267 
2268 // n.b the x86 version assumes the old value to be compared against is
2269 // in rax and updates rax with the value located in memory if the
2270 // cmpxchg fails. we supply a register for the old value explicitly
2271 
2272 // the aarch64 load linked/store conditional instructions do not
2273 // accept an offset. so, unlike x86, we must provide a plain register
2274 // to identify the memory word to be compared/exchanged rather than a
2275 // register+offset Address.
2276 
2277 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2278                                 Label &succeed, Label *fail) {
2279   // oldv holds comparison value
2280   // newv holds value to write in exchange
2281   // addr identifies memory word to compare against/update
2282   if (UseLSE) {
2283     mov(tmp, oldv);
2284     casal(Assembler::xword, oldv, newv, addr);
2285     cmp(tmp, oldv);
2286     br(Assembler::EQ, succeed);
2287     membar(AnyAny);
2288   } else {
2289     Label retry_load, nope;
2290     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2291       prfm(Address(addr), PSTL1STRM);
2292     bind(retry_load);
2293     // flush and load exclusive from the memory location
2294     // and fail if it is not what we expect
2295     ldaxr(tmp, addr);
2296     cmp(tmp, oldv);
2297     br(Assembler::NE, nope);
2298     // if we store+flush with no intervening write tmp will be zero
2299     stlxr(tmp, newv, addr);
2300     cbzw(tmp, succeed);
2301     // retry so we only ever return after a load fails to compare
2302     // ensures we don't return a stale value after a failed write.
2303     b(retry_load);
2304     // if the memory word differs we return it in oldv and signal a fail
2305     bind(nope);
2306     membar(AnyAny);
2307     mov(oldv, tmp);
2308   }
2309   if (fail)
2310     b(*fail);
2311 }
2312 
2313 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2314                                         Label &succeed, Label *fail) {
2315   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2316   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2317 }
2318 
2319 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2320                                 Label &succeed, Label *fail) {
2321   // oldv holds comparison value
2322   // newv holds value to write in exchange
2323   // addr identifies memory word to compare against/update
2324   // tmp returns 0/1 for success/failure
2325   if (UseLSE) {
2326     mov(tmp, oldv);
2327     casal(Assembler::word, oldv, newv, addr);
2328     cmp(tmp, oldv);
2329     br(Assembler::EQ, succeed);
2330     membar(AnyAny);
2331   } else {
2332     Label retry_load, nope;
2333     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2334       prfm(Address(addr), PSTL1STRM);
2335     bind(retry_load);
2336     // flush and load exclusive from the memory location
2337     // and fail if it is not what we expect
2338     ldaxrw(tmp, addr);
2339     cmp(tmp, oldv);
2340     br(Assembler::NE, nope);
2341     // if we store+flush with no intervening write tmp will be zero
2342     stlxrw(tmp, newv, addr);
2343     cbzw(tmp, succeed);
2344     // retry so we only ever return after a load fails to compare
2345     // ensures we don't return a stale value after a failed write.
2346     b(retry_load);
2347     // if the memory word differs we return it in oldv and signal a fail
2348     bind(nope);
2349     membar(AnyAny);
2350     mov(oldv, tmp);
2351   }
2352   if (fail)
2353     b(*fail);
2354 }
2355 
2356 // A generic CAS; success or failure is in the EQ flag.  A weak CAS
2357 // doesn't retry and may fail spuriously.  If the oldval is wanted,
2358 // Pass a register for the result, otherwise pass noreg.
2359 
2360 // Clobbers rscratch1
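//
// A typical use (a sketch): cmpxchg(addr, expected, new_val, Assembler::xword,
// /*acquire*/true, /*release*/true, /*weak*/false, noreg), then branch on the
// EQ flag, e.g. br(Assembler::EQ, succeed).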
2361 void MacroAssembler::cmpxchg(Register addr, Register expected,
2362                              Register new_val,
2363                              enum operand_size size,
2364                              bool acquire, bool release,
2365                              bool weak,
2366                              Register result) {
2367   if (result == noreg)  result = rscratch1;
2368   BLOCK_COMMENT("cmpxchg {");
2369   if (UseLSE) {
2370     mov(result, expected);
2371     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2372     compare_eq(result, expected, size);
2373   } else {
2374     Label retry_load, done;
2375     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2376       prfm(Address(addr), PSTL1STRM);
2377     bind(retry_load);
2378     load_exclusive(result, addr, size, acquire);
2379     compare_eq(result, expected, size);
2380     br(Assembler::NE, done);
2381     store_exclusive(rscratch1, new_val, addr, size, release);
2382     if (weak) {
2383       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2384     } else {
2385       cbnzw(rscratch1, retry_load);
2386     }
2387     bind(done);
2388   }
2389   BLOCK_COMMENT("} cmpxchg");
2390 }
2391 
2392 // A generic comparison. Only compares for equality, clobbers rscratch1.
2393 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2394   if (size == xword) {
2395     cmp(rm, rn);
2396   } else if (size == word) {
2397     cmpw(rm, rn);
2398   } else if (size == halfword) {
2399     eorw(rscratch1, rm, rn);
2400     ands(zr, rscratch1, 0xffff);
2401   } else if (size == byte) {
2402     eorw(rscratch1, rm, rn);
2403     ands(zr, rscratch1, 0xff);
2404   } else {
2405     ShouldNotReachHere();
2406   }
2407 }
2408 
2409 
2410 static bool different(Register a, RegisterOrConstant b, Register c) {
2411   if (b.is_constant())
2412     return a != c;
2413   else
2414     return a != b.as_register() && a != c && b.as_register() != c;
2415 }
2416 
2417 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2418 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2419   if (UseLSE) {                                                         \
2420     prev = prev->is_valid() ? prev : zr;                                \
2421     if (incr.is_register()) {                                           \
2422       AOP(sz, incr.as_register(), prev, addr);                          \
2423     } else {                                                            \
2424       mov(rscratch2, incr.as_constant());                               \
2425       AOP(sz, rscratch2, prev, addr);                                   \
2426     }                                                                   \
2427     return;                                                             \
2428   }                                                                     \
2429   Register result = rscratch2;                                          \
2430   if (prev->is_valid())                                                 \
2431     result = different(prev, incr, addr) ? prev : rscratch2;            \
2432                                                                         \
2433   Label retry_load;                                                     \
2434   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2435     prfm(Address(addr), PSTL1STRM);                                     \
2436   bind(retry_load);                                                     \
2437   LDXR(result, addr);                                                   \
2438   OP(rscratch1, result, incr);                                          \
2439   STXR(rscratch2, rscratch1, addr);                                     \
2440   cbnzw(rscratch2, retry_load);                                         \
2441   if (prev->is_valid() && prev != result) {                             \
2442     IOP(prev, rscratch1, incr);                                         \
2443   }                                                                     \
2444 }
2445 
2446 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2447 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2448 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2449 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2450 
2451 #undef ATOMIC_OP
2452 
2453 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2454 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2455   if (UseLSE) {                                                         \
2456     prev = prev->is_valid() ? prev : zr;                                \
2457     AOP(sz, newv, prev, addr);                                          \
2458     return;                                                             \
2459   }                                                                     \
2460   Register result = rscratch2;                                          \
2461   if (prev->is_valid())                                                 \
2462     result = different(prev, newv, addr) ? prev : rscratch2;            \
2463                                                                         \
2464   Label retry_load;                                                     \
2465   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2466     prfm(Address(addr), PSTL1STRM);                                     \
2467   bind(retry_load);                                                     \
2468   LDXR(result, addr);                                                   \
2469   STXR(rscratch1, newv, addr);                                          \
2470   cbnzw(rscratch1, retry_load);                                         \
2471   if (prev->is_valid() && prev != result)                               \
2472     mov(prev, result);                                                  \
2473 }
2474 
2475 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2476 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2477 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2478 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2479 
2480 #undef ATOMIC_XCHG
2481 
2482 #ifndef PRODUCT
2483 extern "C" void findpc(intptr_t x);
2484 #endif
2485 
2486 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2487 {
2488   // In order to get locks to work, we need to fake an in_VM state
2489   if (ShowMessageBoxOnError ) {
2490     JavaThread* thread = JavaThread::current();
2491     JavaThreadState saved_state = thread->thread_state();
2492     thread->set_thread_state(_thread_in_vm);
2493 #ifndef PRODUCT
2494     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2495       ttyLocker ttyl;
2496       BytecodeCounter::print();
2497     }
2498 #endif
2499     if (os::message_box(msg, "Execution stopped, print registers?")) {
2500       ttyLocker ttyl;
2501       tty->print_cr(" pc = 0x%016lx", pc);
2502 #ifndef PRODUCT
2503       tty->cr();
2504       findpc(pc);
2505       tty->cr();
2506 #endif
2507       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2508       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2509       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2510       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2511       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2512       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2513       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2514       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2515       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2516       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2517       tty->print_cr("r10 = 0x%016lx", regs[10]);
2518       tty->print_cr("r11 = 0x%016lx", regs[11]);
2519       tty->print_cr("r12 = 0x%016lx", regs[12]);
2520       tty->print_cr("r13 = 0x%016lx", regs[13]);
2521       tty->print_cr("r14 = 0x%016lx", regs[14]);
2522       tty->print_cr("r15 = 0x%016lx", regs[15]);
2523       tty->print_cr("r16 = 0x%016lx", regs[16]);
2524       tty->print_cr("r17 = 0x%016lx", regs[17]);
2525       tty->print_cr("r18 = 0x%016lx", regs[18]);
2526       tty->print_cr("r19 = 0x%016lx", regs[19]);
2527       tty->print_cr("r20 = 0x%016lx", regs[20]);
2528       tty->print_cr("r21 = 0x%016lx", regs[21]);
2529       tty->print_cr("r22 = 0x%016lx", regs[22]);
2530       tty->print_cr("r23 = 0x%016lx", regs[23]);
2531       tty->print_cr("r24 = 0x%016lx", regs[24]);
2532       tty->print_cr("r25 = 0x%016lx", regs[25]);
2533       tty->print_cr("r26 = 0x%016lx", regs[26]);
2534       tty->print_cr("r27 = 0x%016lx", regs[27]);
2535       tty->print_cr("r28 = 0x%016lx", regs[28]);
2536       tty->print_cr("r30 = 0x%016lx", regs[30]);
2537       tty->print_cr("r31 = 0x%016lx", regs[31]);
2538       BREAKPOINT;
2539     }
2540     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2541   } else {
2542     ttyLocker ttyl;
2543     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2544                     msg);
2545     assert(false, "DEBUG MESSAGE: %s", msg);
2546   }
2547 }
2548 
2549 #ifdef BUILTIN_SIM
2550 // routine to generate an x86 prolog for a stub function which
2551 // bootstraps into the generated ARM code which directly follows the
2552 // stub
2553 //
2554 // the argument encodes the number of general and fp registers
2555 // passed by the caller and the calling convention (currently just
2556 // the number of general registers and assumes C argument passing)
2557 
2558 extern "C" {
2559 int aarch64_stub_prolog_size();
2560 void aarch64_stub_prolog();
2561 void aarch64_prolog();
2562 }
2563 
2564 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
2565                                    address *prolog_ptr)
2566 {
2567   int calltype = (((ret_type & 0x3) << 8) |
2568                   ((fp_arg_count & 0xf) << 4) |
2569                   (gp_arg_count & 0xf));
2570 
2571   // the addresses for the x86 to ARM entry code we need to use
2572   address start = pc();
2573   // printf("start = %lx\n", start);
2574   int byteCount =  aarch64_stub_prolog_size();
2575   // printf("byteCount = %x\n", byteCount);
2576   int instructionCount = (byteCount + 3)/ 4;
2577   // printf("instructionCount = %x\n", instructionCount);
2578   for (int i = 0; i < instructionCount; i++) {
2579     nop();
2580   }
2581 
2582   memcpy(start, (void*)aarch64_stub_prolog, byteCount);
2583 
2584   // write the address of the setup routine and the call format at the
2585   // end of the copied code
2586   u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
2587   if (prolog_ptr)
2588     patch_end[-2] = (u_int64_t)prolog_ptr;
2589   patch_end[-1] = calltype;
2590 }
2591 #endif
2592 
2593 void MacroAssembler::push_call_clobbered_registers() {
2594   int step = 4 * wordSize;
2595   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2596   sub(sp, sp, step);
2597   mov(rscratch1, -step);
2598   // Push v0-v7, v16-v31.
2599   for (int i = 31; i >= 4; i -= 4) {
2600     if (i <= v7->encoding() || i >= v16->encoding())
2601       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2602           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2603   }
2604   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2605       as_FloatRegister(3), T1D, Address(sp));
2606 }
2607 
2608 void MacroAssembler::pop_call_clobbered_registers() {
2609   for (int i = 0; i < 32; i += 4) {
2610     if (i <= v7->encoding() || i >= v16->encoding())
2611       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2612           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2613   }
2614 
2615   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2616 }
2617 
2618 void MacroAssembler::push_CPU_state(bool save_vectors) {
2619   int step = (save_vectors ? 8 : 4) * wordSize;
2620   push(0x3fffffff, sp);         // integer registers except lr & sp
2621   mov(rscratch1, -step);
2622   sub(sp, sp, step);
2623   for (int i = 28; i >= 4; i -= 4) {
2624     st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2625         as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2626   }
2627   st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2628 }
2629 
2630 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2631   int step = (restore_vectors ? 8 : 4) * wordSize;
2632   for (int i = 0; i <= 28; i += 4)
2633     ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2634         as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2635   pop(0x3fffffff, sp);         // integer registers except lr & sp
2636 }
2637 
2638 /**
2639  * Helpers for multiply_to_len().
2640  */
2641 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2642                                      Register src1, Register src2) {
2643   adds(dest_lo, dest_lo, src1);
2644   adc(dest_hi, dest_hi, zr);
2645   adds(dest_lo, dest_lo, src2);
2646   adc(final_dest_hi, dest_hi, zr);
2647 }
2648 
2649 // Generate an address from (r + r1 extend offset).  "size" is the
2650 // size of the operand.  The result may be in rscratch2.
2651 Address MacroAssembler::offsetted_address(Register r, Register r1,
2652                                           Address::extend ext, int offset, int size) {
2653   if (offset || (ext.shift() % size != 0)) {
2654     lea(rscratch2, Address(r, r1, ext));
2655     return Address(rscratch2, offset);
2656   } else {
2657     return Address(r, r1, ext);
2658   }
2659 }
2660 
2661 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2662 {
2663   assert(offset >= 0, "spill to negative address?");
2664   // Offset reachable ?
2665   //   Not aligned - 9 bits signed offset
2666   //   Aligned - 12 bits unsigned offset shifted
2667   Register base = sp;
2668   if ((offset & (size-1)) && offset >= (1<<8)) {
2669     add(tmp, base, offset & ((1<<12)-1));
2670     base = tmp;
2671     offset &= -1<<12;
2672   }
2673 
2674   if (offset >= (1<<12) * size) {
2675     add(tmp, base, offset & (((1<<12)-1)<<12));
2676     base = tmp;
2677     offset &= ~(((1<<12)-1)<<12);
2678   }
2679 
2680   return Address(base, offset);
2681 }
2682 
2683 // Checks whether offset is aligned.
2684 // Returns true if it is, else false.
2685 bool MacroAssembler::merge_alignment_check(Register base,
2686                                            size_t size,
2687                                            long cur_offset,
2688                                            long prev_offset) const {
2689   if (AvoidUnalignedAccesses) {
2690     if (base == sp) {
2691       // Checks whether the low offset is aligned to a pair of registers.
2692       long pair_mask = size * 2 - 1;
2693       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2694       return (offset & pair_mask) == 0;
2695     } else { // If base is not sp, we can't guarantee the access is aligned.
2696       return false;
2697     }
2698   } else {
2699     long mask = size - 1;
2700     // Load/store pair instruction only supports element size aligned offset.
2701     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2702   }
2703 }
2704 
2705 // Checks whether current and previous loads/stores can be merged.
2706 // Returns true if it can be merged, else false.
2707 bool MacroAssembler::ldst_can_merge(Register rt,
2708                                     const Address &adr,
2709                                     size_t cur_size_in_bytes,
2710                                     bool is_store) const {
2711   address prev = pc() - NativeInstruction::instruction_size;
2712   address last = code()->last_insn();
2713 
2714   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2715     return false;
2716   }
2717 
2718   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2719     return false;
2720   }
2721 
2722   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2723   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2724 
2725   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2726   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2727 
2728   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2729     return false;
2730   }
2731 
2732   long max_offset = 63 * prev_size_in_bytes;
2733   long min_offset = -64 * prev_size_in_bytes;
2734 
2735   assert(prev_ldst->is_not_pre_post_index(), "merging a pre-index or post-index access is not supported.");
2736 
2737   // Only same base can be merged.
2738   if (adr.base() != prev_ldst->base()) {
2739     return false;
2740   }
2741 
2742   long cur_offset = adr.offset();
2743   long prev_offset = prev_ldst->offset();
2744   size_t diff = abs(cur_offset - prev_offset);
2745   if (diff != prev_size_in_bytes) {
2746     return false;
2747   }
2748 
2749   // The following cases cannot be merged:
2750   // ldr x2, [x2, #8]
2751   // ldr x3, [x2, #16]
2752   // or:
2753   // ldr x2, [x3, #8]
2754   // ldr x2, [x3, #16]
2755   // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2756   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2757     return false;
2758   }
2759 
2760   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2761   // The offset must be within the ldp/stp immediate range.
2762   if (low_offset > max_offset || low_offset < min_offset) {
2763     return false;
2764   }
2765 
2766   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2767     return true;
2768   }
2769 
2770   return false;
2771 }
2772 
2773 // Merge current load/store with previous load/store into ldp/stp.
2774 void MacroAssembler::merge_ldst(Register rt,
2775                                 const Address &adr,
2776                                 size_t cur_size_in_bytes,
2777                                 bool is_store) {
2778 
2779   assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "current and previous load/store must be mergeable.");
2780 
2781   Register rt_low, rt_high;
2782   address prev = pc() - NativeInstruction::instruction_size;
2783   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2784 
2785   long offset;
2786 
2787   if (adr.offset() < prev_ldst->offset()) {
2788     offset = adr.offset();
2789     rt_low = rt;
2790     rt_high = prev_ldst->target();
2791   } else {
2792     offset = prev_ldst->offset();
2793     rt_low = prev_ldst->target();
2794     rt_high = rt;
2795   }
2796 
2797   Address adr_p = Address(prev_ldst->base(), offset);
2798   // Overwrite the previously generated instruction by rewinding the code end.
2799   code_section()->set_end(prev);
2800 
2801   const int sz = prev_ldst->size_in_bytes();
2802   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
2803   if (!is_store) {
2804     BLOCK_COMMENT("merged ldr pair");
2805     if (sz == 8) {
2806       ldp(rt_low, rt_high, adr_p);
2807     } else {
2808       ldpw(rt_low, rt_high, adr_p);
2809     }
2810   } else {
2811     BLOCK_COMMENT("merged str pair");
2812     if (sz == 8) {
2813       stp(rt_low, rt_high, adr_p);
2814     } else {
2815       stpw(rt_low, rt_high, adr_p);
2816     }
2817   }
2818 }
2819 
2820 /**
2821  * Multiply 64 bit by 64 bit first loop.
2822  */
2823 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2824                                            Register y, Register y_idx, Register z,
2825                                            Register carry, Register product,
2826                                            Register idx, Register kdx) {
2827   //
2828   //  jlong carry, x[], y[], z[];
2829   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2830   //    huge_128 product = y[idx] * x[xstart] + carry;
2831   //    z[kdx] = (jlong)product;
2832   //    carry  = (jlong)(product >>> 64);
2833   //  }
2834   //  z[xstart] = carry;
2835   //
2836 
2837   Label L_first_loop, L_first_loop_exit;
2838   Label L_one_x, L_one_y, L_multiply;
2839 
2840   subsw(xstart, xstart, 1);
2841   br(Assembler::MI, L_one_x);
2842 
2843   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2844   ldr(x_xstart, Address(rscratch1));
2845   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2846 
2847   bind(L_first_loop);
2848   subsw(idx, idx, 1);
2849   br(Assembler::MI, L_first_loop_exit);
2850   subsw(idx, idx, 1);
2851   br(Assembler::MI, L_one_y);
2852   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2853   ldr(y_idx, Address(rscratch1));
2854   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2855   bind(L_multiply);
2856 
2857   // AArch64 has a multiply-accumulate instruction that we can't use
2858   // here because it has no way to process carries, so we have to use
2859   // separate add and adc instructions.  Bah.
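  // In C terms (illustration only):
  //   unsigned __int128 p = (unsigned __int128)x_xstart * y_idx + carry;
  //   product = (uint64_t)p;  carry = (uint64_t)(p >> 64);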
2860   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2861   mul(product, x_xstart, y_idx);
2862   adds(product, product, carry);
2863   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2864 
2865   subw(kdx, kdx, 2);
2866   ror(product, product, 32); // back to big-endian
2867   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2868 
2869   b(L_first_loop);
2870 
2871   bind(L_one_y);
2872   ldrw(y_idx, Address(y,  0));
2873   b(L_multiply);
2874 
2875   bind(L_one_x);
2876   ldrw(x_xstart, Address(x,  0));
2877   b(L_first_loop);
2878 
2879   bind(L_first_loop_exit);
2880 }
2881 
2882 /**
2883  * Multiply 128 bit by 128 bit. Unrolled inner loop.
2884  *
2885  */
2886 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2887                                              Register carry, Register carry2,
2888                                              Register idx, Register jdx,
2889                                              Register yz_idx1, Register yz_idx2,
2890                                              Register tmp, Register tmp3, Register tmp4,
2891                                              Register tmp6, Register product_hi) {
2892 
2893   //   jlong carry, x[], y[], z[];
2894   //   int kdx = ystart+1;
2895   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2896   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2897   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2898   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2899   //     carry  = (jlong)(tmp4 >>> 64);
2900   //     z[kdx+idx+1] = (jlong)tmp3;
2901   //     z[kdx+idx] = (jlong)tmp4;
2902   //   }
2903   //   idx += 2;
2904   //   if (idx > 0) {
2905   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2906   //     z[kdx+idx] = (jlong)yz_idx1;
2907   //     carry  = (jlong)(yz_idx1 >>> 64);
2908   //   }
2909   //
2910 
2911   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2912 
2913   lsrw(jdx, idx, 2);
2914 
2915   bind(L_third_loop);
2916 
2917   subsw(jdx, jdx, 1);
2918   br(Assembler::MI, L_third_loop_exit);
2919   subw(idx, idx, 4);
2920 
2921   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2922 
2923   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2924 
2925   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2926 
2927   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2928   ror(yz_idx2, yz_idx2, 32);
2929 
2930   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2931 
2932   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2933   umulh(tmp4, product_hi, yz_idx1);
2934 
2935   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2936   ror(rscratch2, rscratch2, 32);
2937 
2938   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2939   umulh(carry2, product_hi, yz_idx2);
2940 
2941   // propagate sum of both multiplications into carry:tmp4:tmp3
2942   adds(tmp3, tmp3, carry);
2943   adc(tmp4, tmp4, zr);
2944   adds(tmp3, tmp3, rscratch1);
2945   adcs(tmp4, tmp4, tmp);
2946   adc(carry, carry2, zr);
2947   adds(tmp4, tmp4, rscratch2);
2948   adc(carry, carry, zr);
2949 
2950   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2951   ror(tmp4, tmp4, 32);
2952   stp(tmp4, tmp3, Address(tmp6, 0));
2953 
2954   b(L_third_loop);
2955   bind (L_third_loop_exit);
2956 
2957   andw (idx, idx, 0x3);
2958   cbz(idx, L_post_third_loop_done);
2959 
2960   Label L_check_1;
2961   subsw(idx, idx, 2);
2962   br(Assembler::MI, L_check_1);
2963 
2964   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2965   ldr(yz_idx1, Address(rscratch1, 0));
2966   ror(yz_idx1, yz_idx1, 32);
2967   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2968   umulh(tmp4, product_hi, yz_idx1);
2969   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2970   ldr(yz_idx2, Address(rscratch1, 0));
2971   ror(yz_idx2, yz_idx2, 32);
2972 
2973   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2974 
2975   ror(tmp3, tmp3, 32);
2976   str(tmp3, Address(rscratch1, 0));
2977 
2978   bind (L_check_1);
2979 
2980   andw (idx, idx, 0x1);
2981   subsw(idx, idx, 1);
2982   br(Assembler::MI, L_post_third_loop_done);
2983   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2984   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
2985   umulh(carry2, tmp4, product_hi);
2986   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2987 
2988   add2_with_carry(carry2, tmp3, tmp4, carry);
2989 
2990   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2991   extr(carry, carry2, tmp3, 32);
2992 
2993   bind(L_post_third_loop_done);
2994 }
2995 
2996 /**
2997  * Code for BigInteger::multiplyToLen() intrinsic.
2998  *
2999  * r0: x
3000  * r1: xlen
3001  * r2: y
3002  * r3: ylen
3003  * r4:  z
3004  * r5: zlen
3005  * r10: tmp1
3006  * r11: tmp2
3007  * r12: tmp3
3008  * r13: tmp4
3009  * r14: tmp5
3010  * r15: tmp6
3011  * r16: tmp7
3012  *
3013  */
3014 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3015                                      Register z, Register zlen,
3016                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3017                                      Register tmp5, Register tmp6, Register product_hi) {
3018 
3019   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3020 
3021   const Register idx = tmp1;
3022   const Register kdx = tmp2;
3023   const Register xstart = tmp3;
3024 
3025   const Register y_idx = tmp4;
3026   const Register carry = tmp5;
3027   const Register product  = xlen;
3028   const Register x_xstart = zlen;  // reuse register
3029 
3030   // First Loop.
3031   //
3032   //  final static long LONG_MASK = 0xffffffffL;
3033   //  int xstart = xlen - 1;
3034   //  int ystart = ylen - 1;
3035   //  long carry = 0;
3036   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3037   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3038   //    z[kdx] = (int)product;
3039   //    carry = product >>> 32;
3040   //  }
3041   //  z[xstart] = (int)carry;
3042   //
3043 
3044   movw(idx, ylen);      // idx = ylen;
3045   movw(kdx, zlen);      // kdx = xlen+ylen;
3046   mov(carry, zr);       // carry = 0;
3047 
3048   Label L_done;
3049 
3050   movw(xstart, xlen);
3051   subsw(xstart, xstart, 1);
3052   br(Assembler::MI, L_done);
3053 
3054   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3055 
3056   Label L_second_loop;
3057   cbzw(kdx, L_second_loop);
3058 
3059   Label L_carry;
3060   subw(kdx, kdx, 1);
3061   cbzw(kdx, L_carry);
3062 
3063   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3064   lsr(carry, carry, 32);
3065   subw(kdx, kdx, 1);
3066 
3067   bind(L_carry);
3068   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3069 
3070   // Second and third (nested) loops.
3071   //
3072   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3073   //   carry = 0;
3074   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3075   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3076   //                    (z[k] & LONG_MASK) + carry;
3077   //     z[k] = (int)product;
3078   //     carry = product >>> 32;
3079   //   }
3080   //   z[i] = (int)carry;
3081   // }
3082   //
3083   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3084 
3085   const Register jdx = tmp1;
3086 
3087   bind(L_second_loop);
3088   mov(carry, zr);                // carry = 0;
3089   movw(jdx, ylen);               // j = ystart+1
3090 
3091   subsw(xstart, xstart, 1);      // i = xstart-1;
3092   br(Assembler::MI, L_done);
3093 
3094   str(z, Address(pre(sp, -4 * wordSize)));
3095 
3096   Label L_last_x;
3097   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3098   subsw(xstart, xstart, 1);       // i = xstart-1;
3099   br(Assembler::MI, L_last_x);
3100 
3101   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3102   ldr(product_hi, Address(rscratch1));
3103   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3104 
3105   Label L_third_loop_prologue;
3106   bind(L_third_loop_prologue);
3107 
3108   str(ylen, Address(sp, wordSize));
3109   stp(x, xstart, Address(sp, 2 * wordSize));
3110   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3111                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3112   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3113   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3114 
3115   addw(tmp3, xlen, 1);
3116   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3117   subsw(tmp3, tmp3, 1);
3118   br(Assembler::MI, L_done);
3119 
3120   lsr(carry, carry, 32);
3121   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3122   b(L_second_loop);
3123 
3124   // The following infrequent code is moved outside the loops.
3125   bind(L_last_x);
3126   ldrw(product_hi, Address(x,  0));
3127   b(L_third_loop_prologue);
3128 
3129   bind(L_done);
3130 }
3131 
3132 // Code for BigInteger::mulAdd intrinsic
3133 // out     = r0
3134 // in      = r1
3135 // offset  = r2  (already out.length-offset)
3136 // len     = r3
3137 // k       = r4
3138 //
3139 // pseudo code from java implementation:
3140 // carry = 0;
3141 // offset = out.length-offset - 1;
3142 // for (int j=len-1; j >= 0; j--) {
3143 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3144 //     out[offset--] = (int)product;
3145 //     carry = product >>> 32;
3146 // }
3147 // return (int)carry;
3148 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3149       Register len, Register k) {
3150     Label LOOP, END;
3151     // pre-loop
3152     cmp(len, zr); // cmp, not cbz/cbnz: the condition is used twice => fewer branches
3153     csel(out, zr, out, Assembler::EQ);
3154     br(Assembler::EQ, END);
3155     add(in, in, len, LSL, 2); // in[j+1] address
3156     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3157     mov(out, zr); // from here on, out holds the running carry
3158     BIND(LOOP);
3159     ldrw(rscratch1, Address(pre(in, -4)));
3160     madd(rscratch1, rscratch1, k, out);
3161     ldrw(rscratch2, Address(pre(offset, -4)));
3162     add(rscratch1, rscratch1, rscratch2);
3163     strw(rscratch1, Address(offset));
3164     lsr(out, rscratch1, 32);
3165     subs(len, len, 1);
3166     br(Assembler::NE, LOOP);
3167     BIND(END);
3168 }
3169 
3170 /**
3171  * Emits code to update CRC-32 with a byte value according to constants in table
3172  *
3173  * @param [in,out]crc   Register containing the crc.
3174  * @param [in]val       Register containing the byte to fold into the CRC.
3175  * @param [in]table     Register containing the table of crc constants.
3176  *
3177  * uint32_t crc;
3178  * val = crc_table[(val ^ crc) & 0xFF];
3179  * crc = val ^ (crc >> 8);
3180  *
3181  */
3182 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3183   eor(val, val, crc);
3184   andr(val, val, 0xff);
3185   ldrw(val, Address(table, val, Address::lsl(2)));
3186   eor(crc, val, crc, Assembler::LSR, 8);
3187 }
3188 
3189 /**
3190  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3191  *
3192  * @param [in,out]crc   Register containing the crc.
3193  * @param [in]v         Register containing the 32-bit to fold into the CRC.
3194  * @param [in]table0    Register containing table 0 of crc constants.
3195  * @param [in]table1    Register containing table 1 of crc constants.
3196  * @param [in]table2    Register containing table 2 of crc constants.
3197  * @param [in]table3    Register containing table 3 of crc constants.
3198  *
3199  * uint32_t crc;
3200  *   v = crc ^ v
3201  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3202  *
3203  */
3204 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3205         Register table0, Register table1, Register table2, Register table3,
3206         bool upper) {
3207   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3208   uxtb(tmp, v);
3209   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3210   ubfx(tmp, v, 8, 8);
3211   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3212   eor(crc, crc, tmp);
3213   ubfx(tmp, v, 16, 8);
3214   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3215   eor(crc, crc, tmp);
3216   ubfx(tmp, v, 24, 8);
3217   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3218   eor(crc, crc, tmp);
3219 }
3220 
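// Hardware-accelerated CRC-32 using the ARMv8 CRC32 instructions.
// Functionally (illustration only) this is equivalent to
//   uint32_t crc32(uint32_t crc, const uint8_t* buf, size_t len) {
//     crc = ~crc;
//     while (len--) crc = __crc32b(crc, *buf++);   // ACLE intrinsic
//     return ~crc;
//   }
// but unrolled to consume 64 bytes per iteration with crc32x, with the
// remaining tail handled 32, 4 and 1 byte(s) at a time.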
3221 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3222         Register len, Register tmp0, Register tmp1, Register tmp2,
3223         Register tmp3) {
3224     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3225     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3226 
3227     mvnw(crc, crc);
3228 
3229     subs(len, len, 128);
3230     br(Assembler::GE, CRC_by64_pre);
3231   BIND(CRC_less64);
3232     adds(len, len, 128-32);
3233     br(Assembler::GE, CRC_by32_loop);
3234   BIND(CRC_less32);
3235     adds(len, len, 32-4);
3236     br(Assembler::GE, CRC_by4_loop);
3237     adds(len, len, 4);
3238     br(Assembler::GT, CRC_by1_loop);
3239     b(L_exit);
3240 
3241   BIND(CRC_by32_loop);
3242     ldp(tmp0, tmp1, Address(post(buf, 16)));
3243     subs(len, len, 32);
3244     crc32x(crc, crc, tmp0);
3245     ldr(tmp2, Address(post(buf, 8)));
3246     crc32x(crc, crc, tmp1);
3247     ldr(tmp3, Address(post(buf, 8)));
3248     crc32x(crc, crc, tmp2);
3249     crc32x(crc, crc, tmp3);
3250     br(Assembler::GE, CRC_by32_loop);
3251     cmn(len, 32);
3252     br(Assembler::NE, CRC_less32);
3253     b(L_exit);
3254 
3255   BIND(CRC_by4_loop);
3256     ldrw(tmp0, Address(post(buf, 4)));
3257     subs(len, len, 4);
3258     crc32w(crc, crc, tmp0);
3259     br(Assembler::GE, CRC_by4_loop);
3260     adds(len, len, 4);
3261     br(Assembler::LE, L_exit);
3262   BIND(CRC_by1_loop);
3263     ldrb(tmp0, Address(post(buf, 1)));
3264     subs(len, len, 1);
3265     crc32b(crc, crc, tmp0);
3266     br(Assembler::GT, CRC_by1_loop);
3267     b(L_exit);
3268 
3269   BIND(CRC_by64_pre);
3270     sub(buf, buf, 8);
3271     ldp(tmp0, tmp1, Address(buf, 8));
3272     crc32x(crc, crc, tmp0);
3273     ldr(tmp2, Address(buf, 24));
3274     crc32x(crc, crc, tmp1);
3275     ldr(tmp3, Address(buf, 32));
3276     crc32x(crc, crc, tmp2);
3277     ldr(tmp0, Address(buf, 40));
3278     crc32x(crc, crc, tmp3);
3279     ldr(tmp1, Address(buf, 48));
3280     crc32x(crc, crc, tmp0);
3281     ldr(tmp2, Address(buf, 56));
3282     crc32x(crc, crc, tmp1);
3283     ldr(tmp3, Address(pre(buf, 64)));
3284 
3285     b(CRC_by64_loop);
3286 
3287     align(CodeEntryAlignment);
3288   BIND(CRC_by64_loop);
3289     subs(len, len, 64);
3290     crc32x(crc, crc, tmp2);
3291     ldr(tmp0, Address(buf, 8));
3292     crc32x(crc, crc, tmp3);
3293     ldr(tmp1, Address(buf, 16));
3294     crc32x(crc, crc, tmp0);
3295     ldr(tmp2, Address(buf, 24));
3296     crc32x(crc, crc, tmp1);
3297     ldr(tmp3, Address(buf, 32));
3298     crc32x(crc, crc, tmp2);
3299     ldr(tmp0, Address(buf, 40));
3300     crc32x(crc, crc, tmp3);
3301     ldr(tmp1, Address(buf, 48));
3302     crc32x(crc, crc, tmp0);
3303     ldr(tmp2, Address(buf, 56));
3304     crc32x(crc, crc, tmp1);
3305     ldr(tmp3, Address(pre(buf, 64)));
3306     br(Assembler::GE, CRC_by64_loop);
3307 
3308     // post-loop
3309     crc32x(crc, crc, tmp2);
3310     crc32x(crc, crc, tmp3);
3311 
3312     sub(len, len, 64);
3313     add(buf, buf, 8);
3314     cmn(len, 128);
3315     br(Assembler::NE, CRC_less64);
3316   BIND(L_exit);
3317     mvnw(crc, crc);
3318 }
3319 
3320 /**
3321  * @param crc   register containing existing CRC (32-bit)
3322  * @param buf   register pointing to input byte buffer (byte*)
3323  * @param len   register containing number of bytes
3324  * @param table0..table3 registers that will contain the addresses of the CRC tables
3325  * @param tmp, tmp2, tmp3  scratch registers
3326  */
3327 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3328         Register table0, Register table1, Register table2, Register table3,
3329         Register tmp, Register tmp2, Register tmp3) {
3330   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3331   unsigned long offset;
3332 
3333   if (UseCRC32) {
3334       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3335       return;
3336   }
3337 
3338     mvnw(crc, crc);
3339 
3340     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3341     if (offset) add(table0, table0, offset);
3342     add(table1, table0, 1*256*sizeof(juint));
3343     add(table2, table0, 2*256*sizeof(juint));
3344     add(table3, table0, 3*256*sizeof(juint));
3345 
3346   if (UseNeon) {
3347       cmp(len, (u1)64);
3348       br(Assembler::LT, L_by16);
3349       eor(v16, T16B, v16, v16);
3350 
3351     Label L_fold;
3352 
3353       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3354 
3355       ld1(v0, v1, T2D, post(buf, 32));
3356       ld1r(v4, T2D, post(tmp, 8));
3357       ld1r(v5, T2D, post(tmp, 8));
3358       ld1r(v6, T2D, post(tmp, 8));
3359       ld1r(v7, T2D, post(tmp, 8));
3360       mov(v16, T4S, 0, crc);
3361 
3362       eor(v0, T16B, v0, v16);
3363       sub(len, len, 64);
3364 
3365     BIND(L_fold);
3366       pmull(v22, T8H, v0, v5, T8B);
3367       pmull(v20, T8H, v0, v7, T8B);
3368       pmull(v23, T8H, v0, v4, T8B);
3369       pmull(v21, T8H, v0, v6, T8B);
3370 
3371       pmull2(v18, T8H, v0, v5, T16B);
3372       pmull2(v16, T8H, v0, v7, T16B);
3373       pmull2(v19, T8H, v0, v4, T16B);
3374       pmull2(v17, T8H, v0, v6, T16B);
3375 
3376       uzp1(v24, T8H, v20, v22);
3377       uzp2(v25, T8H, v20, v22);
3378       eor(v20, T16B, v24, v25);
3379 
3380       uzp1(v26, T8H, v16, v18);
3381       uzp2(v27, T8H, v16, v18);
3382       eor(v16, T16B, v26, v27);
3383 
3384       ushll2(v22, T4S, v20, T8H, 8);
3385       ushll(v20, T4S, v20, T4H, 8);
3386 
3387       ushll2(v18, T4S, v16, T8H, 8);
3388       ushll(v16, T4S, v16, T4H, 8);
3389 
3390       eor(v22, T16B, v23, v22);
3391       eor(v18, T16B, v19, v18);
3392       eor(v20, T16B, v21, v20);
3393       eor(v16, T16B, v17, v16);
3394 
3395       uzp1(v17, T2D, v16, v20);
3396       uzp2(v21, T2D, v16, v20);
3397       eor(v17, T16B, v17, v21);
3398 
3399       ushll2(v20, T2D, v17, T4S, 16);
3400       ushll(v16, T2D, v17, T2S, 16);
3401 
3402       eor(v20, T16B, v20, v22);
3403       eor(v16, T16B, v16, v18);
3404 
3405       uzp1(v17, T2D, v20, v16);
3406       uzp2(v21, T2D, v20, v16);
3407       eor(v28, T16B, v17, v21);
3408 
3409       pmull(v22, T8H, v1, v5, T8B);
3410       pmull(v20, T8H, v1, v7, T8B);
3411       pmull(v23, T8H, v1, v4, T8B);
3412       pmull(v21, T8H, v1, v6, T8B);
3413 
3414       pmull2(v18, T8H, v1, v5, T16B);
3415       pmull2(v16, T8H, v1, v7, T16B);
3416       pmull2(v19, T8H, v1, v4, T16B);
3417       pmull2(v17, T8H, v1, v6, T16B);
3418 
3419       ld1(v0, v1, T2D, post(buf, 32));
3420 
3421       uzp1(v24, T8H, v20, v22);
3422       uzp2(v25, T8H, v20, v22);
3423       eor(v20, T16B, v24, v25);
3424 
3425       uzp1(v26, T8H, v16, v18);
3426       uzp2(v27, T8H, v16, v18);
3427       eor(v16, T16B, v26, v27);
3428 
3429       ushll2(v22, T4S, v20, T8H, 8);
3430       ushll(v20, T4S, v20, T4H, 8);
3431 
3432       ushll2(v18, T4S, v16, T8H, 8);
3433       ushll(v16, T4S, v16, T4H, 8);
3434 
3435       eor(v22, T16B, v23, v22);
3436       eor(v18, T16B, v19, v18);
3437       eor(v20, T16B, v21, v20);
3438       eor(v16, T16B, v17, v16);
3439 
3440       uzp1(v17, T2D, v16, v20);
3441       uzp2(v21, T2D, v16, v20);
3442       eor(v16, T16B, v17, v21);
3443 
3444       ushll2(v20, T2D, v16, T4S, 16);
3445       ushll(v16, T2D, v16, T2S, 16);
3446 
3447       eor(v20, T16B, v22, v20);
3448       eor(v16, T16B, v16, v18);
3449 
3450       uzp1(v17, T2D, v20, v16);
3451       uzp2(v21, T2D, v20, v16);
3452       eor(v20, T16B, v17, v21);
3453 
3454       shl(v16, T2D, v28, 1);
3455       shl(v17, T2D, v20, 1);
3456 
3457       eor(v0, T16B, v0, v16);
3458       eor(v1, T16B, v1, v17);
3459 
3460       subs(len, len, 32);
3461       br(Assembler::GE, L_fold);
3462 
3463       mov(crc, 0);
3464       mov(tmp, v0, T1D, 0);
3465       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3466       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3467       mov(tmp, v0, T1D, 1);
3468       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3469       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3470       mov(tmp, v1, T1D, 0);
3471       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3472       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3473       mov(tmp, v1, T1D, 1);
3474       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3475       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3476 
3477       add(len, len, 32);
3478   }
3479 
3480   BIND(L_by16);
3481     subs(len, len, 16);
3482     br(Assembler::GE, L_by16_loop);
3483     adds(len, len, 16-4);
3484     br(Assembler::GE, L_by4_loop);
3485     adds(len, len, 4);
3486     br(Assembler::GT, L_by1_loop);
3487     b(L_exit);
3488 
3489   BIND(L_by4_loop);
3490     ldrw(tmp, Address(post(buf, 4)));
3491     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3492     subs(len, len, 4);
3493     br(Assembler::GE, L_by4_loop);
3494     adds(len, len, 4);
3495     br(Assembler::LE, L_exit);
3496   BIND(L_by1_loop);
3497     subs(len, len, 1);
3498     ldrb(tmp, Address(post(buf, 1)));
3499     update_byte_crc32(crc, tmp, table0);
3500     br(Assembler::GT, L_by1_loop);
3501     b(L_exit);
3502 
3503     align(CodeEntryAlignment);
3504   BIND(L_by16_loop);
3505     subs(len, len, 16);
3506     ldp(tmp, tmp3, Address(post(buf, 16)));
3507     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3508     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3509     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3510     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3511     br(Assembler::GE, L_by16_loop);
3512     adds(len, len, 16-4);
3513     br(Assembler::GE, L_by4_loop);
3514     adds(len, len, 4);
3515     br(Assembler::GT, L_by1_loop);
3516   BIND(L_exit);
3517     mvnw(crc, crc);
3518 }
3519 
3520 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3521         Register len, Register tmp0, Register tmp1, Register tmp2,
3522         Register tmp3) {
3523     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3524     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3525 
3526     subs(len, len, 128);
3527     br(Assembler::GE, CRC_by64_pre);
3528   BIND(CRC_less64);
3529     adds(len, len, 128-32);
3530     br(Assembler::GE, CRC_by32_loop);
3531   BIND(CRC_less32);
3532     adds(len, len, 32-4);
3533     br(Assembler::GE, CRC_by4_loop);
3534     adds(len, len, 4);
3535     br(Assembler::GT, CRC_by1_loop);
3536     b(L_exit);
3537 
3538   BIND(CRC_by32_loop);
3539     ldp(tmp0, tmp1, Address(post(buf, 16)));
3540     subs(len, len, 32);
3541     crc32cx(crc, crc, tmp0);
3542     ldr(tmp2, Address(post(buf, 8)));
3543     crc32cx(crc, crc, tmp1);
3544     ldr(tmp3, Address(post(buf, 8)));
3545     crc32cx(crc, crc, tmp2);
3546     crc32cx(crc, crc, tmp3);
3547     br(Assembler::GE, CRC_by32_loop);
3548     cmn(len, 32);
3549     br(Assembler::NE, CRC_less32);
3550     b(L_exit);
3551 
3552   BIND(CRC_by4_loop);
3553     ldrw(tmp0, Address(post(buf, 4)));
3554     subs(len, len, 4);
3555     crc32cw(crc, crc, tmp0);
3556     br(Assembler::GE, CRC_by4_loop);
3557     adds(len, len, 4);
3558     br(Assembler::LE, L_exit);
3559   BIND(CRC_by1_loop);
3560     ldrb(tmp0, Address(post(buf, 1)));
3561     subs(len, len, 1);
3562     crc32cb(crc, crc, tmp0);
3563     br(Assembler::GT, CRC_by1_loop);
3564     b(L_exit);
3565 
3566   BIND(CRC_by64_pre);
3567     sub(buf, buf, 8);
3568     ldp(tmp0, tmp1, Address(buf, 8));
3569     crc32cx(crc, crc, tmp0);
3570     ldr(tmp2, Address(buf, 24));
3571     crc32cx(crc, crc, tmp1);
3572     ldr(tmp3, Address(buf, 32));
3573     crc32cx(crc, crc, tmp2);
3574     ldr(tmp0, Address(buf, 40));
3575     crc32cx(crc, crc, tmp3);
3576     ldr(tmp1, Address(buf, 48));
3577     crc32cx(crc, crc, tmp0);
3578     ldr(tmp2, Address(buf, 56));
3579     crc32cx(crc, crc, tmp1);
3580     ldr(tmp3, Address(pre(buf, 64)));
3581 
3582     b(CRC_by64_loop);
3583 
3584     align(CodeEntryAlignment);
3585   BIND(CRC_by64_loop);
3586     subs(len, len, 64);
3587     crc32cx(crc, crc, tmp2);
3588     ldr(tmp0, Address(buf, 8));
3589     crc32cx(crc, crc, tmp3);
3590     ldr(tmp1, Address(buf, 16));
3591     crc32cx(crc, crc, tmp0);
3592     ldr(tmp2, Address(buf, 24));
3593     crc32cx(crc, crc, tmp1);
3594     ldr(tmp3, Address(buf, 32));
3595     crc32cx(crc, crc, tmp2);
3596     ldr(tmp0, Address(buf, 40));
3597     crc32cx(crc, crc, tmp3);
3598     ldr(tmp1, Address(buf, 48));
3599     crc32cx(crc, crc, tmp0);
3600     ldr(tmp2, Address(buf, 56));
3601     crc32cx(crc, crc, tmp1);
3602     ldr(tmp3, Address(pre(buf, 64)));
3603     br(Assembler::GE, CRC_by64_loop);
3604 
3605     // post-loop
3606     crc32cx(crc, crc, tmp2);
3607     crc32cx(crc, crc, tmp3);
3608 
3609     sub(len, len, 64);
3610     add(buf, buf, 8);
3611     cmn(len, 128);
3612     br(Assembler::NE, CRC_less64);
3613   BIND(L_exit);
3614 }
3615 
3616 /**
3617  * @param crc   register containing existing CRC (32-bit)
3618  * @param buf   register pointing to input byte buffer (byte*)
3619  * @param len   register containing number of bytes
3620  * @param table0..table3 registers that will contain the addresses of the CRC tables
3621  * @param tmp, tmp2, tmp3  scratch registers
3622  */
3623 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3624         Register table0, Register table1, Register table2, Register table3,
3625         Register tmp, Register tmp2, Register tmp3) {
3626   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3627 }
3628 
3629 
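// SkipIfEqual loads the byte at flag_addr and branches over the code emitted
// inside its scope when that byte is zero.  A typical (hypothetical) use:
//   {
//     SkipIfEqual skip(masm, &SomeBoolFlag, false);
//     // code emitted here runs only when SomeBoolFlag is true
//   }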
3630 SkipIfEqual::SkipIfEqual(
3631     MacroAssembler* masm, const bool* flag_addr, bool value) {
3632   _masm = masm;
3633   unsigned long offset;
3634   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3635   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3636   _masm->cbzw(rscratch1, _label);
3637 }
3638 
3639 SkipIfEqual::~SkipIfEqual() {
3640   _masm->bind(_label);
3641 }
3642 
3643 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3644   Address adr;
3645   switch(dst.getMode()) {
3646   case Address::base_plus_offset:
3647     // This is the expected mode, although we allow all the other
3648     // forms below.
3649     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3650     break;
3651   default:
3652     lea(rscratch2, dst);
3653     adr = Address(rscratch2);
3654     break;
3655   }
3656   ldr(rscratch1, adr);
3657   add(rscratch1, rscratch1, src);
3658   str(rscratch1, adr);
3659 }
3660 
3661 void MacroAssembler::cmpptr(Register src1, Address src2) {
3662   unsigned long offset;
3663   adrp(rscratch1, src2, offset);
3664   ldr(rscratch1, Address(rscratch1, offset));
3665   cmp(src1, rscratch1);
3666 }
3667 
3668 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3669   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3670   bs->obj_equals(this, obj1, obj2);
3671 }
3672 
3673 void MacroAssembler::load_klass(Register dst, Register src) {
3674   if (UseCompressedClassPointers) {
3675     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3676     decode_klass_not_null(dst);
3677   } else {
3678     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3679   }
3680 }
3681 
3682 // ((OopHandle)result).resolve();
3683 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3684   // OopHandle::resolve is an indirection.
3685   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3686 }
3687 
3688 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3689   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3690   ldr(dst, Address(rmethod, Method::const_offset()));
3691   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3692   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3693   ldr(dst, Address(dst, mirror_offset));
3694   resolve_oop_handle(dst, tmp);
3695 }
3696 
3697 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3698   if (UseCompressedClassPointers) {
3699     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3700     if (Universe::narrow_klass_base() == NULL) {
3701       cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
3702       return;
3703     } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3704                && Universe::narrow_klass_shift() == 0) {
3705       // Only the bottom 32 bits matter
3706       cmpw(trial_klass, tmp);
3707       return;
3708     }
3709     decode_klass_not_null(tmp);
3710   } else {
3711     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3712   }
3713   cmp(trial_klass, tmp);
3714 }
3715 
3716 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3717   load_klass(dst, src);
3718   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3719 }
3720 
3721 void MacroAssembler::store_klass(Register dst, Register src) {
3722   // FIXME: Should this be a store release?  Concurrent GCs assume the
3723   // klass length is valid if the klass field is not null.
3724   if (UseCompressedClassPointers) {
3725     encode_klass_not_null(src);
3726     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3727   } else {
3728     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3729   }
3730 }
3731 
3732 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3733   if (UseCompressedClassPointers) {
3734     // Store to klass gap in destination
3735     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3736   }
3737 }
3738 
3739 // Algorithm must match CompressedOops::encode.
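// A rough C sketch of the encoding performed below (illustration only; the
// authoritative definition is CompressedOops::encode):
//   narrowOop encode(oop o) {
//     if (base == NULL) return (narrowOop)((uintptr_t)o >> shift);
//     return o == NULL ? 0 : (narrowOop)(((uintptr_t)o - (uintptr_t)base) >> shift);
//   }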
3740 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3741 #ifdef ASSERT
3742   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3743 #endif
3744   verify_oop(s, "broken oop in encode_heap_oop");
3745   if (Universe::narrow_oop_base() == NULL) {
3746     if (Universe::narrow_oop_shift() != 0) {
3747       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3748       lsr(d, s, LogMinObjAlignmentInBytes);
3749     } else {
3750       mov(d, s);
3751     }
3752   } else {
3753     subs(d, s, rheapbase);
3754     csel(d, d, zr, Assembler::HS);
3755     lsr(d, d, LogMinObjAlignmentInBytes);
3756 
3757     /*  Old algorithm: is this any worse?
3758     Label nonnull;
3759     cbnz(r, nonnull);
3760     sub(r, r, rheapbase);
3761     bind(nonnull);
3762     lsr(r, r, LogMinObjAlignmentInBytes);
3763     */
3764   }
3765 }
3766 
3767 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3768 #ifdef ASSERT
3769   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3770   if (CheckCompressedOops) {
3771     Label ok;
3772     cbnz(r, ok);
3773     stop("null oop passed to encode_heap_oop_not_null");
3774     bind(ok);
3775   }
3776 #endif
3777   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3778   if (Universe::narrow_oop_base() != NULL) {
3779     sub(r, r, rheapbase);
3780   }
3781   if (Universe::narrow_oop_shift() != 0) {
3782     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3783     lsr(r, r, LogMinObjAlignmentInBytes);
3784   }
3785 }
3786 
3787 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3788 #ifdef ASSERT
3789   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3790   if (CheckCompressedOops) {
3791     Label ok;
3792     cbnz(src, ok);
3793     stop("null oop passed to encode_heap_oop_not_null2");
3794     bind(ok);
3795   }
3796 #endif
3797   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3798 
3799   Register data = src;
3800   if (Universe::narrow_oop_base() != NULL) {
3801     sub(dst, src, rheapbase);
3802     data = dst;
3803   }
3804   if (Universe::narrow_oop_shift() != 0) {
3805     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3806     lsr(dst, data, LogMinObjAlignmentInBytes);
3807     data = dst;
3808   }
3809   if (data == src)
3810     mov(dst, src);
3811 }
3812 
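// A rough C sketch of the decoding performed below (illustration only; the
// authoritative definition is CompressedOops::decode):
//   oop decode(narrowOop n) {
//     if (base == NULL) return (oop)((uintptr_t)n << shift);
//     return n == 0 ? NULL : (oop)((uintptr_t)base + ((uintptr_t)n << shift));
//   }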
3813 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3814 #ifdef ASSERT
3815   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3816 #endif
3817   if (Universe::narrow_oop_base() == NULL) {
3818     if (Universe::narrow_oop_shift() != 0 || d != s) {
3819       lsl(d, s, Universe::narrow_oop_shift());
3820     }
3821   } else {
3822     Label done;
3823     if (d != s)
3824       mov(d, s);
3825     cbz(s, done);
3826     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3827     bind(done);
3828   }
3829   verify_oop(d, "broken oop in decode_heap_oop");
3830 }
3831 
3832 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3833   assert (UseCompressedOops, "should only be used for compressed headers");
3834   assert (Universe::heap() != NULL, "java heap should be initialized");
3835   // Cannot assert, unverified entry point counts instructions (see .ad file)
3836   // vtableStubs also counts instructions in pd_code_size_limit.
3837   // Also do not verify_oop as this is called by verify_oop.
3838   if (Universe::narrow_oop_shift() != 0) {
3839     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3840     if (Universe::narrow_oop_base() != NULL) {
3841       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3842     } else {
3843       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3844     }
3845   } else {
3846     assert (Universe::narrow_oop_base() == NULL, "sanity");
3847   }
3848 }
3849 
3850 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3851   assert (UseCompressedOops, "should only be used for compressed headers");
3852   assert (Universe::heap() != NULL, "java heap should be initialized");
3853   // Cannot assert, unverified entry point counts instructions (see .ad file)
3854   // vtableStubs also counts instructions in pd_code_size_limit.
3855   // Also do not verify_oop as this is called by verify_oop.
3856   if (Universe::narrow_oop_shift() != 0) {
3857     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3858     if (Universe::narrow_oop_base() != NULL) {
3859       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3860     } else {
3861       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3862     }
3863   } else {
3864     assert (Universe::narrow_oop_base() == NULL, "sanity");
3865     if (dst != src) {
3866       mov(dst, src);
3867     }
3868   }
3869 }
3870 
3871 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3872   if (Universe::narrow_klass_base() == NULL) {
3873     if (Universe::narrow_klass_shift() != 0) {
3874       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3875       lsr(dst, src, LogKlassAlignmentInBytes);
3876     } else {
3877       if (dst != src) mov(dst, src);
3878     }
3879     return;
3880   }
3881 
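  // When the compressed class base and the (possibly shifted) class pointer
  // occupy disjoint bit ranges, XOR-ing with the base is equivalent to adding
  // or subtracting it, so a single EOR works for both encode and decode.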
3882   if (use_XOR_for_compressed_class_base) {
3883     if (Universe::narrow_klass_shift() != 0) {
3884       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3885       lsr(dst, dst, LogKlassAlignmentInBytes);
3886     } else {
3887       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3888     }
3889     return;
3890   }
3891 
3892   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3893       && Universe::narrow_klass_shift() == 0) {
3894     movw(dst, src);
3895     return;
3896   }
3897 
3898 #ifdef ASSERT
3899   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3900 #endif
3901 
3902   Register rbase = dst;
3903   if (dst == src) rbase = rheapbase;
3904   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3905   sub(dst, src, rbase);
3906   if (Universe::narrow_klass_shift() != 0) {
3907     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3908     lsr(dst, dst, LogKlassAlignmentInBytes);
3909   }
3910   if (dst == src) reinit_heapbase();
3911 }
3912 
3913 void MacroAssembler::encode_klass_not_null(Register r) {
3914   encode_klass_not_null(r, r);
3915 }
3916 
3917 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3918   Register rbase = dst;
3919   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3920 
3921   if (Universe::narrow_klass_base() == NULL) {
3922     if (Universe::narrow_klass_shift() != 0) {
3923       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3924       lsl(dst, src, LogKlassAlignmentInBytes);
3925     } else {
3926       if (dst != src) mov(dst, src);
3927     }
3928     return;
3929   }
3930 
3931   if (use_XOR_for_compressed_class_base) {
3932     if (Universe::narrow_klass_shift() != 0) {
3933       lsl(dst, src, LogKlassAlignmentInBytes);
3934       eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
3935     } else {
3936       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3937     }
3938     return;
3939   }
3940 
3941   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3942       && Universe::narrow_klass_shift() == 0) {
3943     if (dst != src)
3944       movw(dst, src);
3945     movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
3946     return;
3947   }
3948 
3949   // Cannot assert, unverified entry point counts instructions (see .ad file)
3950   // vtableStubs also counts instructions in pd_code_size_limit.
3951   // Also do not verify_oop as this is called by verify_oop.
3952   if (dst == src) rbase = rheapbase;
3953   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3954   if (Universe::narrow_klass_shift() != 0) {
3955     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3956     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
3957   } else {
3958     add(dst, rbase, src);
3959   }
3960   if (dst == src) reinit_heapbase();
3961 }
3962 
3963 void  MacroAssembler::decode_klass_not_null(Register r) {
3964   decode_klass_not_null(r, r);
3965 }
3966 
3967 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3968 #ifdef ASSERT
3969   {
3970     ThreadInVMfromUnknown tiv;
3971     assert (UseCompressedOops, "should only be used for compressed oops");
3972     assert (Universe::heap() != NULL, "java heap should be initialized");
3973     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3974     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3975   }
3976 #endif
3977   int oop_index = oop_recorder()->find_index(obj);
3978   InstructionMark im(this);
3979   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3980   code_section()->relocate(inst_mark(), rspec);
3981   movz(dst, 0xDEAD, 16);
3982   movk(dst, 0xBEEF);
3983 }
3984 
3985 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3986   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3987   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3988   int index = oop_recorder()->find_index(k);
3989   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
3990 
3991   InstructionMark im(this);
3992   RelocationHolder rspec = metadata_Relocation::spec(index);
3993   code_section()->relocate(inst_mark(), rspec);
3994   narrowKlass nk = Klass::encode_klass(k);
3995   movz(dst, (nk >> 16), 16);
3996   movk(dst, nk & 0xffff);
3997 }
3998 
3999 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4000                                     Register dst, Address src,
4001                                     Register tmp1, Register thread_tmp) {
4002   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4003   decorators = AccessInternal::decorator_fixup(decorators);
4004   bool as_raw = (decorators & AS_RAW) != 0;
4005   if (as_raw) {
4006     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4007   } else {
4008     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4009   }
4010 }
4011 
4012 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4013                                      Address dst, Register src,
4014                                      Register tmp1, Register thread_tmp) {
4015   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4016   decorators = AccessInternal::decorator_fixup(decorators);
4017   bool as_raw = (decorators & AS_RAW) != 0;
4018   if (as_raw) {
4019     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4020   } else {
4021     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4022   }
4023 }
4024 
4025 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
4026   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
4027   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
4028     decorators |= ACCESS_READ | ACCESS_WRITE;
4029   }
4030   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4031   return bs->resolve(this, decorators, obj);
4032 }
4033 
4034 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4035                                    Register thread_tmp, DecoratorSet decorators) {
4036   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4037 }
4038 
4039 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4040                                             Register thread_tmp, DecoratorSet decorators) {
4041   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4042 }
4043 
4044 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4045                                     Register thread_tmp, DecoratorSet decorators) {
4046   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4047 }
4048 
4049 // Used for storing NULLs.
4050 void MacroAssembler::store_heap_oop_null(Address dst) {
4051   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4052 }
4053 
4054 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4055   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4056   int index = oop_recorder()->allocate_metadata_index(obj);
4057   RelocationHolder rspec = metadata_Relocation::spec(index);
4058   return Address((address)obj, rspec);
4059 }
4060 
4061 // Move an oop into a register.  immediate is true if we want
4062 // immediate instructions, i.e. we are not going to patch this
4063 // instruction while the code is being executed by another thread.  In
4064 // that case we can use move immediates rather than the constant pool.
4065 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4066   int oop_index;
4067   if (obj == NULL) {
4068     oop_index = oop_recorder()->allocate_oop_index(obj);
4069   } else {
4070 #ifdef ASSERT
4071     {
4072       ThreadInVMfromUnknown tiv;
4073       assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
4074     }
4075 #endif
4076     oop_index = oop_recorder()->find_index(obj);
4077   }
4078   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4079   if (! immediate) {
4080     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4081     ldr_constant(dst, Address(dummy, rspec));
4082   } else
4083     mov(dst, Address((address)obj, rspec));
4084 }
4085 
4086 // Move a metadata address into a register.
4087 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4088   int oop_index;
4089   if (obj == NULL) {
4090     oop_index = oop_recorder()->allocate_metadata_index(obj);
4091   } else {
4092     oop_index = oop_recorder()->find_index(obj);
4093   }
4094   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4095   mov(dst, Address((address)obj, rspec));
4096 }
4097 
4098 Address MacroAssembler::constant_oop_address(jobject obj) {
4099 #ifdef ASSERT
4100   {
4101     ThreadInVMfromUnknown tiv;
4102     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4103     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
4104   }
4105 #endif
4106   int oop_index = oop_recorder()->find_index(obj);
4107   return Address((address)obj, oop_Relocation::spec(oop_index));
4108 }
4109 
4110 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4111 void MacroAssembler::tlab_allocate(Register obj,
4112                                    Register var_size_in_bytes,
4113                                    int con_size_in_bytes,
4114                                    Register t1,
4115                                    Register t2,
4116                                    Label& slow_case) {
4117   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4118   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4119 }
4120 
4121 // Defines obj, preserves var_size_in_bytes
4122 void MacroAssembler::eden_allocate(Register obj,
4123                                    Register var_size_in_bytes,
4124                                    int con_size_in_bytes,
4125                                    Register t1,
4126                                    Label& slow_case) {
4127   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4128   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4129 }
4130 
4131 // Zero words; len is in bytes
4132 // Destroys all registers except addr
4133 // len must be a nonzero multiple of wordSize
4134 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4135   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4136 
4137 #ifdef ASSERT
4138   { Label L;
4139     tst(len, BytesPerWord - 1);
4140     br(Assembler::EQ, L);
4141     stop("len is not a multiple of BytesPerWord");
4142     bind(L);
4143   }
4144 #endif
4145 
4146 #ifndef PRODUCT
4147   block_comment("zero memory");
4148 #endif
4149 
4150   Label loop;
4151   Label entry;
4152 
4153 //  Algorithm:
4154 //
4155 //    scratch1 = cnt & 7;
4156 //    cnt -= scratch1;
4157 //    p += scratch1;
4158 //    switch (scratch1) {
4159 //      do {
4160 //        cnt -= 8;
4161 //          p[-8] = 0;
4162 //        case 7:
4163 //          p[-7] = 0;
4164 //        case 6:
4165 //          p[-6] = 0;
4166 //          // ...
4167 //        case 1:
4168 //          p[-1] = 0;
4169 //        case 0:
4170 //          p += 8;
4171 //      } while (cnt);
4172 //    }
4173 
4174   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4175 
4176   lsr(len, len, LogBytesPerWord);
4177   andr(rscratch1, len, unroll - 1);  // rscratch1 = cnt % unroll
4178   sub(len, len, rscratch1);      // cnt -= rscratch1
4179   // t1 always points to the end of the region we're about to zero
4180   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
4181   adr(rscratch2, entry);
4182   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4183   br(rscratch2);
4184   bind(loop);
4185   sub(len, len, unroll);
4186   for (int i = -unroll; i < 0; i++)
4187     Assembler::str(zr, Address(t1, i * wordSize));
4188   bind(entry);
4189   add(t1, t1, unroll * wordSize);
4190   cbnz(len, loop);
4191 }
4192 
4193 void MacroAssembler::verify_tlab() {
4194 #ifdef ASSERT
4195   if (UseTLAB && VerifyOops) {
4196     Label next, ok;
4197 
4198     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4199 
4200     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4201     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4202     cmp(rscratch2, rscratch1);
4203     br(Assembler::HS, next);
4204     STOP("assert(top >= start)");
4205     should_not_reach_here();
4206 
4207     bind(next);
4208     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4209     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4210     cmp(rscratch2, rscratch1);
4211     br(Assembler::HS, ok);
4212     STOP("assert(top <= end)");
4213     should_not_reach_here();
4214 
4215     bind(ok);
4216     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4217   }
4218 #endif
4219 }
4220 
4221 // Writes to successive stack pages, until the given size is reached, to
4222 // check for stack overflow + shadow pages.  This clobbers tmp.
4223 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4224   assert_different_registers(tmp, size, rscratch1);
4225   mov(tmp, sp);
4226   // Bang stack for total size given plus shadow page size.
4227   // Bang one page at a time because large size can bang beyond yellow and
4228   // red zones.
4229   Label loop;
4230   mov(rscratch1, os::vm_page_size());
4231   bind(loop);
4232   lea(tmp, Address(tmp, -os::vm_page_size()));
4233   subsw(size, size, rscratch1);
4234   str(size, Address(tmp));
4235   br(Assembler::GT, loop);
4236 
4237   // Bang down shadow pages too.
4238   // At this point, (tmp-0) is the last address touched, so don't
4239   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4240   // was post-decremented.)  Skip this address by starting at i=1, and
4241   // touch a few more pages below.  N.B.  It is important to touch all
4242   // the way down to and including i=StackShadowPages.
4243   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
4244     // This could be any sized move, but it can serve as a debugging crumb,
4245     // so the bigger the better.
4246     lea(tmp, Address(tmp, -os::vm_page_size()));
4247     str(size, Address(tmp));
4248   }
4249 }
4250 
4251 
4252 // Move the address of the polling page into dest.
4253 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4254   if (SafepointMechanism::uses_thread_local_poll()) {
4255     ldr(dest, Address(rthread, Thread::polling_page_offset()));
4256   } else {
4257     unsigned long off;
4258     adrp(dest, Address(page, rtype), off);
4259     assert(off == 0, "polling page must be page aligned");
4260   }
4261 }
4262 
4263 // Move the address of the polling page into r, then read the polling
4264 // page.
4265 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4266   get_polling_page(r, page, rtype);
4267   return read_polling_page(r, rtype);
4268 }
4269 
4270 // Read the polling page.  The address of the polling page must
4271 // already be in r.
4272 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4273   InstructionMark im(this);
4274   code_section()->relocate(inst_mark(), rtype);
4275   ldrw(zr, Address(r, 0));
4276   return inst_mark();
4277 }
4278 
4279 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4280   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4281   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4282   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4283   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4284   long offset_low = dest_page - low_page;
4285   long offset_high = dest_page - high_page;
4286 
4287   assert(is_valid_AArch64_address(dest.target()), "bad address");
4288   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4289 
4290   InstructionMark im(this);
4291   code_section()->relocate(inst_mark(), dest.rspec());
4292   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4293   // the code cache so that if it is relocated we know it will still reach
4294   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4295     _adrp(reg1, dest.target());
4296   } else {
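    // Descriptive note: the target is out of ADRP range here, so we materialize
    // an address that has the target's low 32 bits but lies in the same 4GB
    // region as the current pc, reach it with ADRP, and then overwrite bits
    // 47:32 with MOVK.  The low 12 bits are handed back in byte_offset for the
    // caller to add.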
4297     unsigned long target = (unsigned long)dest.target();
4298     unsigned long adrp_target
4299       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4300 
4301     _adrp(reg1, (address)adrp_target);
4302     movk(reg1, target >> 32, 32);
4303   }
4304   byte_offset = (unsigned long)dest.target() & 0xfff;
4305 }
4306 
4307 void MacroAssembler::load_byte_map_base(Register reg) {
4308   CardTable::CardValue* byte_map_base =
4309     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4310 
4311   if (is_valid_AArch64_address((address)byte_map_base)) {
4312     // Strictly speaking the byte_map_base isn't an address at all,
4313     // and it might even be negative.
4314     unsigned long offset;
4315     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4316     // We expect offset to be zero with most collectors.
4317     if (offset != 0) {
4318       add(reg, reg, offset);
4319     }
4320   } else {
4321     mov(reg, (uint64_t)byte_map_base);
4322   }
4323 }
4324 
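// Note on the size thresholds used below: the scaled signed immediate of a
// 64-bit STP/LDP only reaches on the order of +/-(1 << 9) bytes, and a plain
// ADD/SUB immediate is 12 bits, hence the (1 << 9) and (1 << 12) cut-offs
// (with room left for the saved rfp/lr pair).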
4325 void MacroAssembler::build_frame(int framesize) {
4326   assert(framesize > 0, "framesize must be > 0");
4327   if (framesize < ((1 << 9) + 2 * wordSize)) {
4328     sub(sp, sp, framesize);
4329     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4330     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4331   } else {
4332     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4333     if (PreserveFramePointer) mov(rfp, sp);
4334     if (framesize < ((1 << 12) + 2 * wordSize))
4335       sub(sp, sp, framesize - 2 * wordSize);
4336     else {
4337       mov(rscratch1, framesize - 2 * wordSize);
4338       sub(sp, sp, rscratch1);
4339     }
4340   }
4341 }
4342 
4343 void MacroAssembler::remove_frame(int framesize) {
4344   assert(framesize > 0, "framesize must be > 0");
4345   if (framesize < ((1 << 9) + 2 * wordSize)) {
4346     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4347     add(sp, sp, framesize);
4348   } else {
4349     if (framesize < ((1 << 12) + 2 * wordSize))
4350       add(sp, sp, framesize - 2 * wordSize);
4351     else {
4352       mov(rscratch1, framesize - 2 * wordSize);
4353       add(sp, sp, rscratch1);
4354     }
4355     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4356   }
4357 }
4358 
4359 #ifdef COMPILER2
4360 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4361 
4362 // Search for str1 in str2 and return index or -1
4363 void MacroAssembler::string_indexof(Register str2, Register str1,
4364                                     Register cnt2, Register cnt1,
4365                                     Register tmp1, Register tmp2,
4366                                     Register tmp3, Register tmp4,
4367                                     Register tmp5, Register tmp6,
4368                                     int icnt1, Register result, int ae) {
4369   // NOTE: tmp5, tmp6 can be zr depending on specific method version
4370   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4371 
4372   Register ch1 = rscratch1;
4373   Register ch2 = rscratch2;
4374   Register cnt1tmp = tmp1;
4375   Register cnt2tmp = tmp2;
4376   Register cnt1_neg = cnt1;
4377   Register cnt2_neg = cnt2;
4378   Register result_tmp = tmp4;
4379 
4380   bool isL = ae == StrIntrinsicNode::LL;
4381 
4382   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4383   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4384   int str1_chr_shift = str1_isL ? 0:1;
4385   int str2_chr_shift = str2_isL ? 0:1;
4386   int str1_chr_size = str1_isL ? 1:2;
4387   int str2_chr_size = str2_isL ? 1:2;
4388   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4389                                       (chr_insn)&MacroAssembler::ldrh;
4390   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4391                                       (chr_insn)&MacroAssembler::ldrh;
4392   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4393   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4394 
4395   // Note, inline_string_indexOf() generates checks:
4396   // if (substr.count > string.count) return -1;
4397   // if (substr.count == 0) return 0;
4398 
4399   // We have two strings, a source string in str2, cnt2 and a pattern string
4400   // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
4401 
4402   // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
4403   // With a small pattern and source we use a linear scan.
4404 
4405   if (icnt1 == -1) {
4406     sub(result_tmp, cnt2, cnt1);
4407     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4408     br(LT, LINEARSEARCH);
4409     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
4410     subs(zr, cnt1, 256);
4411     lsr(tmp1, cnt2, 2);
4412     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
4413     br(GE, LINEARSTUB);
4414   }
4415 
4416 // The Boyer-Moore algorithm is based on the description here:
4417 //
4418 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4419 //
4420 // This describes an algorithm with two shift rules: the 'Bad Character' rule
4421 // and the 'Good Suffix' rule.
4422 //
4423 // These rules are essentially heuristics for how far we can shift the
4424 // pattern along the search string.
4425 //
4426 // The implementation here uses the 'Bad Character' rule only because of the
4427 // complexity of initialisation for the 'Good Suffix' rule.
4428 //
4429 // This is also known as the Boyer-Moore-Horspool algorithm:
4430 //
4431 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4432 //
4433 // This particular implementation has a few Java-specific optimizations.
4434 //
4435 // #define ASIZE 256
4436 //
4437 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4438 //       int i, j;
4439 //       unsigned c;
4440 //       unsigned char bc[ASIZE];
4441 //
4442 //       /* Preprocessing */
4443 //       for (i = 0; i < ASIZE; ++i)
4444 //          bc[i] = m;
4445 //       for (i = 0; i < m - 1; ) {
4446 //          c = x[i];
4447 //          ++i;
4448 //          // c < 256 for Latin1 string, so, no need for branch
4449 //          #ifdef PATTERN_STRING_IS_LATIN1
4450 //          bc[c] = m - i;
4451 //          #else
4452 //          if (c < ASIZE) bc[c] = m - i;
4453 //          #endif
4454 //       }
4455 //
4456 //       /* Searching */
4457 //       j = 0;
4458 //       while (j <= n - m) {
4459 //          c = y[j+m-1];
4460 //          if (x[m-1] == c)
4461 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
4462 //          if (i < 0) return j;
4463 //          // c < 256 for Latin1 string, so, no need for branch
4464 //          #ifdef SOURCE_STRING_IS_LATIN1
4465 //          // LL case: (c< 256) always true. Remove branch
4466 //          j += bc[y[j+m-1]];
4467 //          #endif
4468 //          #ifndef PATTERN_STRING_IS_UTF
4469 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
4470 //          if (c < ASIZE)
4471 //            j += bc[y[j+m-1]];
4472 //          else
4473 //            j += 1;
4474 //          #endif
4475 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
4476 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
4477 //          if (c < ASIZE)
4478 //            j += bc[y[j+m-1]];
4479 //          else
4480 //            j += m;
4481 //          #endif
4482 //       }
4483 //    }
4484 
4485   if (icnt1 == -1) {
4486     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4487         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4488     Register cnt1end = tmp2;
4489     Register str2end = cnt2;
4490     Register skipch = tmp2;
4491 
4492     // str1 length is >= 8, so we can read at least 1 register for the cases when
4493     // UTF->Latin1 conversion is not needed (8 chars for LL or 4 for UU) and half a
4494     // register for the UL case. We'll re-read the last character in the inner
4495     // pre-loop code so that there is a single outer pre-loop load.
4496     const int firstStep = isL ? 7 : 3;
4497 
4498     const int ASIZE = 256;
4499     const int STORED_BYTES = 32; // number of bytes stored per instruction
4500     sub(sp, sp, ASIZE);
4501     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4502     mov(ch1, sp);
4503     BIND(BM_INIT_LOOP);
4504       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4505       subs(tmp5, tmp5, 1);
4506       br(GT, BM_INIT_LOOP);
4507 
4508       sub(cnt1tmp, cnt1, 1);
4509       mov(tmp5, str2);
4510       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4511       sub(ch2, cnt1, 1);
4512       mov(tmp3, str1);
4513     BIND(BCLOOP);
4514       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4515       if (!str1_isL) {
4516         subs(zr, ch1, ASIZE);
4517         br(HS, BCSKIP);
4518       }
4519       strb(ch2, Address(sp, ch1));
4520     BIND(BCSKIP);
4521       subs(ch2, ch2, 1);
4522       br(GT, BCLOOP);
4523 
4524       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4525       if (str1_isL == str2_isL) {
4526         // load last 8 bytes (8LL/4UU symbols)
4527         ldr(tmp6, Address(tmp6, -wordSize));
4528       } else {
4529         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
4530         // convert Latin1 to UTF. We'll have to wait until the load has completed,
4531         // but it's still faster than per-character loads + checks
4532         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4533         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4534         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4535         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4536         orr(ch2, ch1, ch2, LSL, 16);
4537         orr(tmp6, tmp6, tmp3, LSL, 48);
4538         orr(tmp6, tmp6, ch2, LSL, 16);
4539       }
4540     BIND(BMLOOPSTR2);
4541       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4542       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4543       if (str1_isL == str2_isL) {
4544         // re-init tmp3. It's free because it's executed in parallel with the
4545         // load above. The alternative is to initialize it before the loop, but
4546         // that would hurt performance on in-order systems with 2 or more ld/st pipelines
4547         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4548       }
4549       if (!isL) { // UU/UL case
4550         lsl(ch2, cnt1tmp, 1); // offset in bytes
4551       }
4552       cmp(tmp3, skipch);
4553       br(NE, BMSKIP);
4554       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4555       mov(ch1, tmp6);
4556       if (isL) {
4557         b(BMLOOPSTR1_AFTER_LOAD);
4558       } else {
4559         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4560         b(BMLOOPSTR1_CMP);
4561       }
4562     BIND(BMLOOPSTR1);
4563       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4564       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4565     BIND(BMLOOPSTR1_AFTER_LOAD);
4566       subs(cnt1tmp, cnt1tmp, 1);
4567       br(LT, BMLOOPSTR1_LASTCMP);
4568     BIND(BMLOOPSTR1_CMP);
4569       cmp(ch1, ch2);
4570       br(EQ, BMLOOPSTR1);
4571     BIND(BMSKIP);
4572       if (!isL) {
4573         // if we've met a UTF symbol while searching with a Latin1 pattern, then
4574         // we can skip cnt1 symbols
4575         if (str1_isL != str2_isL) {
4576           mov(result_tmp, cnt1);
4577         } else {
4578           mov(result_tmp, 1);
4579         }
4580         subs(zr, skipch, ASIZE);
4581         br(HS, BMADV);
4582       }
4583       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4584     BIND(BMADV);
4585       sub(cnt1tmp, cnt1, 1);
4586       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4587       cmp(str2, str2end);
4588       br(LE, BMLOOPSTR2);
4589       add(sp, sp, ASIZE);
4590       b(NOMATCH);
4591     BIND(BMLOOPSTR1_LASTCMP);
4592       cmp(ch1, ch2);
4593       br(NE, BMSKIP);
4594     BIND(BMMATCH);
4595       sub(result, str2, tmp5);
4596       if (!str2_isL) lsr(result, result, 1);
4597       add(sp, sp, ASIZE);
4598       b(DONE);
4599 
4600     BIND(LINEARSTUB);
4601     cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
4602     br(LT, LINEAR_MEDIUM);
4603     mov(result, zr);
4604     RuntimeAddress stub = NULL;
4605     if (isL) {
4606       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4607       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4608     } else if (str1_isL) {
4609       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
4610       assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4611     } else {
4612       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4613       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4614     }
4615     trampoline_call(stub);
4616     b(DONE);
4617   }
4618 
4619   BIND(LINEARSEARCH);
4620   {
4621     Label DO1, DO2, DO3;
4622 
4623     Register str2tmp = tmp2;
4624     Register first = tmp3;
4625 
4626     if (icnt1 == -1)
4627     {
4628         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4629 
4630         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
4631         br(LT, DOSHORT);
4632       BIND(LINEAR_MEDIUM);
4633         (this->*str1_load_1chr)(first, Address(str1));
4634         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4635         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4636         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4637         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4638 
4639       BIND(FIRST_LOOP);
4640         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4641         cmp(first, ch2);
4642         br(EQ, STR1_LOOP);
4643       BIND(STR2_NEXT);
4644         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4645         br(LE, FIRST_LOOP);
4646         b(NOMATCH);
4647 
4648       BIND(STR1_LOOP);
4649         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4650         add(cnt2tmp, cnt2_neg, str2_chr_size);
4651         br(GE, MATCH);
4652 
4653       BIND(STR1_NEXT);
4654         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4655         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4656         cmp(ch1, ch2);
4657         br(NE, STR2_NEXT);
4658         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4659         add(cnt2tmp, cnt2tmp, str2_chr_size);
4660         br(LT, STR1_NEXT);
4661         b(MATCH);
4662 
4663       BIND(DOSHORT);
4664       if (str1_isL == str2_isL) {
4665         cmp(cnt1, (u1)2);
4666         br(LT, DO1);
4667         br(GT, DO3);
4668       }
4669     }
4670 
4671     if (icnt1 == 4) {
4672       Label CH1_LOOP;
4673 
4674         (this->*load_4chr)(ch1, str1);
4675         sub(result_tmp, cnt2, 4);
4676         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4677         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4678 
4679       BIND(CH1_LOOP);
4680         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4681         cmp(ch1, ch2);
4682         br(EQ, MATCH);
4683         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4684         br(LE, CH1_LOOP);
4685         b(NOMATCH);
4686       }
4687 
4688     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4689       Label CH1_LOOP;
4690 
4691       BIND(DO2);
4692         (this->*load_2chr)(ch1, str1);
4693         if (icnt1 == 2) {
4694           sub(result_tmp, cnt2, 2);
4695         }
4696         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4697         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4698       BIND(CH1_LOOP);
4699         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4700         cmp(ch1, ch2);
4701         br(EQ, MATCH);
4702         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4703         br(LE, CH1_LOOP);
4704         b(NOMATCH);
4705     }
4706 
4707     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4708       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4709 
4710       BIND(DO3);
4711         (this->*load_2chr)(first, str1);
4712         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4713         if (icnt1 == 3) {
4714           sub(result_tmp, cnt2, 3);
4715         }
4716         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4717         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4718       BIND(FIRST_LOOP);
4719         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4720         cmpw(first, ch2);
4721         br(EQ, STR1_LOOP);
4722       BIND(STR2_NEXT);
4723         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4724         br(LE, FIRST_LOOP);
4725         b(NOMATCH);
4726 
4727       BIND(STR1_LOOP);
4728         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4729         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4730         cmp(ch1, ch2);
4731         br(NE, STR2_NEXT);
4732         b(MATCH);
4733     }
4734 
4735     if (icnt1 == -1 || icnt1 == 1) {
4736       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4737 
4738       BIND(DO1);
4739         (this->*str1_load_1chr)(ch1, str1);
4740         cmp(cnt2, (u1)8);
4741         br(LT, DO1_SHORT);
4742 
4743         sub(result_tmp, cnt2, 8/str2_chr_size);
4744         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4745         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4746         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4747 
4748         if (str2_isL) {
4749           orr(ch1, ch1, ch1, LSL, 8);
4750         }
4751         orr(ch1, ch1, ch1, LSL, 16);
4752         orr(ch1, ch1, ch1, LSL, 32);
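      // The 8-byte loop below uses the classic SWAR "find a zero lane" trick on
      // the xor-difference D = loaded word ^ broadcast(pattern char): with tmp3
      // holding 0x01.. (or 0x0001..) per lane, (D - tmp3) & ~D & 0x80.. (or
      // 0x8000..) is non-zero iff some lane of D is zero, i.e. the word contains
      // the pattern character.  The orr with 0x7f../0x7fff.. plus bics implements
      // the "& ~D & high-bit" part.  (Descriptive note, not extra generated code.)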
4753       BIND(CH1_LOOP);
4754         ldr(ch2, Address(str2, cnt2_neg));
4755         eor(ch2, ch1, ch2);
4756         sub(tmp1, ch2, tmp3);
4757         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4758         bics(tmp1, tmp1, tmp2);
4759         br(NE, HAS_ZERO);
4760         adds(cnt2_neg, cnt2_neg, 8);
4761         br(LT, CH1_LOOP);
4762 
4763         cmp(cnt2_neg, (u1)8);
4764         mov(cnt2_neg, 0);
4765         br(LT, CH1_LOOP);
4766         b(NOMATCH);
4767 
4768       BIND(HAS_ZERO);
4769         rev(tmp1, tmp1);
4770         clz(tmp1, tmp1);
4771         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4772         b(MATCH);
4773 
4774       BIND(DO1_SHORT);
4775         mov(result_tmp, cnt2);
4776         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4777         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4778       BIND(DO1_LOOP);
4779         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4780         cmpw(ch1, ch2);
4781         br(EQ, MATCH);
4782         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4783         br(LT, DO1_LOOP);
4784     }
4785   }
4786   BIND(NOMATCH);
4787     mov(result, -1);
4788     b(DONE);
4789   BIND(MATCH);
4790     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4791   BIND(DONE);
4792 }
4793 
4794 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4795 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4796 
4797 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4798                                          Register ch, Register result,
4799                                          Register tmp1, Register tmp2, Register tmp3)
4800 {
4801   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4802   Register cnt1_neg = cnt1;
4803   Register ch1 = rscratch1;
4804   Register result_tmp = rscratch2;
4805 
4806   cmp(cnt1, (u1)4);
4807   br(LT, DO1_SHORT);
4808 
4809   orr(ch, ch, ch, LSL, 16);
4810   orr(ch, ch, ch, LSL, 32);
4811 
4812   sub(cnt1, cnt1, 4);
4813   mov(result_tmp, cnt1);
4814   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4815   sub(cnt1_neg, zr, cnt1, LSL, 1);
4816 
4817   mov(tmp3, 0x0001000100010001);
4818 
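  // The main loop below applies the SWAR "find a zero halfword" trick to the
  // xor-difference D = loaded word ^ broadcast(ch), four UTF-16 chars at a time:
  //
  //   (D - 0x0001000100010001) & ~D & 0x8000800080008000
  //
  // is non-zero iff some 16-bit lane of D is zero, i.e. the char was found.
  // The orr with 0x7fff7fff7fff7fff followed by bics computes the
  // "& ~D & 0x8000..." part.  (Descriptive note, not extra generated code.)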
4819   BIND(CH1_LOOP);
4820     ldr(ch1, Address(str1, cnt1_neg));
4821     eor(ch1, ch, ch1);
4822     sub(tmp1, ch1, tmp3);
4823     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4824     bics(tmp1, tmp1, tmp2);
4825     br(NE, HAS_ZERO);
4826     adds(cnt1_neg, cnt1_neg, 8);
4827     br(LT, CH1_LOOP);
4828 
4829     cmp(cnt1_neg, (u1)8);
4830     mov(cnt1_neg, 0);
4831     br(LT, CH1_LOOP);
4832     b(NOMATCH);
4833 
4834   BIND(HAS_ZERO);
4835     rev(tmp1, tmp1);
4836     clz(tmp1, tmp1);
4837     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4838     b(MATCH);
4839 
4840   BIND(DO1_SHORT);
4841     mov(result_tmp, cnt1);
4842     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4843     sub(cnt1_neg, zr, cnt1, LSL, 1);
4844   BIND(DO1_LOOP);
4845     ldrh(ch1, Address(str1, cnt1_neg));
4846     cmpw(ch, ch1);
4847     br(EQ, MATCH);
4848     adds(cnt1_neg, cnt1_neg, 2);
4849     br(LT, DO1_LOOP);
4850   BIND(NOMATCH);
4851     mov(result, -1);
4852     b(DONE);
4853   BIND(MATCH);
4854     add(result, result_tmp, cnt1_neg, ASR, 1);
4855   BIND(DONE);
4856 }
4857 
4858 // Compare strings.
4859 void MacroAssembler::string_compare(Register str1, Register str2,
4860     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4861     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4862   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4863       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4864       SHORT_LOOP_START, TAIL_CHECK;
4865 
4866   const u1 STUB_THRESHOLD = 64 + 8;
4867   bool isLL = ae == StrIntrinsicNode::LL;
4868   bool isLU = ae == StrIntrinsicNode::LU;
4869   bool isUL = ae == StrIntrinsicNode::UL;
4870 
4871   bool str1_isL = isLL || isLU;
4872   bool str2_isL = isLL || isUL;
4873 
4874   int str1_chr_shift = str1_isL ? 0 : 1;
4875   int str2_chr_shift = str2_isL ? 0 : 1;
4876   int str1_chr_size = str1_isL ? 1 : 2;
4877   int str2_chr_size = str2_isL ? 1 : 2;
4878   int minCharsInWord = isLL ? wordSize : wordSize/2;
4879 
4880   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4881   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4882                                       (chr_insn)&MacroAssembler::ldrh;
4883   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4884                                       (chr_insn)&MacroAssembler::ldrh;
4885   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4886                             (uxt_insn)&MacroAssembler::uxthw;
4887 
4888   BLOCK_COMMENT("string_compare {");
4889 
4890   // Bizarrely, the counts are passed in bytes, regardless of whether they
4891   // are L or U strings, however the result is always in characters.
4892   if (!str1_isL) asrw(cnt1, cnt1, 1);
4893   if (!str2_isL) asrw(cnt2, cnt2, 1);
4894 
4895   // Compute the minimum of the string lengths and save the difference.
4896   subsw(result, cnt1, cnt2);
4897   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4898 
4899   // A very short string
4900   cmpw(cnt2, minCharsInWord);
4901   br(Assembler::LE, SHORT_STRING);
4902 
4903   // Compare longwords
4904   // load first parts of strings and finish initialization while loading
4905   {
4906     if (str1_isL == str2_isL) { // LL or UU
4907       ldr(tmp1, Address(str1));
4908       cmp(str1, str2);
4909       br(Assembler::EQ, DONE);
4910       ldr(tmp2, Address(str2));
4911       cmp(cnt2, STUB_THRESHOLD);
4912       br(GE, STUB);
4913       subsw(cnt2, cnt2, minCharsInWord);
4914       br(EQ, TAIL_CHECK);
4915       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4916       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4917       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4918     } else if (isLU) {
4919       ldrs(vtmp, Address(str1));
4920       cmp(str1, str2);
4921       br(Assembler::EQ, DONE);
4922       ldr(tmp2, Address(str2));
4923       cmp(cnt2, STUB_THRESHOLD);
4924       br(GE, STUB);
4925       subw(cnt2, cnt2, 4);
4926       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4927       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4928       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4929       zip1(vtmp, T8B, vtmp, vtmpZ);
4930       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4931       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4932       add(cnt1, cnt1, 4);
4933       fmovd(tmp1, vtmp);
4934     } else { // UL case
4935       ldr(tmp1, Address(str1));
4936       cmp(str1, str2);
4937       br(Assembler::EQ, DONE);
4938       ldrs(vtmp, Address(str2));
4939       cmp(cnt2, STUB_THRESHOLD);
4940       br(GE, STUB);
4941       subw(cnt2, cnt2, 4);
4942       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4943       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4944       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4945       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4946       zip1(vtmp, T8B, vtmp, vtmpZ);
4947       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4948       add(cnt1, cnt1, 8);
4949       fmovd(tmp2, vtmp);
4950     }
4951     adds(cnt2, cnt2, isUL ? 4 : 8);
4952     br(GE, TAIL);
4953     eor(rscratch2, tmp1, tmp2);
4954     cbnz(rscratch2, DIFFERENCE);
4955     // main loop
4956     bind(NEXT_WORD);
4957     if (str1_isL == str2_isL) {
4958       ldr(tmp1, Address(str1, cnt2));
4959       ldr(tmp2, Address(str2, cnt2));
4960       adds(cnt2, cnt2, 8);
4961     } else if (isLU) {
4962       ldrs(vtmp, Address(str1, cnt1));
4963       ldr(tmp2, Address(str2, cnt2));
4964       add(cnt1, cnt1, 4);
4965       zip1(vtmp, T8B, vtmp, vtmpZ);
4966       fmovd(tmp1, vtmp);
4967       adds(cnt2, cnt2, 8);
4968     } else { // UL
4969       ldrs(vtmp, Address(str2, cnt2));
4970       ldr(tmp1, Address(str1, cnt1));
4971       zip1(vtmp, T8B, vtmp, vtmpZ);
4972       add(cnt1, cnt1, 8);
4973       fmovd(tmp2, vtmp);
4974       adds(cnt2, cnt2, 4);
4975     }
4976     br(GE, TAIL);
4977 
4978     eor(rscratch2, tmp1, tmp2);
4979     cbz(rscratch2, NEXT_WORD);
4980     b(DIFFERENCE);
4981     bind(TAIL);
4982     eor(rscratch2, tmp1, tmp2);
4983     cbnz(rscratch2, DIFFERENCE);
4984     // Last longword.  In the case where length == 4 we compare the
4985     // same longword twice, but that's still faster than another
4986     // conditional branch.
4987     if (str1_isL == str2_isL) {
4988       ldr(tmp1, Address(str1));
4989       ldr(tmp2, Address(str2));
4990     } else if (isLU) {
4991       ldrs(vtmp, Address(str1));
4992       ldr(tmp2, Address(str2));
4993       zip1(vtmp, T8B, vtmp, vtmpZ);
4994       fmovd(tmp1, vtmp);
4995     } else { // UL
4996       ldrs(vtmp, Address(str2));
4997       ldr(tmp1, Address(str1));
4998       zip1(vtmp, T8B, vtmp, vtmpZ);
4999       fmovd(tmp2, vtmp);
5000     }
5001     bind(TAIL_CHECK);
5002     eor(rscratch2, tmp1, tmp2);
5003     cbz(rscratch2, DONE);
5004 
5005     // Find the first different characters in the longwords and
5006     // compute their difference.
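    // Roughly, in C terms (an illustrative sketch; the loads are little-endian,
    // so the first differing character occupies the lowest differing bits, and
    // chr_mask is a name used only in this sketch: 0xff for Latin-1 characters,
    // 0xffff for UTF-16):
    //
    //   uint64_t diff = tmp1 ^ tmp2;             // already in rscratch2
    //   int bit = count_trailing_zeros(diff);    // rev + clz below
    //   bit &= isLL ? ~7 : ~15;                  // round down to a char boundary
    //   result = (int)((tmp1 >> bit) & chr_mask) - (int)((tmp2 >> bit) & chr_mask);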
5007     bind(DIFFERENCE);
5008     rev(rscratch2, rscratch2);
5009     clz(rscratch2, rscratch2);
5010     andr(rscratch2, rscratch2, isLL ? -8 : -16);
5011     lsrv(tmp1, tmp1, rscratch2);
5012     (this->*ext_chr)(tmp1, tmp1);
5013     lsrv(tmp2, tmp2, rscratch2);
5014     (this->*ext_chr)(tmp2, tmp2);
5015     subw(result, tmp1, tmp2);
5016     b(DONE);
5017   }
5018 
5019   bind(STUB);
5020     RuntimeAddress stub = NULL;
5021     switch(ae) {
5022       case StrIntrinsicNode::LL:
5023         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
5024         break;
5025       case StrIntrinsicNode::UU:
5026         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
5027         break;
5028       case StrIntrinsicNode::LU:
5029         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
5030         break;
5031       case StrIntrinsicNode::UL:
5032         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
5033         break;
5034       default:
5035         ShouldNotReachHere();
5036      }
5037     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
5038     trampoline_call(stub);
5039     b(DONE);
5040 
5041   bind(SHORT_STRING);
5042   // Is the minimum length zero?
5043   cbz(cnt2, DONE);
5044   // arrange the code so that most branches are taken while loading, and the next
5045   // characters are loaded while the previous ones are being compared
5046   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5047   subs(cnt2, cnt2, 1);
5048   br(EQ, SHORT_LAST_INIT);
5049   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5050   b(SHORT_LOOP_START);
5051   bind(SHORT_LOOP);
5052   subs(cnt2, cnt2, 1);
5053   br(EQ, SHORT_LAST);
5054   bind(SHORT_LOOP_START);
5055   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
5056   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
5057   cmp(tmp1, cnt1);
5058   br(NE, SHORT_LOOP_TAIL);
5059   subs(cnt2, cnt2, 1);
5060   br(EQ, SHORT_LAST2);
5061   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5062   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5063   cmp(tmp2, rscratch1);
5064   br(EQ, SHORT_LOOP);
5065   sub(result, tmp2, rscratch1);
5066   b(DONE);
5067   bind(SHORT_LOOP_TAIL);
5068   sub(result, tmp1, cnt1);
5069   b(DONE);
5070   bind(SHORT_LAST2);
5071   cmp(tmp2, rscratch1);
5072   br(EQ, DONE);
5073   sub(result, tmp2, rscratch1);
5074 
5075   b(DONE);
5076   bind(SHORT_LAST_INIT);
5077   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5078   bind(SHORT_LAST);
5079   cmp(tmp1, cnt1);
5080   br(EQ, DONE);
5081   sub(result, tmp1, cnt1);
5082 
5083   bind(DONE);
5084 
5085   BLOCK_COMMENT("} string_compare");
5086 }
5087 #endif // COMPILER2
5088 
5089 // This method checks whether the provided byte array contains a byte with the highest bit set.
5090 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
5091     // The simple and most common case of a small, aligned array that is not at
5092     // the end of a memory page is handled here. All other cases go to the stub.
5093     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
5094     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5095     assert_different_registers(ary1, len, result);
5096 
5097     cmpw(len, 0);
5098     br(LE, SET_RESULT);
5099     cmpw(len, 4 * wordSize);
5100     br(GE, STUB_LONG); // if size > 32 then go to the stub
5101 
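    // Page-crossing check, roughly: shifting the address left by
    // 64 - log2(page_size) leaves only the in-page offset (now in the top bits
    // of the register); adding 4 * wordSize shifted the same way then sets the
    // carry flag iff the 32-byte read would reach or cross the next page
    // boundary.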
5102     int shift = 64 - exact_log2(os::vm_page_size());
5103     lsl(rscratch1, ary1, shift);
5104     mov(rscratch2, (size_t)(4 * wordSize) << shift);
5105     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
5106     br(CS, STUB); // if at the end of the page then go to the stub
5107     subs(len, len, wordSize);
5108     br(LT, END);
5109 
5110   BIND(LOOP);
5111     ldr(rscratch1, Address(post(ary1, wordSize)));
5112     tst(rscratch1, UPPER_BIT_MASK);
5113     br(NE, SET_RESULT);
5114     subs(len, len, wordSize);
5115     br(GE, LOOP);
5116     cmpw(len, -wordSize);
5117     br(EQ, SET_RESULT);
5118 
5119   BIND(END);
5120     ldr(result, Address(ary1));
5121     sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5122     lslv(result, result, len);
5123     tst(result, UPPER_BIT_MASK);
5124     b(SET_RESULT);
5125 
5126   BIND(STUB);
5127     RuntimeAddress has_neg =  RuntimeAddress(StubRoutines::aarch64::has_negatives());
5128     assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5129     trampoline_call(has_neg);
5130     b(DONE);
5131 
5132   BIND(STUB_LONG);
5133     RuntimeAddress has_neg_long =  RuntimeAddress(
5134             StubRoutines::aarch64::has_negatives_long());
5135     assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
5136     trampoline_call(has_neg_long);
5137     b(DONE);
5138 
5139   BIND(SET_RESULT);
5140     cset(result, NE); // set true or false
5141 
5142   BIND(DONE);
5143 }
5144 
5145 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5146                                    Register tmp4, Register tmp5, Register result,
5147                                    Register cnt1, int elem_size) {
5148   Label DONE, SAME;
5149   Register tmp1 = rscratch1;
5150   Register tmp2 = rscratch2;
5151   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5152   int elem_per_word = wordSize/elem_size;
5153   int log_elem_size = exact_log2(elem_size);
5154   int length_offset = arrayOopDesc::length_offset_in_bytes();
5155   int base_offset
5156     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5157   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5158 
5159   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5160   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5161 
5162 #ifndef PRODUCT
5163   {
5164     const char kind = (elem_size == 2) ? 'U' : 'L';
5165     char comment[64];
5166     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5167     BLOCK_COMMENT(comment);
5168   }
5169 #endif
5170 
5171   // if (a1 == a2)
5172   //     return true;
5173   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5174   br(EQ, SAME);
5175 
5176   if (UseSimpleArrayEquals) {
5177     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5178     // if (a1 == null || a2 == null)
5179     //     return false;
5180     // a1 & a2 == 0 means (some-pointer is null) or
5181     // (very-rare-or-even-probably-impossible-pointer-values)
5182     // so, we can save one branch in most cases
5183     tst(a1, a2);
5184     mov(result, false);
5185     br(EQ, A_MIGHT_BE_NULL);
5186     // if (a1.length != a2.length)
5187     //      return false;
5188     bind(A_IS_NOT_NULL);
5189     ldrw(cnt1, Address(a1, length_offset));
5190     ldrw(cnt2, Address(a2, length_offset));
5191     eorw(tmp5, cnt1, cnt2);
5192     cbnzw(tmp5, DONE);
5193     lea(a1, Address(a1, base_offset));
5194     lea(a2, Address(a2, base_offset));
5195     // Check for short strings, i.e. smaller than wordSize.
5196     subs(cnt1, cnt1, elem_per_word);
5197     br(Assembler::LT, SHORT);
5198     // Main 8 byte comparison loop.
5199     bind(NEXT_WORD); {
5200       ldr(tmp1, Address(post(a1, wordSize)));
5201       ldr(tmp2, Address(post(a2, wordSize)));
5202       subs(cnt1, cnt1, elem_per_word);
5203       eor(tmp5, tmp1, tmp2);
5204       cbnz(tmp5, DONE);
5205     } br(GT, NEXT_WORD);
5206     // Last longword.  In the case where length == 4 we compare the
5207     // same longword twice, but that's still faster than another
5208     // conditional branch.
5209     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5210     // length == 4.
5211     if (log_elem_size > 0)
5212       lsl(cnt1, cnt1, log_elem_size);
5213     ldr(tmp3, Address(a1, cnt1));
5214     ldr(tmp4, Address(a2, cnt1));
5215     eor(tmp5, tmp3, tmp4);
5216     cbnz(tmp5, DONE);
5217     b(SAME);
5218     bind(A_MIGHT_BE_NULL);
5219     // in case both a1 and a2 are not-null, proceed with loads
5220     cbz(a1, DONE);
5221     cbz(a2, DONE);
5222     b(A_IS_NOT_NULL);
5223     bind(SHORT);
5224 
5225     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5226     {
5227       ldrw(tmp1, Address(post(a1, 4)));
5228       ldrw(tmp2, Address(post(a2, 4)));
5229       eorw(tmp5, tmp1, tmp2);
5230       cbnzw(tmp5, DONE);
5231     }
5232     bind(TAIL03);
5233     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5234     {
5235       ldrh(tmp3, Address(post(a1, 2)));
5236       ldrh(tmp4, Address(post(a2, 2)));
5237       eorw(tmp5, tmp3, tmp4);
5238       cbnzw(tmp5, DONE);
5239     }
5240     bind(TAIL01);
5241     if (elem_size == 1) { // Only needed when comparing byte arrays.
5242       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5243       {
5244         ldrb(tmp1, a1);
5245         ldrb(tmp2, a2);
5246         eorw(tmp5, tmp1, tmp2);
5247         cbnzw(tmp5, DONE);
5248       }
5249     }
5250   } else {
5251     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
5252         CSET_EQ, LAST_CHECK;
5253     mov(result, false);
5254     cbz(a1, DONE);
5255     ldrw(cnt1, Address(a1, length_offset));
5256     cbz(a2, DONE);
5257     ldrw(cnt2, Address(a2, length_offset));
5258     // on most CPUs a2 is (surprisingly) still "locked" by the ldrw above, so it's
5259     // faster to perform another branch before comparing a1 and a2
5260     cmp(cnt1, (u1)elem_per_word);
5261     br(LE, SHORT); // short or same
5262     ldr(tmp3, Address(pre(a1, base_offset)));
5263     subs(zr, cnt1, stubBytesThreshold);
5264     br(GE, STUB);
5265     ldr(tmp4, Address(pre(a2, base_offset)));
5266     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
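    // tmp5 = -(cnt1 in bits).  LSLV takes its shift amount modulo 64, so a
    // later lslv by tmp5 shifts an xor-difference left by (64 - valid bits),
    // discarding the lanes that lie beyond the arrays' logical end before the
    // final, possibly overlapping comparison (see TAIL and LAST_CHECK).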
5267     cmp(cnt2, cnt1);
5268     br(NE, DONE);
5269 
5270     // Main 16 byte comparison loop with 2 exits
5271     bind(NEXT_DWORD); {
5272       ldr(tmp1, Address(pre(a1, wordSize)));
5273       ldr(tmp2, Address(pre(a2, wordSize)));
5274       subs(cnt1, cnt1, 2 * elem_per_word);
5275       br(LE, TAIL);
5276       eor(tmp4, tmp3, tmp4);
5277       cbnz(tmp4, DONE);
5278       ldr(tmp3, Address(pre(a1, wordSize)));
5279       ldr(tmp4, Address(pre(a2, wordSize)));
5280       cmp(cnt1, (u1)elem_per_word);
5281       br(LE, TAIL2);
5282       cmp(tmp1, tmp2);
5283     } br(EQ, NEXT_DWORD);
5284     b(DONE);
5285 
5286     bind(TAIL);
5287     eor(tmp4, tmp3, tmp4);
5288     eor(tmp2, tmp1, tmp2);
5289     lslv(tmp2, tmp2, tmp5);
5290     orr(tmp5, tmp4, tmp2);
5291     cmp(tmp5, zr);
5292     b(CSET_EQ);
5293 
5294     bind(TAIL2);
5295     eor(tmp2, tmp1, tmp2);
5296     cbnz(tmp2, DONE);
5297     b(LAST_CHECK);
5298 
5299     bind(STUB);
5300     ldr(tmp4, Address(pre(a2, base_offset)));
5301     cmp(cnt2, cnt1);
5302     br(NE, DONE);
5303     if (elem_size == 2) { // convert to byte counter
5304       lsl(cnt1, cnt1, 1);
5305     }
5306     eor(tmp5, tmp3, tmp4);
5307     cbnz(tmp5, DONE);
5308     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5309     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5310     trampoline_call(stub);
5311     b(DONE);
5312 
5313     bind(EARLY_OUT);
5314     // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
5315     // so if a2 == null we return false (0), else true; either way we can simply return a2
5316     mov(result, a2);
5317     b(DONE);
5318     bind(SHORT);
5319     cmp(cnt2, cnt1);
5320     br(NE, DONE);
5321     cbz(cnt1, SAME);
5322     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5323     ldr(tmp3, Address(a1, base_offset));
5324     ldr(tmp4, Address(a2, base_offset));
5325     bind(LAST_CHECK);
5326     eor(tmp4, tmp3, tmp4);
5327     lslv(tmp5, tmp4, tmp5);
5328     cmp(tmp5, zr);
5329     bind(CSET_EQ);
5330     cset(result, EQ);
5331     b(DONE);
5332   }
5333 
5334   bind(SAME);
5335   mov(result, true);
5336   // That's it.
5337   bind(DONE);
5338 
5339   BLOCK_COMMENT("} array_equals");
5340 }
5341 
5342 // Compare Strings
5343 
5344 // For Strings we're passed the address of the first characters in a1
5345 // and a2 and the length in cnt1.
5346 // elem_size is the element size in bytes: either 1 or 2.
5347 // There are two implementations.  For arrays >= 8 bytes, all
5348 // comparisons (including the final one, which may overlap) are
5349 // performed 8 bytes at a time.  For strings < 8 bytes, we compare a
5350 // word, then a halfword, and then a byte.
5351 
5352 void MacroAssembler::string_equals(Register a1, Register a2,
5353                                    Register result, Register cnt1, int elem_size)
5354 {
5355   Label SAME, DONE, SHORT, NEXT_WORD;
5356   Register tmp1 = rscratch1;
5357   Register tmp2 = rscratch2;
5358   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5359 
5360   assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
5361   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5362 
5363 #ifndef PRODUCT
5364   {
5365     const char kind = (elem_size == 2) ? 'U' : 'L';
5366     char comment[64];
5367     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5368     BLOCK_COMMENT(comment);
5369   }
5370 #endif
5371 
5372   mov(result, false);
5373 
5374   // Check for short strings, i.e. smaller than wordSize.
5375   subs(cnt1, cnt1, wordSize);
5376   br(Assembler::LT, SHORT);
5377   // Main 8 byte comparison loop.
5378   bind(NEXT_WORD); {
5379     ldr(tmp1, Address(post(a1, wordSize)));
5380     ldr(tmp2, Address(post(a2, wordSize)));
5381     subs(cnt1, cnt1, wordSize);
5382     eor(tmp1, tmp1, tmp2);
5383     cbnz(tmp1, DONE);
5384   } br(GT, NEXT_WORD);
5385   // Last longword.  In the case where length == 4 we compare the
5386   // same longword twice, but that's still faster than another
5387   // conditional branch.
5388   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5389   // length == 4.
5390   ldr(tmp1, Address(a1, cnt1));
5391   ldr(tmp2, Address(a2, cnt1));
5392   eor(tmp2, tmp1, tmp2);
5393   cbnz(tmp2, DONE);
5394   b(SAME);
5395 
5396   bind(SHORT);
5397   Label TAIL03, TAIL01;
5398 
5399   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5400   {
5401     ldrw(tmp1, Address(post(a1, 4)));
5402     ldrw(tmp2, Address(post(a2, 4)));
5403     eorw(tmp1, tmp1, tmp2);
5404     cbnzw(tmp1, DONE);
5405   }
5406   bind(TAIL03);
5407   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5408   {
5409     ldrh(tmp1, Address(post(a1, 2)));
5410     ldrh(tmp2, Address(post(a2, 2)));
5411     eorw(tmp1, tmp1, tmp2);
5412     cbnzw(tmp1, DONE);
5413   }
5414   bind(TAIL01);
5415   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5416     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5417     {
5418       ldrb(tmp1, a1);
5419       ldrb(tmp2, a2);
5420       eorw(tmp1, tmp1, tmp2);
5421       cbnzw(tmp1, DONE);
5422     }
5423   }
5424   // Arrays are equal.
5425   bind(SAME);
5426   mov(result, true);
5427 
5428   // That's it.
5429   bind(DONE);
5430   BLOCK_COMMENT("} string_equals");
5431 }
5432 
5433 
5434 // The size of the blocks erased by the zero_blocks stub.  We must
5435 // handle anything smaller than this ourselves in zero_words().
5436 const int MacroAssembler::zero_words_block_size = 8;
5437 
5438 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5439 // possible, handling small word counts locally and delegating
5440 // anything larger to the zero_blocks stub.  It is expanded many times
5441 // in compiled code, so it is important to keep it short.
5442 
5443 // ptr:   Address of a buffer to be zeroed.
5444 // cnt:   Count in HeapWords.
5445 //
5446 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5447 void MacroAssembler::zero_words(Register ptr, Register cnt)
5448 {
5449   assert(is_power_of_2(zero_words_block_size), "adjust this");
5450   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5451 
5452   BLOCK_COMMENT("zero_words {");
5453   cmp(cnt, (u1)zero_words_block_size);
5454   Label around;
5455   br(LO, around);
5456   {
5457     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5458     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5459     if (StubRoutines::aarch64::complete()) {
5460       trampoline_call(zero_blocks);
5461     } else {
5462       bl(zero_blocks);
5463     }
5464   }
5465   bind(around);
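  // At this point fewer than zero_words_block_size words should remain.  Peel
  // them off by binary decomposition: each tbz below tests one bit of cnt
  // (4 words, then 2) and stores that many zero words when the bit is set; the
  // final str handles a possible odd word.  Roughly:
  //
  //   if (cnt & 4) { store 4 words; }
  //   if (cnt & 2) { store 2 words; }
  //   if (cnt & 1) { store 1 word;  }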
5466   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5467     Label l;
5468     tbz(cnt, exact_log2(i), l);
5469     for (int j = 0; j < i; j += 2) {
5470       stp(zr, zr, post(ptr, 16));
5471     }
5472     bind(l);
5473   }
5474   {
5475     Label l;
5476     tbz(cnt, 0, l);
5477     str(zr, Address(ptr));
5478     bind(l);
5479   }
5480   BLOCK_COMMENT("} zero_words");
5481 }
5482 
5483 // base:         Address of a buffer to be zeroed, 8-byte aligned.
5484 // cnt:          Immediate count in HeapWords.
5485 #define SmallArraySize (18 * BytesPerLong)
5486 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
5487 {
5488   BLOCK_COMMENT("zero_words {");
5489   int i = cnt & 1;  // store any odd word to start
5490   if (i) str(zr, Address(base));
5491 
5492   if (cnt <= SmallArraySize / BytesPerLong) {
5493     for (; i < (int)cnt; i += 2)
5494       stp(zr, zr, Address(base, i * wordSize));
5495   } else {
5496     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
5497     int remainder = cnt % (2 * unroll);
5498     for (; i < remainder; i += 2)
5499       stp(zr, zr, Address(base, i * wordSize));
5500 
5501     Label loop;
5502     Register cnt_reg = rscratch1;
5503     Register loop_base = rscratch2;
5504     cnt = cnt - remainder;
5505     mov(cnt_reg, cnt);
5506     // adjust base and prebias by -2 * wordSize so we can pre-increment
5507     add(loop_base, base, (remainder - 2) * wordSize);
5508     bind(loop);
5509     sub(cnt_reg, cnt_reg, 2 * unroll);
5510     for (i = 1; i < unroll; i++)
5511       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
5512     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
5513     cbnz(cnt_reg, loop);
5514   }
5515   BLOCK_COMMENT("} zero_words");
5516 }
5517 
5518 // Zero blocks of memory by using DC ZVA.
5519 //
5520 // Aligns the base address first sufficiently for DC ZVA, then uses
5521 // DC ZVA repeatedly for every full block.  cnt is the size to be
5522 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5523 // in cnt.
5524 //
5525 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5526 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
5527 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5528   Register tmp = rscratch1;
5529   Register tmp2 = rscratch2;
5530   int zva_length = VM_Version::zva_length();
5531   Label initial_table_end, loop_zva;
5532   Label fini;
5533 
5534   // Base must be 16-byte aligned. If it is not, just return and let the caller handle it.
5535   tst(base, 0x0f);
5536   br(Assembler::NE, fini);
5537   // Align base with ZVA length.
5538   neg(tmp, base);
5539   andr(tmp, tmp, zva_length - 1);
5540 
5541   // tmp: the number of bytes to be filled to align the base with ZVA length.
5542   add(base, base, tmp);
5543   sub(cnt, cnt, tmp, Assembler::ASR, 3);
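  // Duff's-device-style computed branch into the table of stp instructions
  // below: each stp zeroes 16 bytes and occupies 4 bytes of code, so backing up
  // tmp / 4 code bytes from initial_table_end leaves exactly tmp / 16 stores to
  // execute, i.e. the tmp bytes just below the ZVA-aligned base get zeroed.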
5544   adr(tmp2, initial_table_end);
5545   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5546   br(tmp2);
5547 
5548   for (int i = -zva_length + 16; i < 0; i += 16)
5549     stp(zr, zr, Address(base, i));
5550   bind(initial_table_end);
5551 
5552   sub(cnt, cnt, zva_length >> 3);
5553   bind(loop_zva);
5554   dc(Assembler::ZVA, base);
5555   subs(cnt, cnt, zva_length >> 3);
5556   add(base, base, zva_length);
5557   br(Assembler::GE, loop_zva);
5558   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5559   bind(fini);
5560 }
5561 
5562 // base:   Address of a buffer to be filled, 8-byte aligned.
5563 // cnt:    Count in 8-byte units.
5564 // value:  Value to fill with.
5565 // base will point to the end of the buffer after filling.
5566 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5567 {
5568 //  Algorithm:
5569 //
5570 //    scratch1 = cnt & 7;
5571 //    cnt -= scratch1;
5572 //    p += scratch1;
5573 //    switch (scratch1) {
5574 //      do {
5575 //        cnt -= 8;
5576 //          p[-8] = v;
5577 //        case 7:
5578 //          p[-7] = v;
5579 //        case 6:
5580 //          p[-6] = v;
5581 //          // ...
5582 //        case 1:
5583 //          p[-1] = v;
5584 //        case 0:
5585 //          p += 8;
5586 //      } while (cnt);
5587 //    }
5588 
5589   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5590 
5591   Label fini, skip, entry, loop;
5592   const int unroll = 8; // Number of stp instructions we'll unroll
5593 
5594   cbz(cnt, fini);
5595   tbz(base, 3, skip);
5596   str(value, Address(post(base, 8)));
5597   sub(cnt, cnt, 1);
5598   bind(skip);
5599 
5600   andr(rscratch1, cnt, (unroll-1) * 2);
5601   sub(cnt, cnt, rscratch1);
5602   add(base, base, rscratch1, Assembler::LSL, 3);
5603   adr(rscratch2, entry);
5604   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5605   br(rscratch2);
5606 
5607   bind(loop);
5608   add(base, base, unroll * 16);
5609   for (int i = -unroll; i < 0; i++)
5610     stp(value, value, Address(base, i * 16));
5611   bind(entry);
5612   subs(cnt, cnt, unroll * 2);
5613   br(Assembler::GE, loop);
5614 
5615   tbz(cnt, 0, fini);
5616   str(value, Address(post(base, 8)));
5617   bind(fini);
5618 }
5619 
5620 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5621 // java/lang/StringUTF16.compress.
5622 void MacroAssembler::encode_iso_array(Register src, Register dst,
5623                       Register len, Register result,
5624                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5625                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5626 {
5627     Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
5628         NEXT_32_START, NEXT_32_PRFM_START;
5629     Register tmp1 = rscratch1, tmp2 = rscratch2;
5630 
5631       mov(result, len); // Save initial len
5632 
5633 #ifndef BUILTIN_SIM
5634       cmp(len, (u1)8); // handle shortest strings first
5635       br(LT, LOOP_1);
5636       cmp(len, (u1)32);
5637       br(LT, NEXT_8);
5638       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
5639       // to convert chars to bytes
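      // uzp1 keeps the even-numbered bytes (the low, Latin-1 halves of each
      // char) and uzp2 gathers the odd-numbered (high) bytes; if the or-ed
      // high bytes are non-zero, some char is > 0xFF and we fall back to the
      // scalar loop so the result can report where encoding stopped.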
5640       if (SoftwarePrefetchHintDistance >= 0) {
5641         ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5642         subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5643         br(LE, NEXT_32_START);
5644         b(NEXT_32_PRFM_START);
5645         BIND(NEXT_32_PRFM);
5646           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5647         BIND(NEXT_32_PRFM_START);
5648           prfm(Address(src, SoftwarePrefetchHintDistance));
5649           orr(v4, T16B, Vtmp1, Vtmp2);
5650           orr(v5, T16B, Vtmp3, Vtmp4);
5651           uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
5652           uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
5653           uzp2(v5, T16B, v4, v5); // high bytes
5654           umov(tmp2, v5, D, 1);
5655           fmovd(tmp1, v5);
5656           orr(tmp1, tmp1, tmp2);
5657           cbnz(tmp1, LOOP_8);
5658           stpq(Vtmp1, Vtmp3, dst);
5659           sub(len, len, 32);
5660           add(dst, dst, 32);
5661           add(src, src, 64);
5662           subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5663           br(GE, NEXT_32_PRFM);
5664           cmp(len, (u1)32);
5665           br(LT, LOOP_8);
5666         BIND(NEXT_32);
5667           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5668         BIND(NEXT_32_START);
5669       } else {
5670         BIND(NEXT_32);
5671           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5672       }
5673       prfm(Address(src, SoftwarePrefetchHintDistance));
5674       uzp1(v4, T16B, Vtmp1, Vtmp2);
5675       uzp1(v5, T16B, Vtmp3, Vtmp4);
5676       orr(Vtmp1, T16B, Vtmp1, Vtmp2);
5677       orr(Vtmp3, T16B, Vtmp3, Vtmp4);
5678       uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
5679       umov(tmp2, Vtmp1, D, 1);
5680       fmovd(tmp1, Vtmp1);
5681       orr(tmp1, tmp1, tmp2);
5682       cbnz(tmp1, LOOP_8);
5683       stpq(v4, v5, dst);
5684       sub(len, len, 32);
5685       add(dst, dst, 32);
5686       add(src, src, 64);
5687       cmp(len, (u1)32);
5688       br(GE, NEXT_32);
5689       cbz(len, DONE);
5690 
5691     BIND(LOOP_8);
5692       cmp(len, (u1)8);
5693       br(LT, LOOP_1);
5694     BIND(NEXT_8);
5695       ld1(Vtmp1, T8H, src);
5696       uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
5697       uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
5698       fmovd(tmp1, Vtmp3);
5699       cbnz(tmp1, NEXT_1);
5700       strd(Vtmp2, dst);
5701 
5702       sub(len, len, 8);
5703       add(dst, dst, 8);
5704       add(src, src, 16);
5705       cmp(len, (u1)8);
5706       br(GE, NEXT_8);
5707 
5708     BIND(LOOP_1);
5709 #endif
5710     cbz(len, DONE);
5711     BIND(NEXT_1);
5712       ldrh(tmp1, Address(post(src, 2)));
5713       tst(tmp1, 0xff00);
5714       br(NE, SET_RESULT);
5715       strb(tmp1, Address(post(dst, 1)));
5716       subs(len, len, 1);
5717       br(GT, NEXT_1);
5718 
5719     BIND(SET_RESULT);
5720       sub(result, result, len); // Return the index where we stopped;
5721                                 // len == 0 means all characters
5722                                 // were processed
5723     BIND(DONE);
5724 }
5725 
5726 
5727 // Inflate byte[] array to char[].
5728 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5729                                         FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
5730                                         Register tmp4) {
5731   Label big, done, after_init, to_stub;
5732 
5733   assert_different_registers(src, dst, len, tmp4, rscratch1);
5734 
5735   fmovd(vtmp1, zr);
5736   lsrw(tmp4, len, 3);
5737   bind(after_init);
5738   cbnzw(tmp4, big);
5739   // Short string: less than 8 bytes.
5740   {
5741     Label loop, tiny;
5742 
5743     cmpw(len, 4);
5744     br(LT, tiny);
5745     // Use SIMD to do 4 bytes.
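    // zip1 interleaves the four loaded bytes with the zero bytes kept in vtmp1,
    // widening each Latin-1 byte into a little-endian 16-bit char in one step.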
5746     ldrs(vtmp2, post(src, 4));
5747     zip1(vtmp3, T8B, vtmp2, vtmp1);
5748     subw(len, len, 4);
5749     strd(vtmp3, post(dst, 8));
5750 
5751     cbzw(len, done);
5752 
5753     // Do the remaining bytes by steam.
5754     bind(loop);
5755     ldrb(tmp4, post(src, 1));
5756     strh(tmp4, post(dst, 2));
5757     subw(len, len, 1);
5758 
5759     bind(tiny);
5760     cbnz(len, loop);
5761 
5762     b(done);
5763   }
5764 
5765   if (SoftwarePrefetchHintDistance >= 0) {
5766     bind(to_stub);
5767       RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5768       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5769       trampoline_call(stub);
5770       b(after_init);
5771   }
5772 
5773   // Unpack the bytes 8 at a time.
5774   bind(big);
5775   {
5776     Label loop, around, loop_last, loop_start;
5777 
5778     if (SoftwarePrefetchHintDistance >= 0) {
5779       const int large_loop_threshold = (64 + 16)/8;
5780       ldrd(vtmp2, post(src, 8));
5781       andw(len, len, 7);
5782       cmp(tmp4, (u1)large_loop_threshold);
5783       br(GE, to_stub);
5784       b(loop_start);
5785 
5786       bind(loop);
5787       ldrd(vtmp2, post(src, 8));
5788       bind(loop_start);
5789       subs(tmp4, tmp4, 1);
5790       br(EQ, loop_last);
5791       zip1(vtmp2, T16B, vtmp2, vtmp1);
5792       ldrd(vtmp3, post(src, 8));
5793       st1(vtmp2, T8H, post(dst, 16));
5794       subs(tmp4, tmp4, 1);
5795       zip1(vtmp3, T16B, vtmp3, vtmp1);
5796       st1(vtmp3, T8H, post(dst, 16));
5797       br(NE, loop);
5798       b(around);
5799       bind(loop_last);
5800       zip1(vtmp2, T16B, vtmp2, vtmp1);
5801       st1(vtmp2, T8H, post(dst, 16));
5802       bind(around);
5803       cbz(len, done);
5804     } else {
5805       andw(len, len, 7);
5806       bind(loop);
5807       ldrd(vtmp2, post(src, 8));
5808       sub(tmp4, tmp4, 1);
5809       zip1(vtmp3, T16B, vtmp2, vtmp1);
5810       st1(vtmp3, T8H, post(dst, 16));
5811       cbnz(tmp4, loop);
5812     }
5813   }
5814 
5815   // Do the tail of up to 8 bytes.
5816   add(src, src, len);
5817   ldrd(vtmp3, Address(src, -8));
5818   add(dst, dst, len, ext::uxtw, 1);
5819   zip1(vtmp3, T16B, vtmp3, vtmp1);
5820   strq(vtmp3, Address(dst, -16));
5821 
5822   bind(done);
5823 }
5824 
5825 // Compress char[] array to byte[].
5826 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5827                                          FloatRegister tmp1Reg, FloatRegister tmp2Reg,
5828                                          FloatRegister tmp3Reg, FloatRegister tmp4Reg,
5829                                          Register result) {
5830   encode_iso_array(src, dst, len, result,
5831                    tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
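  // encode_iso_array stops early and leaves len != 0 if it meets a char that
  // does not fit in one byte; in that case the compression failed, so return 0
  // rather than the number of chars processed.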
5832   cmp(len, zr);
5833   csel(result, result, zr, EQ);
5834 }
5835 
5836 // get_thread() can be called anywhere inside generated code so we
5837 // need to save whatever non-callee-saved context might get clobbered
5838 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5839 // the call setup code.
5840 //
5841 // aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5842 //
5843 void MacroAssembler::get_thread(Register dst) {
5844   RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
5845   push(saved_regs, sp);
5846 
5847   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5848   blrt(lr, 1, 0, 1);
5849   if (dst != c_rarg0) {
5850     mov(dst, c_rarg0);
5851   }
5852 
5853   pop(saved_regs, sp);
5854 }