1 /*
   2  * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include <sys/types.h>
  27 
  28 #include "precompiled.hpp"
  29 #include "jvm.h"
  30 #include "asm/assembler.hpp"
  31 #include "asm/assembler.inline.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/cardTable.hpp"
  34 #include "gc/shared/barrierSetAssembler.hpp"
  35 #include "gc/shared/cardTableBarrierSet.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "compiler/disassembler.hpp"
  38 #include "memory/resourceArea.hpp"
  39 #include "memory/universe.hpp"
  40 #include "nativeInst_aarch64.hpp"
  41 #include "oops/accessDecorators.hpp"
  42 #include "oops/compressedOops.inline.hpp"
  43 #include "oops/klass.inline.hpp"
  44 #include "runtime/biasedLocking.hpp"
  45 #include "runtime/icache.hpp"
  46 #include "runtime/interfaceSupport.inline.hpp"
  47 #include "runtime/jniHandles.inline.hpp"
  48 #include "runtime/sharedRuntime.hpp"
  49 #include "runtime/thread.hpp"
  50 #ifdef COMPILER1
  51 #include "c1/c1_LIRAssembler.hpp"
  52 #endif
  53 #ifdef COMPILER2
  54 #include "oops/oop.hpp"
  55 #include "opto/compile.hpp"
  56 #include "opto/intrinsicnode.hpp"
  57 #include "opto/node.hpp"
  58 #endif
  59 
  60 #ifdef PRODUCT
  61 #define BLOCK_COMMENT(str) /* nothing */
  62 #define STOP(error) stop(error)
  63 #else
  64 #define BLOCK_COMMENT(str) block_comment(str)
  65 #define STOP(error) block_comment(error); stop(error)
  66 #endif
  67 
  68 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  69 
  70 // Patch any kind of instruction; there may be several instructions.
  71 // Return the total length (in bytes) of the instructions.
  72 int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  73   int instructions = 1;
  74   assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  75   long offset = (target - branch) >> 2;
  76   unsigned insn = *(unsigned*)branch;
  77   if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
  78     // Load register (literal)
  79     Instruction_aarch64::spatch(branch, 23, 5, offset);
  80   } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
  81     // Unconditional branch (immediate)
  82     Instruction_aarch64::spatch(branch, 25, 0, offset);
  83   } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
  84     // Conditional branch (immediate)
  85     Instruction_aarch64::spatch(branch, 23, 5, offset);
  86   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
  87     // Compare & branch (immediate)
  88     Instruction_aarch64::spatch(branch, 23, 5, offset);
  89   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
  90     // Test & branch (immediate)
  91     Instruction_aarch64::spatch(branch, 18, 5, offset);
  92   } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
  93     // PC-rel. addressing
  94     offset = target-branch;
  95     int shift = Instruction_aarch64::extract(insn, 31, 31);
  96     if (shift) {
  97       u_int64_t dest = (u_int64_t)target;
  98       uint64_t pc_page = (uint64_t)branch >> 12;
  99       uint64_t adr_page = (uint64_t)target >> 12;
 100       unsigned offset_lo = dest & 0xfff;
 101       offset = adr_page - pc_page;
 102 
 103       // We handle 4 types of PC relative addressing
 104       //   1 - adrp    Rx, target_page
 105       //       ldr/str Ry, [Rx, #offset_in_page]
 106       //   2 - adrp    Rx, target_page
 107       //       add     Ry, Rx, #offset_in_page
 108       //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 109       //       movk    Rx, #imm16<<32
 110       //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 111       // In the first 3 cases we must check that Rx is the same in the adrp and the
 112       // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
 113       // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
 114       // to be followed by a random unrelated ldr/str, add or movk instruction.
 115       //
 116       unsigned insn2 = ((unsigned*)branch)[1];
 117       if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
 118                 Instruction_aarch64::extract(insn, 4, 0) ==
 119                         Instruction_aarch64::extract(insn2, 9, 5)) {
 120         // Load/store register (unsigned immediate)
 121         unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
 122         Instruction_aarch64::patch(branch + sizeof (unsigned),
 123                                     21, 10, offset_lo >> size);
 124         guarantee(((dest >> size) << size) == dest, "misaligned target");
 125         instructions = 2;
 126       } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
 127                 Instruction_aarch64::extract(insn, 4, 0) ==
 128                         Instruction_aarch64::extract(insn2, 4, 0)) {
 129         // add (immediate)
 130         Instruction_aarch64::patch(branch + sizeof (unsigned),
 131                                    21, 10, offset_lo);
 132         instructions = 2;
 133       } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
 134                    Instruction_aarch64::extract(insn, 4, 0) ==
 135                      Instruction_aarch64::extract(insn2, 4, 0)) {
 136         // movk #imm16<<32
 137         Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
 138         long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
 139         long pc_page = (long)branch >> 12;
 140         long adr_page = (long)dest >> 12;
 141         offset = adr_page - pc_page;
 142         instructions = 2;
 143       }
 144     }
 145     int offset_lo = offset & 3;
 146     offset >>= 2;
 147     Instruction_aarch64::spatch(branch, 23, 5, offset);
 148     Instruction_aarch64::patch(branch, 30, 29, offset_lo);
 149   } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
 150     u_int64_t dest = (u_int64_t)target;
 151     // Move wide constant
 152     assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
 153     assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
 154     Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
 155     Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
 156     Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
 157     assert(target_addr_for_insn(branch) == target, "should be");
 158     instructions = 3;
 159   } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
 160              Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
 161     // nothing to do
 162     assert(target == 0, "did not expect to relocate target for polling page load");
 163   } else {
 164     ShouldNotReachHere();
 165   }
 166   return instructions * NativeInstruction::instruction_size;
 167 }
 168 
 169 int MacroAssembler::patch_oop(address insn_addr, address o) {
 170   int instructions;
 171   unsigned insn = *(unsigned*)insn_addr;
 172   assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
 173 
 174   // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
 175   // narrow OOPs by setting the upper 16 bits in the first
 176   // instruction.
 177   if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
 178     // Move narrow OOP
 179     narrowOop n = CompressedOops::encode((oop)o);
 180     Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
 181     Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
 182     instructions = 2;
 183   } else {
 184     // Move wide OOP
 185     assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
 186     uintptr_t dest = (uintptr_t)o;
 187     Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
 188     Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
 189     Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
 190     instructions = 3;
 191   }
 192   return instructions * NativeInstruction::instruction_size;
 193 }
 194 
 195 int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
 196   // Metatdata pointers are either narrow (32 bits) or wide (48 bits).
 197   // We encode narrow ones by setting the upper 16 bits in the first
 198   // instruction.
 199   NativeInstruction *insn = nativeInstruction_at(insn_addr);
 200   assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
 201          nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
 202 
 203   Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
 204   Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
 205   return 2 * NativeInstruction::instruction_size;
 206 }
 207 
 208 address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
 209   long offset = 0;
 210   if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
 211     // Load register (literal)
 212     offset = Instruction_aarch64::sextract(insn, 23, 5);
 213     return address(((uint64_t)insn_addr + (offset << 2)));
 214   } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
 215     // Unconditional branch (immediate)
 216     offset = Instruction_aarch64::sextract(insn, 25, 0);
 217   } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
 218     // Conditional branch (immediate)
 219     offset = Instruction_aarch64::sextract(insn, 23, 5);
 220   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
 221     // Compare & branch (immediate)
 222     offset = Instruction_aarch64::sextract(insn, 23, 5);
 223    } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
 224     // Test & branch (immediate)
 225     offset = Instruction_aarch64::sextract(insn, 18, 5);
 226   } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
 227     // PC-rel. addressing
 228     offset = Instruction_aarch64::extract(insn, 30, 29);
 229     offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
 230     int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
 231     if (shift) {
 232       offset <<= shift;
 233       uint64_t target_page = ((uint64_t)insn_addr) + offset;
 234       target_page &= ((uint64_t)-1) << shift;
 235       // Return the target address for the following sequences
 236       //   1 - adrp    Rx, target_page
 237       //       ldr/str Ry, [Rx, #offset_in_page]
 238       //   2 - adrp    Rx, target_page
 239       //       add     Ry, Rx, #offset_in_page
 240       //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 241       //       movk    Rx, #imm12<<32
 242       //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 243       //
 244       // In the first two cases  we check that the register is the same and
 245       // return the target_page + the offset within the page.
 246       // Otherwise we assume it is a page aligned relocation and return
 247       // the target page only.
 248       //
 249       unsigned insn2 = ((unsigned*)insn_addr)[1];
 250       if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
 251                 Instruction_aarch64::extract(insn, 4, 0) ==
 252                         Instruction_aarch64::extract(insn2, 9, 5)) {
 253         // Load/store register (unsigned immediate)
 254         unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
 255         unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
 256         return address(target_page + (byte_offset << size));
 257       } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
 258                 Instruction_aarch64::extract(insn, 4, 0) ==
 259                         Instruction_aarch64::extract(insn2, 4, 0)) {
 260         // add (immediate)
 261         unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
 262         return address(target_page + byte_offset);
 263       } else {
 264         if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110  &&
 265                Instruction_aarch64::extract(insn, 4, 0) ==
 266                  Instruction_aarch64::extract(insn2, 4, 0)) {
 267           target_page = (target_page & 0xffffffff) |
 268                          ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
 269         }
 270         return (address)target_page;
 271       }
 272     } else {
 273       ShouldNotReachHere();
 274     }
 275   } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
 276     u_int32_t *insns = (u_int32_t *)insn_addr;
 277     // Move wide constant: movz, movk, movk.  See movptr().
 278     assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
 279     assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
 280     return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
 281                    + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
 282                    + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
 283   } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
 284              Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
 285     return 0;
 286   } else {
 287     ShouldNotReachHere();
 288   }
 289   return address(((uint64_t)insn_addr + (offset << 2)));
 290 }
 291 
 292 void MacroAssembler::safepoint_poll(Label& slow_path) {
 293   if (SafepointMechanism::uses_thread_local_poll()) {
 294     ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
 295     tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
 296   } else {
 297     unsigned long offset;
 298     adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
 299     ldrw(rscratch1, Address(rscratch1, offset));
 300     assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
 301     cbnz(rscratch1, slow_path);
 302   }
 303 }
 304 
 305 // Just like safepoint_poll, but use an acquiring load for thread-
 306 // local polling.
 307 //
 308 // We need an acquire here to ensure that any subsequent load of the
 309 // global SafepointSynchronize::_state flag is ordered after this load
 310 // of the local Thread::_polling page.  We don't want this poll to
 311 // return false (i.e. not safepointing) and a later poll of the global
 312 // SafepointSynchronize::_state spuriously to return true.
 313 //
 314 // This is to avoid a race when we're in a native->Java transition
 315 // racing the code which wakes up from a safepoint.
 316 //
 317 void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
 318   if (SafepointMechanism::uses_thread_local_poll()) {
 319     lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
 320     ldar(rscratch1, rscratch1);
 321     tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
 322   } else {
 323     safepoint_poll(slow_path);
 324   }
 325 }
 326 
 327 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 328   // we must set sp to zero to clear frame
 329   str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));
 330 
 331   // must clear fp, so that compiled frames are not confused; it is
 332   // possible that we need it only for debugging
 333   if (clear_fp) {
 334     str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
 335   }
 336 
 337   // Always clear the pc because it could have been set by make_walkable()
 338   str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
 339 }
 340 
 341 // Calls to C land
 342 //
 343 // When entering C land, the rfp, & resp of the last Java frame have to be recorded
 344 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
 345 // has to be reset to 0. This is required to allow proper stack traversal.
 346 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 347                                          Register last_java_fp,
 348                                          Register last_java_pc,
 349                                          Register scratch) {
 350 
 351   if (last_java_pc->is_valid()) {
 352       str(last_java_pc, Address(rthread,
 353                                 JavaThread::frame_anchor_offset()
 354                                 + JavaFrameAnchor::last_Java_pc_offset()));
 355     }
 356 
 357   // determine last_java_sp register
 358   if (last_java_sp == sp) {
 359     mov(scratch, sp);
 360     last_java_sp = scratch;
 361   } else if (!last_java_sp->is_valid()) {
 362     last_java_sp = esp;
 363   }
 364 
 365   str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));
 366 
 367   // last_java_fp is optional
 368   if (last_java_fp->is_valid()) {
 369     str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
 370   }
 371 }
 372 
 373 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 374                                          Register last_java_fp,
 375                                          address  last_java_pc,
 376                                          Register scratch) {
 377   assert(last_java_pc != NULL, "must provide a valid PC");
 378 
 379   adr(scratch, last_java_pc);
 380   str(scratch, Address(rthread,
 381                        JavaThread::frame_anchor_offset()
 382                        + JavaFrameAnchor::last_Java_pc_offset()));
 383 
 384   set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
 385 }
 386 
 387 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 388                                          Register last_java_fp,
 389                                          Label &L,
 390                                          Register scratch) {
 391   if (L.is_bound()) {
 392     set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
 393   } else {
 394     InstructionMark im(this);
 395     L.add_patch_at(code(), locator());
 396     set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
 397   }
 398 }
 399 
 400 void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
 401   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
 402   assert(CodeCache::find_blob(entry.target()) != NULL,
 403          "destination of far call not found in code cache");
 404   if (far_branches()) {
 405     unsigned long offset;
 406     // We can use ADRP here because we know that the total size of
 407     // the code cache cannot exceed 2Gb.
 408     adrp(tmp, entry, offset);
 409     add(tmp, tmp, offset);
 410     if (cbuf) cbuf->set_insts_mark();
 411     blr(tmp);
 412   } else {
 413     if (cbuf) cbuf->set_insts_mark();
 414     bl(entry);
 415   }
 416 }
 417 
 418 void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
 419   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
 420   assert(CodeCache::find_blob(entry.target()) != NULL,
 421          "destination of far call not found in code cache");
 422   if (far_branches()) {
 423     unsigned long offset;
 424     // We can use ADRP here because we know that the total size of
 425     // the code cache cannot exceed 2Gb.
 426     adrp(tmp, entry, offset);
 427     add(tmp, tmp, offset);
 428     if (cbuf) cbuf->set_insts_mark();
 429     br(tmp);
 430   } else {
 431     if (cbuf) cbuf->set_insts_mark();
 432     b(entry);
 433   }
 434 }
 435 
 436 void MacroAssembler::reserved_stack_check() {
 437     // testing if reserved zone needs to be enabled
 438     Label no_reserved_zone_enabling;
 439 
 440     ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
 441     cmp(sp, rscratch1);
 442     br(Assembler::LO, no_reserved_zone_enabling);
 443 
 444     enter();   // LR and FP are live.
 445     lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
 446     mov(c_rarg0, rthread);
 447     blr(rscratch1);
 448     leave();
 449 
 450     // We have already removed our own frame.
 451     // throw_delayed_StackOverflowError will think that it's been
 452     // called by our caller.
 453     lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
 454     br(rscratch1);
 455     should_not_reach_here();
 456 
 457     bind(no_reserved_zone_enabling);
 458 }
 459 
 460 int MacroAssembler::biased_locking_enter(Register lock_reg,
 461                                          Register obj_reg,
 462                                          Register swap_reg,
 463                                          Register tmp_reg,
 464                                          bool swap_reg_contains_mark,
 465                                          Label& done,
 466                                          Label* slow_case,
 467                                          BiasedLockingCounters* counters) {
 468   assert(UseBiasedLocking, "why call this otherwise?");
 469   assert_different_registers(lock_reg, obj_reg, swap_reg);
 470 
 471   if (PrintBiasedLockingStatistics && counters == NULL)
 472     counters = BiasedLocking::counters();
 473 
 474   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
 475   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
 476   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
 477   Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
 478   Address saved_mark_addr(lock_reg, 0);
 479 
 480   // Biased locking
 481   // See whether the lock is currently biased toward our thread and
 482   // whether the epoch is still valid
 483   // Note that the runtime guarantees sufficient alignment of JavaThread
 484   // pointers to allow age to be placed into low bits
 485   // First check to see whether biasing is even enabled for this object
 486   Label cas_label;
 487   int null_check_offset = -1;
 488   if (!swap_reg_contains_mark) {
 489     null_check_offset = offset();
 490     ldr(swap_reg, mark_addr);
 491   }
 492   andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
 493   cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern);
 494   br(Assembler::NE, cas_label);
 495   // The bias pattern is present in the object's header. Need to check
 496   // whether the bias owner and the epoch are both still current.
 497   load_prototype_header(tmp_reg, obj_reg);
 498   orr(tmp_reg, tmp_reg, rthread);
 499   eor(tmp_reg, swap_reg, tmp_reg);
 500   andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
 501   if (counters != NULL) {
 502     Label around;
 503     cbnz(tmp_reg, around);
 504     atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
 505     b(done);
 506     bind(around);
 507   } else {
 508     cbz(tmp_reg, done);
 509   }
 510 
 511   Label try_revoke_bias;
 512   Label try_rebias;
 513 
 514   // At this point we know that the header has the bias pattern and
 515   // that we are not the bias owner in the current epoch. We need to
 516   // figure out more details about the state of the header in order to
 517   // know what operations can be legally performed on the object's
 518   // header.
 519 
 520   // If the low three bits in the xor result aren't clear, that means
 521   // the prototype header is no longer biased and we have to revoke
 522   // the bias on this object.
 523   andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
 524   cbnz(rscratch1, try_revoke_bias);
 525 
 526   // Biasing is still enabled for this data type. See whether the
 527   // epoch of the current bias is still valid, meaning that the epoch
 528   // bits of the mark word are equal to the epoch bits of the
 529   // prototype header. (Note that the prototype header's epoch bits
 530   // only change at a safepoint.) If not, attempt to rebias the object
 531   // toward the current thread. Note that we must be absolutely sure
 532   // that the current epoch is invalid in order to do this because
 533   // otherwise the manipulations it performs on the mark word are
 534   // illegal.
 535   andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
 536   cbnz(rscratch1, try_rebias);
 537 
 538   // The epoch of the current bias is still valid but we know nothing
 539   // about the owner; it might be set or it might be clear. Try to
 540   // acquire the bias of the object using an atomic operation. If this
 541   // fails we will go in to the runtime to revoke the object's bias.
 542   // Note that we first construct the presumed unbiased header so we
 543   // don't accidentally blow away another thread's valid bias.
 544   {
 545     Label here;
 546     mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
 547     andr(swap_reg, swap_reg, rscratch1);
 548     orr(tmp_reg, swap_reg, rthread);
 549     cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
 550     // If the biasing toward our thread failed, this means that
 551     // another thread succeeded in biasing it toward itself and we
 552     // need to revoke that bias. The revocation will occur in the
 553     // interpreter runtime in the slow case.
 554     bind(here);
 555     if (counters != NULL) {
 556       atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
 557                   tmp_reg, rscratch1, rscratch2);
 558     }
 559   }
 560   b(done);
 561 
 562   bind(try_rebias);
 563   // At this point we know the epoch has expired, meaning that the
 564   // current "bias owner", if any, is actually invalid. Under these
 565   // circumstances _only_, we are allowed to use the current header's
 566   // value as the comparison value when doing the cas to acquire the
 567   // bias in the current epoch. In other words, we allow transfer of
 568   // the bias from one thread to another directly in this situation.
 569   //
 570   // FIXME: due to a lack of registers we currently blow away the age
 571   // bits in this situation. Should attempt to preserve them.
 572   {
 573     Label here;
 574     load_prototype_header(tmp_reg, obj_reg);
 575     orr(tmp_reg, rthread, tmp_reg);
 576     cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
 577     // If the biasing toward our thread failed, then another thread
 578     // succeeded in biasing it toward itself and we need to revoke that
 579     // bias. The revocation will occur in the runtime in the slow case.
 580     bind(here);
 581     if (counters != NULL) {
 582       atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
 583                   tmp_reg, rscratch1, rscratch2);
 584     }
 585   }
 586   b(done);
 587 
 588   bind(try_revoke_bias);
 589   // The prototype mark in the klass doesn't have the bias bit set any
 590   // more, indicating that objects of this data type are not supposed
 591   // to be biased any more. We are going to try to reset the mark of
 592   // this object to the prototype value and fall through to the
 593   // CAS-based locking scheme. Note that if our CAS fails, it means
 594   // that another thread raced us for the privilege of revoking the
 595   // bias of this particular object, so it's okay to continue in the
 596   // normal locking code.
 597   //
 598   // FIXME: due to a lack of registers we currently blow away the age
 599   // bits in this situation. Should attempt to preserve them.
 600   {
 601     Label here, nope;
 602     load_prototype_header(tmp_reg, obj_reg);
 603     cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
 604     bind(here);
 605 
 606     // Fall through to the normal CAS-based lock, because no matter what
 607     // the result of the above CAS, some thread must have succeeded in
 608     // removing the bias bit from the object's header.
 609     if (counters != NULL) {
 610       atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
 611                   rscratch1, rscratch2);
 612     }
 613     bind(nope);
 614   }
 615 
 616   bind(cas_label);
 617 
 618   return null_check_offset;
 619 }
 620 
 621 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
 622   assert(UseBiasedLocking, "why call this otherwise?");
 623 
 624   // Check for biased locking unlock case, which is a no-op
 625   // Note: we do not have to check the thread ID for two reasons.
 626   // First, the interpreter checks for IllegalMonitorStateException at
 627   // a higher level. Second, if the bias was revoked while we held the
 628   // lock, the object could not be rebiased toward another thread, so
 629   // the bias bit would be clear.
 630   ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
 631   andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
 632   cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern);
 633   br(Assembler::EQ, done);
 634 }
 635 
 636 static void pass_arg0(MacroAssembler* masm, Register arg) {
 637   if (c_rarg0 != arg ) {
 638     masm->mov(c_rarg0, arg);
 639   }
 640 }
 641 
 642 static void pass_arg1(MacroAssembler* masm, Register arg) {
 643   if (c_rarg1 != arg ) {
 644     masm->mov(c_rarg1, arg);
 645   }
 646 }
 647 
 648 static void pass_arg2(MacroAssembler* masm, Register arg) {
 649   if (c_rarg2 != arg ) {
 650     masm->mov(c_rarg2, arg);
 651   }
 652 }
 653 
 654 static void pass_arg3(MacroAssembler* masm, Register arg) {
 655   if (c_rarg3 != arg ) {
 656     masm->mov(c_rarg3, arg);
 657   }
 658 }
 659 
 660 void MacroAssembler::call_VM_base(Register oop_result,
 661                                   Register java_thread,
 662                                   Register last_java_sp,
 663                                   address  entry_point,
 664                                   int      number_of_arguments,
 665                                   bool     check_exceptions) {
 666    // determine java_thread register
 667   if (!java_thread->is_valid()) {
 668     java_thread = rthread;
 669   }
 670 
 671   // determine last_java_sp register
 672   if (!last_java_sp->is_valid()) {
 673     last_java_sp = esp;
 674   }
 675 
 676   // debugging support
 677   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
 678   assert(java_thread == rthread, "unexpected register");
 679 #ifdef ASSERT
 680   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
 681   // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
 682 #endif // ASSERT
 683 
 684   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
 685   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
 686 
 687   // push java thread (becomes first argument of C function)
 688 
 689   mov(c_rarg0, java_thread);
 690 
 691   // set last Java frame before call
 692   assert(last_java_sp != rfp, "can't use rfp");
 693 
 694   Label l;
 695   set_last_Java_frame(last_java_sp, rfp, l, rscratch1);
 696 
 697   // do the call, remove parameters
 698   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
 699 
 700   // reset last Java frame
 701   // Only interpreter should have to clear fp
 702   reset_last_Java_frame(true);
 703 
 704    // C++ interp handles this in the interpreter
 705   check_and_handle_popframe(java_thread);
 706   check_and_handle_earlyret(java_thread);
 707 
 708   if (check_exceptions) {
 709     // check for pending exceptions (java_thread is set upon return)
 710     ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
 711     Label ok;
 712     cbz(rscratch1, ok);
 713     lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
 714     br(rscratch1);
 715     bind(ok);
 716   }
 717 
 718   // get oop result if there is one and reset the value in the thread
 719   if (oop_result->is_valid()) {
 720     get_vm_result(oop_result, java_thread);
 721   }
 722 }
 723 
 724 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
 725   call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
 726 }
 727 
 728 // Maybe emit a call via a trampoline.  If the code cache is small
 729 // trampolines won't be emitted.
 730 
 731 address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
 732   assert(JavaThread::current()->is_Compiler_thread(), "just checking");
 733   assert(entry.rspec().type() == relocInfo::runtime_call_type
 734          || entry.rspec().type() == relocInfo::opt_virtual_call_type
 735          || entry.rspec().type() == relocInfo::static_call_type
 736          || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
 737 
 738   // We need a trampoline if branches are far.
 739   if (far_branches()) {
 740     bool in_scratch_emit_size = false;
 741 #ifdef COMPILER2
 742     // We don't want to emit a trampoline if C2 is generating dummy
 743     // code during its branch shortening phase.
 744     CompileTask* task = ciEnv::current()->task();
 745     in_scratch_emit_size =
 746       (task != NULL && is_c2_compile(task->comp_level()) &&
 747        Compile::current()->in_scratch_emit_size());
 748 #endif
 749     if (!in_scratch_emit_size) {
 750       address stub = emit_trampoline_stub(offset(), entry.target());
 751       if (stub == NULL) {
 752         return NULL; // CodeCache is full
 753       }
 754     }
 755   }
 756 
 757   if (cbuf) cbuf->set_insts_mark();
 758   relocate(entry.rspec());
 759   if (!far_branches()) {
 760     bl(entry.target());
 761   } else {
 762     bl(pc());
 763   }
 764   // just need to return a non-null address
 765   return pc();
 766 }
 767 
 768 
 769 // Emit a trampoline stub for a call to a target which is too far away.
 770 //
 771 // code sequences:
 772 //
 773 // call-site:
 774 //   branch-and-link to <destination> or <trampoline stub>
 775 //
 776 // Related trampoline stub for this call site in the stub section:
 777 //   load the call target from the constant pool
 778 //   branch (LR still points to the call site above)
 779 
 780 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
 781                                              address dest) {
 782   // Max stub size: alignment nop, TrampolineStub.
 783   address stub = start_a_stub(NativeInstruction::instruction_size
 784                    + NativeCallTrampolineStub::instruction_size);
 785   if (stub == NULL) {
 786     return NULL;  // CodeBuffer::expand failed
 787   }
 788 
 789   // Create a trampoline stub relocation which relates this trampoline stub
 790   // with the call instruction at insts_call_instruction_offset in the
 791   // instructions code-section.
 792   align(wordSize);
 793   relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
 794                                             + insts_call_instruction_offset));
 795   const int stub_start_offset = offset();
 796 
 797   // Now, create the trampoline stub's code:
 798   // - load the call
 799   // - call
 800   Label target;
 801   ldr(rscratch1, target);
 802   br(rscratch1);
 803   bind(target);
 804   assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
 805          "should be");
 806   emit_int64((int64_t)dest);
 807 
 808   const address stub_start_addr = addr_at(stub_start_offset);
 809 
 810   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
 811 
 812   end_a_stub();
 813   return stub_start_addr;
 814 }
 815 
 816 void MacroAssembler::emit_static_call_stub() {
 817   // CompiledDirectStaticCall::set_to_interpreted knows the
 818   // exact layout of this stub.
 819 
 820   isb();
 821   mov_metadata(rmethod, (Metadata*)NULL);
 822 
 823   // Jump to the entry point of the i2c stub.
 824   movptr(rscratch1, 0);
 825   br(rscratch1);
 826 }
 827 
 828 void MacroAssembler::c2bool(Register x) {
 829   // implements x == 0 ? 0 : 1
 830   // note: must only look at least-significant byte of x
 831   //       since C-style booleans are stored in one byte
 832   //       only! (was bug)
 833   tst(x, 0xff);
 834   cset(x, Assembler::NE);
 835 }
 836 
 837 address MacroAssembler::ic_call(address entry, jint method_index) {
 838   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
 839   // address const_ptr = long_constant((jlong)Universe::non_oop_word());
 840   // unsigned long offset;
 841   // ldr_constant(rscratch2, const_ptr);
 842   movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
 843   return trampoline_call(Address(entry, rh));
 844 }
 845 
 846 // Implementation of call_VM versions
 847 
 848 void MacroAssembler::call_VM(Register oop_result,
 849                              address entry_point,
 850                              bool check_exceptions) {
 851   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
 852 }
 853 
 854 void MacroAssembler::call_VM(Register oop_result,
 855                              address entry_point,
 856                              Register arg_1,
 857                              bool check_exceptions) {
 858   pass_arg1(this, arg_1);
 859   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
 860 }
 861 
 862 void MacroAssembler::call_VM(Register oop_result,
 863                              address entry_point,
 864                              Register arg_1,
 865                              Register arg_2,
 866                              bool check_exceptions) {
 867   assert(arg_1 != c_rarg2, "smashed arg");
 868   pass_arg2(this, arg_2);
 869   pass_arg1(this, arg_1);
 870   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
 871 }
 872 
 873 void MacroAssembler::call_VM(Register oop_result,
 874                              address entry_point,
 875                              Register arg_1,
 876                              Register arg_2,
 877                              Register arg_3,
 878                              bool check_exceptions) {
 879   assert(arg_1 != c_rarg3, "smashed arg");
 880   assert(arg_2 != c_rarg3, "smashed arg");
 881   pass_arg3(this, arg_3);
 882 
 883   assert(arg_1 != c_rarg2, "smashed arg");
 884   pass_arg2(this, arg_2);
 885 
 886   pass_arg1(this, arg_1);
 887   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
 888 }
 889 
 890 void MacroAssembler::call_VM(Register oop_result,
 891                              Register last_java_sp,
 892                              address entry_point,
 893                              int number_of_arguments,
 894                              bool check_exceptions) {
 895   call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
 896 }
 897 
 898 void MacroAssembler::call_VM(Register oop_result,
 899                              Register last_java_sp,
 900                              address entry_point,
 901                              Register arg_1,
 902                              bool check_exceptions) {
 903   pass_arg1(this, arg_1);
 904   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
 905 }
 906 
 907 void MacroAssembler::call_VM(Register oop_result,
 908                              Register last_java_sp,
 909                              address entry_point,
 910                              Register arg_1,
 911                              Register arg_2,
 912                              bool check_exceptions) {
 913 
 914   assert(arg_1 != c_rarg2, "smashed arg");
 915   pass_arg2(this, arg_2);
 916   pass_arg1(this, arg_1);
 917   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
 918 }
 919 
 920 void MacroAssembler::call_VM(Register oop_result,
 921                              Register last_java_sp,
 922                              address entry_point,
 923                              Register arg_1,
 924                              Register arg_2,
 925                              Register arg_3,
 926                              bool check_exceptions) {
 927   assert(arg_1 != c_rarg3, "smashed arg");
 928   assert(arg_2 != c_rarg3, "smashed arg");
 929   pass_arg3(this, arg_3);
 930   assert(arg_1 != c_rarg2, "smashed arg");
 931   pass_arg2(this, arg_2);
 932   pass_arg1(this, arg_1);
 933   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
 934 }
 935 
 936 
 937 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
 938   ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
 939   str(zr, Address(java_thread, JavaThread::vm_result_offset()));
 940   verify_oop(oop_result, "broken oop in call_VM_base");
 941 }
 942 
 943 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
 944   ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
 945   str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
 946 }
 947 
 948 void MacroAssembler::align(int modulus) {
 949   while (offset() % modulus != 0) nop();
 950 }
 951 
 952 // these are no-ops overridden by InterpreterMacroAssembler
 953 
 954 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }
 955 
 956 void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
 957 
 958 
 959 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 960                                                       Register tmp,
 961                                                       int offset) {
 962   intptr_t value = *delayed_value_addr;
 963   if (value != 0)
 964     return RegisterOrConstant(value + offset);
 965 
 966   // load indirectly to solve generation ordering problem
 967   ldr(tmp, ExternalAddress((address) delayed_value_addr));
 968 
 969   if (offset != 0)
 970     add(tmp, tmp, offset);
 971 
 972   return RegisterOrConstant(tmp);
 973 }
 974 
 975 
 976 void MacroAssembler:: notify(int type) {
 977   if (type == bytecode_start) {
 978     // set_last_Java_frame(esp, rfp, (address)NULL);
 979     Assembler:: notify(type);
 980     // reset_last_Java_frame(true);
 981   }
 982   else
 983     Assembler:: notify(type);
 984 }
 985 
 986 // Look up the method for a megamorphic invokeinterface call.
 987 // The target method is determined by <intf_klass, itable_index>.
 988 // The receiver klass is in recv_klass.
 989 // On success, the result will be in method_result, and execution falls through.
 990 // On failure, execution transfers to the given label.
 991 void MacroAssembler::lookup_interface_method(Register recv_klass,
 992                                              Register intf_klass,
 993                                              RegisterOrConstant itable_index,
 994                                              Register method_result,
 995                                              Register scan_temp,
 996                                              Label& L_no_such_interface,
 997                          bool return_method) {
 998   assert_different_registers(recv_klass, intf_klass, scan_temp);
 999   assert_different_registers(method_result, intf_klass, scan_temp);
1000   assert(recv_klass != method_result || !return_method,
1001      "recv_klass can be destroyed when method isn't needed");
1002   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
1003          "caller must use same register for non-constant itable index as for method");
1004 
1005   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
1006   int vtable_base = in_bytes(Klass::vtable_start_offset());
1007   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1008   int scan_step   = itableOffsetEntry::size() * wordSize;
1009   int vte_size    = vtableEntry::size_in_bytes();
1010   assert(vte_size == wordSize, "else adjust times_vte_scale");
1011 
1012   ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
1013 
1014   // %%% Could store the aligned, prescaled offset in the klassoop.
1015   // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
1016   lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
1017   add(scan_temp, scan_temp, vtable_base);
1018 
1019   if (return_method) {
1020     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1021     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1022     // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
1023     lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
1024     if (itentry_off)
1025       add(recv_klass, recv_klass, itentry_off);
1026   }
1027 
1028   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1029   //   if (scan->interface() == intf) {
1030   //     result = (klass + scan->offset() + itable_index);
1031   //   }
1032   // }
1033   Label search, found_method;
1034 
1035   for (int peel = 1; peel >= 0; peel--) {
1036     ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
1037     cmp(intf_klass, method_result);
1038 
1039     if (peel) {
1040       br(Assembler::EQ, found_method);
1041     } else {
1042       br(Assembler::NE, search);
1043       // (invert the test to fall through to found_method...)
1044     }
1045 
1046     if (!peel)  break;
1047 
1048     bind(search);
1049 
1050     // Check that the previous entry is non-null.  A null entry means that
1051     // the receiver class doesn't implement the interface, and wasn't the
1052     // same as when the caller was compiled.
1053     cbz(method_result, L_no_such_interface);
1054     add(scan_temp, scan_temp, scan_step);
1055   }
1056 
1057   bind(found_method);
1058 
1059   // Got a hit.
1060   if (return_method) {
1061     ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
1062     ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
1063   }
1064 }
1065 
1066 // virtual method calling
1067 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1068                                            RegisterOrConstant vtable_index,
1069                                            Register method_result) {
1070   const int base = in_bytes(Klass::vtable_start_offset());
1071   assert(vtableEntry::size() * wordSize == 8,
1072          "adjust the scaling in the code below");
1073   int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();
1074 
1075   if (vtable_index.is_register()) {
1076     lea(method_result, Address(recv_klass,
1077                                vtable_index.as_register(),
1078                                Address::lsl(LogBytesPerWord)));
1079     ldr(method_result, Address(method_result, vtable_offset_in_bytes));
1080   } else {
1081     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
1082     ldr(method_result,
1083         form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
1084   }
1085 }
1086 
1087 void MacroAssembler::check_klass_subtype(Register sub_klass,
1088                            Register super_klass,
1089                            Register temp_reg,
1090                            Label& L_success) {
1091   Label L_failure;
1092   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
1093   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
1094   bind(L_failure);
1095 }
1096 
1097 
1098 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1099                                                    Register super_klass,
1100                                                    Register temp_reg,
1101                                                    Label* L_success,
1102                                                    Label* L_failure,
1103                                                    Label* L_slow_path,
1104                                         RegisterOrConstant super_check_offset) {
1105   assert_different_registers(sub_klass, super_klass, temp_reg);
1106   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1107   if (super_check_offset.is_register()) {
1108     assert_different_registers(sub_klass, super_klass,
1109                                super_check_offset.as_register());
1110   } else if (must_load_sco) {
1111     assert(temp_reg != noreg, "supply either a temp or a register offset");
1112   }
1113 
1114   Label L_fallthrough;
1115   int label_nulls = 0;
1116   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1117   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1118   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1119   assert(label_nulls <= 1, "at most one NULL in the batch");
1120 
1121   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1122   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1123   Address super_check_offset_addr(super_klass, sco_offset);
1124 
1125   // Hacked jmp, which may only be used just before L_fallthrough.
1126 #define final_jmp(label)                                                \
1127   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
1128   else                            b(label)                /*omit semi*/
1129 
1130   // If the pointers are equal, we are done (e.g., String[] elements).
1131   // This self-check enables sharing of secondary supertype arrays among
1132   // non-primary types such as array-of-interface.  Otherwise, each such
1133   // type would need its own customized SSA.
1134   // We move this check to the front of the fast path because many
1135   // type checks are in fact trivially successful in this manner,
1136   // so we get a nicely predicted branch right at the start of the check.
1137   cmp(sub_klass, super_klass);
1138   br(Assembler::EQ, *L_success);
1139 
1140   // Check the supertype display:
1141   if (must_load_sco) {
1142     ldrw(temp_reg, super_check_offset_addr);
1143     super_check_offset = RegisterOrConstant(temp_reg);
1144   }
1145   Address super_check_addr(sub_klass, super_check_offset);
1146   ldr(rscratch1, super_check_addr);
1147   cmp(super_klass, rscratch1); // load displayed supertype
1148 
1149   // This check has worked decisively for primary supers.
1150   // Secondary supers are sought in the super_cache ('super_cache_addr').
1151   // (Secondary supers are interfaces and very deeply nested subtypes.)
1152   // This works in the same check above because of a tricky aliasing
1153   // between the super_cache and the primary super display elements.
1154   // (The 'super_check_addr' can address either, as the case requires.)
1155   // Note that the cache is updated below if it does not help us find
1156   // what we need immediately.
1157   // So if it was a primary super, we can just fail immediately.
1158   // Otherwise, it's the slow path for us (no success at this point).
1159 
1160   if (super_check_offset.is_register()) {
1161     br(Assembler::EQ, *L_success);
1162     subs(zr, super_check_offset.as_register(), sc_offset);
1163     if (L_failure == &L_fallthrough) {
1164       br(Assembler::EQ, *L_slow_path);
1165     } else {
1166       br(Assembler::NE, *L_failure);
1167       final_jmp(*L_slow_path);
1168     }
1169   } else if (super_check_offset.as_constant() == sc_offset) {
1170     // Need a slow path; fast failure is impossible.
1171     if (L_slow_path == &L_fallthrough) {
1172       br(Assembler::EQ, *L_success);
1173     } else {
1174       br(Assembler::NE, *L_slow_path);
1175       final_jmp(*L_success);
1176     }
1177   } else {
1178     // No slow path; it's a fast decision.
1179     if (L_failure == &L_fallthrough) {
1180       br(Assembler::EQ, *L_success);
1181     } else {
1182       br(Assembler::NE, *L_failure);
1183       final_jmp(*L_success);
1184     }
1185   }
1186 
1187   bind(L_fallthrough);
1188 
1189 #undef final_jmp
1190 }
1191 
1192 // These two are taken from x86, but they look generally useful
1193 
1194 // scans count pointer sized words at [addr] for occurence of value,
1195 // generic
1196 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
1197                                 Register scratch) {
1198   Label Lloop, Lexit;
1199   cbz(count, Lexit);
1200   bind(Lloop);
1201   ldr(scratch, post(addr, wordSize));
1202   cmp(value, scratch);
1203   br(EQ, Lexit);
1204   sub(count, count, 1);
1205   cbnz(count, Lloop);
1206   bind(Lexit);
1207 }
1208 
1209 // scans count 4 byte words at [addr] for occurence of value,
1210 // generic
1211 void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
1212                                 Register scratch) {
1213   Label Lloop, Lexit;
1214   cbz(count, Lexit);
1215   bind(Lloop);
1216   ldrw(scratch, post(addr, wordSize));
1217   cmpw(value, scratch);
1218   br(EQ, Lexit);
1219   sub(count, count, 1);
1220   cbnz(count, Lloop);
1221   bind(Lexit);
1222 }
1223 
1224 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1225                                                    Register super_klass,
1226                                                    Register temp_reg,
1227                                                    Register temp2_reg,
1228                                                    Label* L_success,
1229                                                    Label* L_failure,
1230                                                    bool set_cond_codes) {
1231   assert_different_registers(sub_klass, super_klass, temp_reg);
1232   if (temp2_reg != noreg)
1233     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1234 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1235 
1236   Label L_fallthrough;
1237   int label_nulls = 0;
1238   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1239   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1240   assert(label_nulls <= 1, "at most one NULL in the batch");
1241 
1242   // a couple of useful fields in sub_klass:
1243   int ss_offset = in_bytes(Klass::secondary_supers_offset());
1244   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1245   Address secondary_supers_addr(sub_klass, ss_offset);
1246   Address super_cache_addr(     sub_klass, sc_offset);
1247 
1248   BLOCK_COMMENT("check_klass_subtype_slow_path");
1249 
1250   // Do a linear scan of the secondary super-klass chain.
1251   // This code is rarely used, so simplicity is a virtue here.
1252   // The repne_scan instruction uses fixed registers, which we must spill.
1253   // Don't worry too much about pre-existing connections with the input regs.
1254 
1255   assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
1256   assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)
1257 
1258   RegSet pushed_registers;
1259   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1260   if (!IS_A_TEMP(r5))    pushed_registers += r5;
1261 
1262   if (super_klass != r0 || UseCompressedOops) {
1263     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1264   }
1265 
1266   push(pushed_registers, sp);
1267 
1268   // Get super_klass value into r0 (even if it was in r5 or r2).
1269   if (super_klass != r0) {
1270     mov(r0, super_klass);
1271   }
1272 
1273 #ifndef PRODUCT
1274   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1275   Address pst_counter_addr(rscratch2);
1276   ldr(rscratch1, pst_counter_addr);
1277   add(rscratch1, rscratch1, 1);
1278   str(rscratch1, pst_counter_addr);
1279 #endif //PRODUCT
1280 
1281   // We will consult the secondary-super array.
1282   ldr(r5, secondary_supers_addr);
1283   // Load the array length.
1284   ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1285   // Skip to start of data.
1286   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1287 
1288   cmp(sp, zr); // Clear Z flag; SP is never zero
1289   // Scan R2 words at [R5] for an occurrence of R0.
1290   // Set NZ/Z based on last compare.
1291   repne_scan(r5, r0, r2, rscratch1);
1292 
1293   // Unspill the temp. registers:
1294   pop(pushed_registers, sp);
1295 
1296   br(Assembler::NE, *L_failure);
1297 
1298   // Success.  Cache the super we found and proceed in triumph.
1299   str(super_klass, super_cache_addr);
1300 
1301   if (L_success != &L_fallthrough) {
1302     b(*L_success);
1303   }
1304 
1305 #undef IS_A_TEMP
1306 
1307   bind(L_fallthrough);
1308 }
1309 
1310 
1311 void MacroAssembler::verify_oop(Register reg, const char* s) {
1312   if (!VerifyOops) return;
1313 
1314   // Pass register number to verify_oop_subroutine
1315   const char* b = NULL;
1316   {
1317     ResourceMark rm;
1318     stringStream ss;
1319     ss.print("verify_oop: %s: %s", reg->name(), s);
1320     b = code_string(ss.as_string());
1321   }
1322   BLOCK_COMMENT("verify_oop {");
1323 
1324   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1325   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1326 
1327   mov(r0, reg);
1328   mov(rscratch1, (address)b);
1329 
1330   // call indirectly to solve generation ordering problem
1331   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1332   ldr(rscratch2, Address(rscratch2));
1333   blr(rscratch2);
1334 
1335   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1336   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1337 
1338   BLOCK_COMMENT("} verify_oop");
1339 }
1340 
1341 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1342   if (!VerifyOops) return;
1343 
1344   const char* b = NULL;
1345   {
1346     ResourceMark rm;
1347     stringStream ss;
1348     ss.print("verify_oop_addr: %s", s);
1349     b = code_string(ss.as_string());
1350   }
1351   BLOCK_COMMENT("verify_oop_addr {");
1352 
1353   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1354   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1355 
1356   // addr may contain sp so we will have to adjust it based on the
1357   // pushes that we just did.
1358   if (addr.uses(sp)) {
1359     lea(r0, addr);
1360     ldr(r0, Address(r0, 4 * wordSize));
1361   } else {
1362     ldr(r0, addr);
1363   }
1364   mov(rscratch1, (address)b);
1365 
1366   // call indirectly to solve generation ordering problem
1367   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1368   ldr(rscratch2, Address(rscratch2));
1369   blr(rscratch2);
1370 
1371   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1372   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1373 
1374   BLOCK_COMMENT("} verify_oop_addr");
1375 }
1376 
1377 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1378                                          int extra_slot_offset) {
1379   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1380   int stackElementSize = Interpreter::stackElementSize;
1381   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1382 #ifdef ASSERT
1383   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1384   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1385 #endif
1386   if (arg_slot.is_constant()) {
1387     return Address(esp, arg_slot.as_constant() * stackElementSize
1388                    + offset);
1389   } else {
1390     add(rscratch1, esp, arg_slot.as_register(),
1391         ext::uxtx, exact_log2(stackElementSize));
1392     return Address(rscratch1, offset);
1393   }
1394 }
1395 
1396 void MacroAssembler::call_VM_leaf_base(address entry_point,
1397                                        int number_of_arguments,
1398                                        Label *retaddr) {
1399   call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
1400 }
1401 
1402 void MacroAssembler::call_VM_leaf_base1(address entry_point,
1403                                         int number_of_gp_arguments,
1404                                         int number_of_fp_arguments,
1405                                         ret_type type,
1406                                         Label *retaddr) {
1407   Label E, L;
1408 
1409   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1410 
1411   // We add 1 to number_of_arguments because the thread in arg0 is
1412   // not counted
1413   mov(rscratch1, entry_point);
1414   blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1415   if (retaddr)
1416     bind(*retaddr);
1417 
1418   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1419   maybe_isb();
1420 }
1421 
1422 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1423   call_VM_leaf_base(entry_point, number_of_arguments);
1424 }
1425 
1426 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1427   pass_arg0(this, arg_0);
1428   call_VM_leaf_base(entry_point, 1);
1429 }
1430 
1431 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1432   pass_arg0(this, arg_0);
1433   pass_arg1(this, arg_1);
1434   call_VM_leaf_base(entry_point, 2);
1435 }
1436 
1437 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1438                                   Register arg_1, Register arg_2) {
1439   pass_arg0(this, arg_0);
1440   pass_arg1(this, arg_1);
1441   pass_arg2(this, arg_2);
1442   call_VM_leaf_base(entry_point, 3);
1443 }
1444 
1445 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1446   pass_arg0(this, arg_0);
1447   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1448 }
1449 
1450 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1451 
1452   assert(arg_0 != c_rarg1, "smashed arg");
1453   pass_arg1(this, arg_1);
1454   pass_arg0(this, arg_0);
1455   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1456 }
1457 
1458 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1459   assert(arg_0 != c_rarg2, "smashed arg");
1460   assert(arg_1 != c_rarg2, "smashed arg");
1461   pass_arg2(this, arg_2);
1462   assert(arg_0 != c_rarg1, "smashed arg");
1463   pass_arg1(this, arg_1);
1464   pass_arg0(this, arg_0);
1465   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1466 }
1467 
1468 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1469   assert(arg_0 != c_rarg3, "smashed arg");
1470   assert(arg_1 != c_rarg3, "smashed arg");
1471   assert(arg_2 != c_rarg3, "smashed arg");
1472   pass_arg3(this, arg_3);
1473   assert(arg_0 != c_rarg2, "smashed arg");
1474   assert(arg_1 != c_rarg2, "smashed arg");
1475   pass_arg2(this, arg_2);
1476   assert(arg_0 != c_rarg1, "smashed arg");
1477   pass_arg1(this, arg_1);
1478   pass_arg0(this, arg_0);
1479   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1480 }
1481 
1482 void MacroAssembler::null_check(Register reg, int offset) {
1483   if (needs_explicit_null_check(offset)) {
1484     // provoke OS NULL exception if reg = NULL by
1485     // accessing M[reg] w/o changing any registers
1486     // NOTE: this is plenty to provoke a segv
1487     ldr(zr, Address(reg));
1488   } else {
1489     // nothing to do, (later) access of M[reg + offset]
1490     // will provoke OS NULL exception if reg = NULL
1491   }
1492 }
1493 
1494 // MacroAssembler protected routines needed to implement
1495 // public methods
1496 
1497 void MacroAssembler::mov(Register r, Address dest) {
1498   code_section()->relocate(pc(), dest.rspec());
1499   u_int64_t imm64 = (u_int64_t)dest.target();
1500   movptr(r, imm64);
1501 }
1502 
1503 // Move a constant pointer into r.  In AArch64 mode the virtual
1504 // address space is 48 bits in size, so we only need three
1505 // instructions to create a patchable instruction sequence that can
1506 // reach anywhere.
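     // For example (illustrative), movptr(r0, 0x123456789abcUL) expands to
     //   movz r0, #0x9abc
     //   movk r0, #0x5678, lsl #16
     //   movk r0, #0x1234, lsl #32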
1507 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1508 #ifndef PRODUCT
1509   {
1510     char buffer[64];
1511     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1512     block_comment(buffer);
1513   }
1514 #endif
1515   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1516   movz(r, imm64 & 0xffff);
1517   imm64 >>= 16;
1518   movk(r, imm64 & 0xffff, 16);
1519   imm64 >>= 16;
1520   movk(r, imm64 & 0xffff, 32);
1521 }
1522 
1523 // Macro to mov replicated immediate to vector register.
1524 //  Vd will get the following values for different arrangements in T
1525 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1526 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1527 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1528 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1529 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1530 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1531 //   T1D/T2D: invalid
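     // For example (illustrative), mov(v0, T4S, 0x00ff0000) emits a single
     //   movi v0.4s, #0xff, lsl #16
     // while mov(v0, T4S, 0x00120034) emits
     //   movi v0.4s, #0x34, lsl #0
     //   orr  v0.4s, #0x12, lsl #16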
1532 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1533   assert(T != T1D && T != T2D, "invalid arrangement");
1534   if (T == T8B || T == T16B) {
1535     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1536     movi(Vd, T, imm32 & 0xff, 0);
1537     return;
1538   }
1539   u_int32_t nimm32 = ~imm32;
1540   if (T == T4H || T == T8H) {
1541     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1542     imm32 &= 0xffff;
1543     nimm32 &= 0xffff;
1544   }
1545   u_int32_t x = imm32;
1546   int movi_cnt = 0;
1547   int movn_cnt = 0;
1548   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1549   x = nimm32;
1550   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1551   if (movn_cnt < movi_cnt) imm32 = nimm32;
1552   unsigned lsl = 0;
1553   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1554   if (movn_cnt < movi_cnt)
1555     mvni(Vd, T, imm32 & 0xff, lsl);
1556   else
1557     movi(Vd, T, imm32 & 0xff, lsl);
1558   imm32 >>= 8; lsl += 8;
1559   while (imm32) {
1560     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1561     if (movn_cnt < movi_cnt)
1562       bici(Vd, T, imm32 & 0xff, lsl);
1563     else
1564       orri(Vd, T, imm32 & 0xff, lsl);
1565     lsl += 8; imm32 >>= 8;
1566   }
1567 }
1568 
1569 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1570 {
1571 #ifndef PRODUCT
1572   {
1573     char buffer[64];
1574     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1575     block_comment(buffer);
1576   }
1577 #endif
1578   if (operand_valid_for_logical_immediate(false, imm64)) {
1579     orr(dst, zr, imm64);
1580   } else {
1581     // we can use a combination of MOVZ or MOVN with
1582     // MOVK to build up the constant
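         // For example (illustrative):
         //   0x0000000012340000 has three zero halfwords, so a single
         //     movz dst, #0x1234, lsl #16   suffices;
         //   0xffffffffffff1234 has three 0xffff halfwords, so a single
         //     movn dst, #0xedcb, lsl #0    suffices.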
1583     u_int64_t imm_h[4];
1584     int zero_count = 0;
1585     int neg_count = 0;
1586     int i;
1587     for (i = 0; i < 4; i++) {
1588       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1589       if (imm_h[i] == 0) {
1590         zero_count++;
1591       } else if (imm_h[i] == 0xffffL) {
1592         neg_count++;
1593       }
1594     }
1595     if (zero_count == 4) {
1596       // one MOVZ will do
1597       movz(dst, 0);
1598     } else if (neg_count == 4) {
1599       // one MOVN will do
1600       movn(dst, 0);
1601     } else if (zero_count == 3) {
1602       for (i = 0; i < 4; i++) {
1603         if (imm_h[i] != 0L) {
1604           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1605           break;
1606         }
1607       }
1608     } else if (neg_count == 3) {
1609       // one MOVN will do
1610       for (int i = 0; i < 4; i++) {
1611         if (imm_h[i] != 0xffffL) {
1612           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1613           break;
1614         }
1615       }
1616     } else if (zero_count == 2) {
1617       // one MOVZ and one MOVK will do
1618       for (i = 0; i < 3; i++) {
1619         if (imm_h[i] != 0L) {
1620           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1621           i++;
1622           break;
1623         }
1624       }
1625       for (;i < 4; i++) {
1626         if (imm_h[i] != 0L) {
1627           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1628         }
1629       }
1630     } else if (neg_count == 2) {
1631       // one MOVN and one MOVK will do
1632       for (i = 0; i < 4; i++) {
1633         if (imm_h[i] != 0xffffL) {
1634           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1635           i++;
1636           break;
1637         }
1638       }
1639       for (;i < 4; i++) {
1640         if (imm_h[i] != 0xffffL) {
1641           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1642         }
1643       }
1644     } else if (zero_count == 1) {
1645       // one MOVZ and two MOVKs will do
1646       for (i = 0; i < 4; i++) {
1647         if (imm_h[i] != 0L) {
1648           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1649           i++;
1650           break;
1651         }
1652       }
1653       for (;i < 4; i++) {
1654         if (imm_h[i] != 0x0L) {
1655           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1656         }
1657       }
1658     } else if (neg_count == 1) {
1659       // one MOVN and two MOVKs will do
1660       for (i = 0; i < 4; i++) {
1661         if (imm_h[i] != 0xffffL) {
1662           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1663           i++;
1664           break;
1665         }
1666       }
1667       for (;i < 4; i++) {
1668         if (imm_h[i] != 0xffffL) {
1669           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1670         }
1671       }
1672     } else {
1673       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1674       movz(dst, (u_int32_t)imm_h[0], 0);
1675       for (i = 1; i < 4; i++) {
1676         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1677       }
1678     }
1679   }
1680 }
1681 
1682 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1683 {
1684 #ifndef PRODUCT
1685     {
1686       char buffer[64];
1687       snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1688       block_comment(buffer);
1689     }
1690 #endif
1691   if (operand_valid_for_logical_immediate(true, imm32)) {
1692     orrw(dst, zr, imm32);
1693   } else {
1694     // we can use a MOVZ or MOVN, possibly followed by a MOVK, to
1695     // build up the constant
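         // For example (illustrative), 0xffff1234 becomes a single
         //   movn wdst, #0xedcb
         // while 0x00561234 becomes
         //   movz wdst, #0x1234
         //   movk wdst, #0x56, lsl #16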
1696     u_int32_t imm_h[2];
1697     imm_h[0] = imm32 & 0xffff;
1698     imm_h[1] = ((imm32 >> 16) & 0xffff);
1699     if (imm_h[0] == 0) {
1700       movzw(dst, imm_h[1], 16);
1701     } else if (imm_h[0] == 0xffff) {
1702       movnw(dst, imm_h[1] ^ 0xffff, 16);
1703     } else if (imm_h[1] == 0) {
1704       movzw(dst, imm_h[0], 0);
1705     } else if (imm_h[1] == 0xffff) {
1706       movnw(dst, imm_h[0] ^ 0xffff, 0);
1707     } else {
1708       // use a MOVZ and MOVK (makes it easier to debug)
1709       movzw(dst, imm_h[0], 0);
1710       movkw(dst, imm_h[1], 16);
1711     }
1712   }
1713 }
1714 
1715 // Form an address from base + offset in Rd.  Rd may or may
1716 // not actually be used: you must use the Address that is returned.
1717 // It is up to you to ensure that the shift provided matches the size
1718 // of your data.
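     // For example (illustrative), with shift == 3 (8-byte data) and
     // byte_offset == 0x21000 the offset does not fit a scaled 12-bit
     // immediate, so the code emits
     //   add Rd, base, #0x20000
     // and returns Address(Rd, 0x1000), which the load/store can encode.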
1719 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1720   if (Address::offset_ok_for_immed(byte_offset, shift))
1721     // It fits; no need for any heroics
1722     return Address(base, byte_offset);
1723 
1724   // Don't do anything clever with negative or misaligned offsets
1725   unsigned mask = (1 << shift) - 1;
1726   if (byte_offset < 0 || byte_offset & mask) {
1727     mov(Rd, byte_offset);
1728     add(Rd, base, Rd);
1729     return Address(Rd);
1730   }
1731 
1732   // See if we can do this with two 12-bit offsets
1733   {
1734     unsigned long word_offset = byte_offset >> shift;
1735     unsigned long masked_offset = word_offset & 0xfff000;
1736     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1737         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1738       add(Rd, base, masked_offset << shift);
1739       word_offset -= masked_offset;
1740       return Address(Rd, word_offset << shift);
1741     }
1742   }
1743 
1744   // Do it the hard way
1745   mov(Rd, byte_offset);
1746   add(Rd, base, Rd);
1747   return Address(Rd);
1748 }
1749 
1750 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1751   if (UseLSE) {
1752     mov(tmp, 1);
1753     ldadd(Assembler::word, tmp, zr, counter_addr);
1754     return;
1755   }
1756   Label retry_load;
1757   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1758     prfm(Address(counter_addr), PSTL1STRM);
1759   bind(retry_load);
1760   // flush and load exclusive from the memory location
1761   ldxrw(tmp, counter_addr);
1762   addw(tmp, tmp, 1);
1763   // if we store+flush with no intervening write tmp will be zero
1764   stxrw(tmp2, tmp, counter_addr);
1765   cbnzw(tmp2, retry_load);
1766 }
1767 
1768 
1769 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1770                                     bool want_remainder, Register scratch)
1771 {
1772   // Full implementation of Java idiv and irem.  The function
1773   // returns the (pc) offset of the div instruction - may be needed
1774   // for implicit exceptions.
1775   //
1776   // constraint : ra/rb =/= scratch
1777   //         normal case
1778   //
1779   // input : ra: dividend
1780   //         rb: divisor
1781   //
1782   // result: either
1783   //         quotient  (= ra idiv rb)
1784   //         remainder (= ra irem rb)
1785 
1786   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1787 
1788   int idivl_offset = offset();
1789   if (! want_remainder) {
1790     sdivw(result, ra, rb);
1791   } else {
1792     sdivw(scratch, ra, rb);
1793     Assembler::msubw(result, scratch, rb, ra);
1794   }
1795 
1796   return idivl_offset;
1797 }
1798 
1799 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1800                                     bool want_remainder, Register scratch)
1801 {
1802   // Full implementation of Java ldiv and lrem.  The function
1803   // returns the (pc) offset of the div instruction - may be needed
1804   // for implicit exceptions.
1805   //
1806   // constraint : ra/rb =/= scratch
1807   //         normal case
1808   //
1809   // input : ra: dividend
1810   //         rb: divisor
1811   //
1812   // result: either
1813   //         quotient  (= ra idiv rb)
1814   //         remainder (= ra irem rb)
1815 
1816   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1817 
1818   int idivq_offset = offset();
1819   if (! want_remainder) {
1820     sdiv(result, ra, rb);
1821   } else {
1822     sdiv(scratch, ra, rb);
1823     Assembler::msub(result, scratch, rb, ra);
1824   }
1825 
1826   return idivq_offset;
1827 }
1828 
1829 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1830   address prev = pc() - NativeMembar::instruction_size;
1831   address last = code()->last_insn();
1832   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1833     NativeMembar *bar = NativeMembar_at(prev);
1834     // We are merging two memory barrier instructions.  On AArch64 we
1835     // can do this simply by ORing them together.
1836     bar->set_kind(bar->get_kind() | order_constraint);
1837     BLOCK_COMMENT("merged membar");
1838   } else {
1839     code()->set_last_insn(pc());
1840     dmb(Assembler::barrier(order_constraint));
1841   }
1842 }
1843 
1844 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1845   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1846     merge_ldst(rt, adr, size_in_bytes, is_store);
1847     code()->clear_last_insn();
1848     return true;
1849   } else {
1850     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
1851     const unsigned mask = size_in_bytes - 1;
1852     if (adr.getMode() == Address::base_plus_offset &&
1853         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1854       code()->set_last_insn(pc());
1855     }
1856     return false;
1857   }
1858 }
1859 
1860 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1861   // We always try to merge two adjacent loads into one ldp.
1862   if (!try_merge_ldst(Rx, adr, 8, false)) {
1863     Assembler::ldr(Rx, adr);
1864   }
1865 }
1866 
1867 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1868   // We always try to merge two adjacent loads into one ldp.
1869   if (!try_merge_ldst(Rw, adr, 4, false)) {
1870     Assembler::ldrw(Rw, adr);
1871   }
1872 }
1873 
1874 void MacroAssembler::str(Register Rx, const Address &adr) {
1875   // We always try to merge two adjacent stores into one stp.
1876   if (!try_merge_ldst(Rx, adr, 8, true)) {
1877     Assembler::str(Rx, adr);
1878   }
1879 }
1880 
1881 void MacroAssembler::strw(Register Rw, const Address &adr) {
1882   // We always try to merge two adjacent stores into one stp.
1883   if (!try_merge_ldst(Rw, adr, 4, true)) {
1884     Assembler::strw(Rw, adr);
1885   }
1886 }
1887 
1888 // MacroAssembler routines found actually to be needed
1889 
1890 void MacroAssembler::push(Register src)
1891 {
1892   str(src, Address(pre(esp, -1 * wordSize)));
1893 }
1894 
1895 void MacroAssembler::pop(Register dst)
1896 {
1897   ldr(dst, Address(post(esp, 1 * wordSize)));
1898 }
1899 
1900 // Note: load_unsigned_short used to be called load_unsigned_word.
1901 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1902   int off = offset();
1903   ldrh(dst, src);
1904   return off;
1905 }
1906 
1907 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1908   int off = offset();
1909   ldrb(dst, src);
1910   return off;
1911 }
1912 
1913 int MacroAssembler::load_signed_short(Register dst, Address src) {
1914   int off = offset();
1915   ldrsh(dst, src);
1916   return off;
1917 }
1918 
1919 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1920   int off = offset();
1921   ldrsb(dst, src);
1922   return off;
1923 }
1924 
1925 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1926   int off = offset();
1927   ldrshw(dst, src);
1928   return off;
1929 }
1930 
1931 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1932   int off = offset();
1933   ldrsbw(dst, src);
1934   return off;
1935 }
1936 
1937 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1938   switch (size_in_bytes) {
1939   case  8:  ldr(dst, src); break;
1940   case  4:  ldrw(dst, src); break;
1941   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1942   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1943   default:  ShouldNotReachHere();
1944   }
1945 }
1946 
1947 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1948   switch (size_in_bytes) {
1949   case  8:  str(src, dst); break;
1950   case  4:  strw(src, dst); break;
1951   case  2:  strh(src, dst); break;
1952   case  1:  strb(src, dst); break;
1953   default:  ShouldNotReachHere();
1954   }
1955 }
1956 
1957 void MacroAssembler::decrementw(Register reg, int value)
1958 {
1959   if (value < 0)  { incrementw(reg, -value);      return; }
1960   if (value == 0) {                               return; }
1961   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1962   /* else */ {
1963     guarantee(reg != rscratch2, "invalid dst for register decrement");
1964     movw(rscratch2, (unsigned)value);
1965     subw(reg, reg, rscratch2);
1966   }
1967 }
1968 
1969 void MacroAssembler::decrement(Register reg, int value)
1970 {
1971   if (value < 0)  { increment(reg, -value);      return; }
1972   if (value == 0) {                              return; }
1973   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1974   /* else */ {
1975     assert(reg != rscratch2, "invalid dst for register decrement");
1976     mov(rscratch2, (unsigned long)value);
1977     sub(reg, reg, rscratch2);
1978   }
1979 }
1980 
1981 void MacroAssembler::decrementw(Address dst, int value)
1982 {
1983   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1984   if (dst.getMode() == Address::literal) {
1985     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1986     lea(rscratch2, dst);
1987     dst = Address(rscratch2);
1988   }
1989   ldrw(rscratch1, dst);
1990   decrementw(rscratch1, value);
1991   strw(rscratch1, dst);
1992 }
1993 
1994 void MacroAssembler::decrement(Address dst, int value)
1995 {
1996   assert(!dst.uses(rscratch1), "invalid address for decrement");
1997   if (dst.getMode() == Address::literal) {
1998     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1999     lea(rscratch2, dst);
2000     dst = Address(rscratch2);
2001   }
2002   ldr(rscratch1, dst);
2003   decrement(rscratch1, value);
2004   str(rscratch1, dst);
2005 }
2006 
2007 void MacroAssembler::incrementw(Register reg, int value)
2008 {
2009   if (value < 0)  { decrementw(reg, -value);      return; }
2010   if (value == 0) {                               return; }
2011   if (value < (1 << 12)) { addw(reg, reg, value); return; }
2012   /* else */ {
2013     assert(reg != rscratch2, "invalid dst for register increment");
2014     movw(rscratch2, (unsigned)value);
2015     addw(reg, reg, rscratch2);
2016   }
2017 }
2018 
2019 void MacroAssembler::increment(Register reg, int value)
2020 {
2021   if (value < 0)  { decrement(reg, -value);      return; }
2022   if (value == 0) {                              return; }
2023   if (value < (1 << 12)) { add(reg, reg, value); return; }
2024   /* else */ {
2025     assert(reg != rscratch2, "invalid dst for register increment");
2026     movw(rscratch2, (unsigned)value);
2027     add(reg, reg, rscratch2);
2028   }
2029 }
2030 
2031 void MacroAssembler::incrementw(Address dst, int value)
2032 {
2033   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2034   if (dst.getMode() == Address::literal) {
2035     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2036     lea(rscratch2, dst);
2037     dst = Address(rscratch2);
2038   }
2039   ldrw(rscratch1, dst);
2040   incrementw(rscratch1, value);
2041   strw(rscratch1, dst);
2042 }
2043 
2044 void MacroAssembler::increment(Address dst, int value)
2045 {
2046   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2047   if (dst.getMode() == Address::literal) {
2048     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2049     lea(rscratch2, dst);
2050     dst = Address(rscratch2);
2051   }
2052   ldr(rscratch1, dst);
2053   increment(rscratch1, value);
2054   str(rscratch1, dst);
2055 }
2056 
2057 
2058 void MacroAssembler::pusha() {
2059   push(0x7fffffff, sp);
2060 }
2061 
2062 void MacroAssembler::popa() {
2063   pop(0x7fffffff, sp);
2064 }
2065 
2066 // Push lots of registers in the bit set supplied.  Don't push sp.
2067 // Return the number of words pushed
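     // For example (illustrative), push(0b1110, sp) pushes r1, r2 and r3;
     // zr is appended so the registers can be stored as stp pairs and the
     // stack stays 16-byte aligned (4 words in total).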
2068 int MacroAssembler::push(unsigned int bitset, Register stack) {
2069   int words_pushed = 0;
2070 
2071   // Scan bitset to accumulate register pairs
2072   unsigned char regs[32];
2073   int count = 0;
2074   for (int reg = 0; reg <= 30; reg++) {
2075     if (1 & bitset)
2076       regs[count++] = reg;
2077     bitset >>= 1;
2078   }
2079   regs[count++] = zr->encoding_nocheck();
2080   count &= ~1;  // Only push an even number of regs
2081 
2082   if (count) {
2083     stp(as_Register(regs[0]), as_Register(regs[1]),
2084        Address(pre(stack, -count * wordSize)));
2085     words_pushed += 2;
2086   }
2087   for (int i = 2; i < count; i += 2) {
2088     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2089        Address(stack, i * wordSize));
2090     words_pushed += 2;
2091   }
2092 
2093   assert(words_pushed == count, "oops, pushed != count");
2094 
2095   return count;
2096 }
2097 
2098 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2099   int words_pushed = 0;
2100 
2101   // Scan bitset to accumulate register pairs
2102   unsigned char regs[32];
2103   int count = 0;
2104   for (int reg = 0; reg <= 30; reg++) {
2105     if (1 & bitset)
2106       regs[count++] = reg;
2107     bitset >>= 1;
2108   }
2109   regs[count++] = zr->encoding_nocheck();
2110   count &= ~1;
2111 
2112   for (int i = 2; i < count; i += 2) {
2113     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2114        Address(stack, i * wordSize));
2115     words_pushed += 2;
2116   }
2117   if (count) {
2118     ldp(as_Register(regs[0]), as_Register(regs[1]),
2119        Address(post(stack, count * wordSize)));
2120     words_pushed += 2;
2121   }
2122 
2123   assert(words_pushed == count, "oops, pushed != count");
2124 
2125   return count;
2126 }
2127 #ifdef ASSERT
2128 void MacroAssembler::verify_heapbase(const char* msg) {
2129 #if 0
2130   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2131   assert (Universe::heap() != NULL, "java heap should be initialized");
2132   if (CheckCompressedOops) {
2133     Label ok;
2134     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2135     cmpptr(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
2136     br(Assembler::EQ, ok);
2137     stop(msg);
2138     bind(ok);
2139     pop(1 << rscratch1->encoding(), sp);
2140   }
2141 #endif
2142 }
2143 #endif
2144 
2145 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2146   Label done, not_weak;
2147   cbz(value, done);           // Use NULL as-is.
2148 
2149   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
2150   tbz(value, 0, not_weak);    // Test for jweak tag.
2151 
2152   // Resolve jweak.
2153   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2154                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2155   verify_oop(value);
2156   b(done);
2157 
2158   bind(not_weak);
2159   // Resolve (untagged) jobject.
2160   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2161   verify_oop(value);
2162   bind(done);
2163 }
2164 
2165 void MacroAssembler::stop(const char* msg) {
2166   address ip = pc();
2167   pusha();
2168   mov(c_rarg0, (address)msg);
2169   mov(c_rarg1, (address)ip);
2170   mov(c_rarg2, sp);
2171   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2172   // call(c_rarg3);
2173   blrt(c_rarg3, 3, 0, 1);
2174   hlt(0);
2175 }
2176 
2177 void MacroAssembler::warn(const char* msg) {
2178   pusha();
2179   mov(c_rarg0, (address)msg);
2180   mov(lr, CAST_FROM_FN_PTR(address, warning));
2181   blrt(lr, 1, 0, MacroAssembler::ret_type_void);
2182   popa();
2183 }
2184 
2185 void MacroAssembler::unimplemented(const char* what) {
2186   const char* buf = NULL;
2187   {
2188     ResourceMark rm;
2189     stringStream ss;
2190     ss.print("unimplemented: %s", what);
2191     buf = code_string(ss.as_string());
2192   }
2193   stop(buf);
2194 }
2195 
2196 // If a constant does not fit in an immediate field, generate some
2197 // number of MOV instructions and then perform the operation.
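     // For example (illustrative), an addition of 0x123456 does not fit a
     // single add immediate but is emitted as the two-instruction form
     //   add Rd, Rn, #0x123000
     //   add Rd, Rd, #0x456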
2198 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2199                                            add_sub_imm_insn insn1,
2200                                            add_sub_reg_insn insn2) {
2201   assert(Rd != zr, "Rd = zr and not setting flags?");
2202   if (operand_valid_for_add_sub_immediate((int)imm)) {
2203     (this->*insn1)(Rd, Rn, imm);
2204   } else {
2205     if (uabs(imm) < (1 << 24)) {
2206        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2207        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2208     } else {
2209        assert_different_registers(Rd, Rn);
2210        mov(Rd, (uint64_t)imm);
2211        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2212     }
2213   }
2214 }
2215 
2216 // Separate version which sets the flags. Optimisations are more restricted
2217 // because we must set the flags correctly.
2218 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2219                                            add_sub_imm_insn insn1,
2220                                            add_sub_reg_insn insn2) {
2221   if (operand_valid_for_add_sub_immediate((int)imm)) {
2222     (this->*insn1)(Rd, Rn, imm);
2223   } else {
2224     assert_different_registers(Rd, Rn);
2225     assert(Rd != zr, "overflow in immediate operand");
2226     mov(Rd, (uint64_t)imm);
2227     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2228   }
2229 }
2230 
2231 
2232 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2233   if (increment.is_register()) {
2234     add(Rd, Rn, increment.as_register());
2235   } else {
2236     add(Rd, Rn, increment.as_constant());
2237   }
2238 }
2239 
2240 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2241   if (increment.is_register()) {
2242     addw(Rd, Rn, increment.as_register());
2243   } else {
2244     addw(Rd, Rn, increment.as_constant());
2245   }
2246 }
2247 
2248 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2249   if (decrement.is_register()) {
2250     sub(Rd, Rn, decrement.as_register());
2251   } else {
2252     sub(Rd, Rn, decrement.as_constant());
2253   }
2254 }
2255 
2256 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2257   if (decrement.is_register()) {
2258     subw(Rd, Rn, decrement.as_register());
2259   } else {
2260     subw(Rd, Rn, decrement.as_constant());
2261   }
2262 }
2263 
2264 void MacroAssembler::reinit_heapbase()
2265 {
2266   if (UseCompressedOops) {
2267     if (Universe::is_fully_initialized()) {
2268       mov(rheapbase, CompressedOops::ptrs_base());
2269     } else {
2270       lea(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
2271       ldr(rheapbase, Address(rheapbase));
2272     }
2273   }
2274 }
2275 
2276 // this simulates the behaviour of the x86 cmpxchg instruction using a
2277 // load linked/store conditional pair. we use the acquire/release
2278 // versions of these instructions so that we flush pending writes as
2279 // per Java semantics.
2280 
2281 // n.b. the x86 version assumes the old value to be compared against is
2282 // in rax and updates rax with the value located in memory if the
2283 // cmpxchg fails. we supply a register for the old value explicitly
2284 
2285 // the aarch64 load linked/store conditional instructions do not
2286 // accept an offset. so, unlike x86, we must provide a plain register
2287 // to identify the memory word to be compared/exchanged rather than a
2288 // register+offset Address.
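     // Typical behaviour (illustrative): on success control transfers to
     // succeed; on failure the value observed in memory is left in oldv and
     // control transfers to *fail if a fail label was supplied, otherwise
     // execution falls through.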
2289 
2290 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2291                                 Label &succeed, Label *fail) {
2292   // oldv holds comparison value
2293   // newv holds value to write in exchange
2294   // addr identifies memory word to compare against/update
2295   if (UseLSE) {
2296     mov(tmp, oldv);
2297     casal(Assembler::xword, oldv, newv, addr);
2298     cmp(tmp, oldv);
2299     br(Assembler::EQ, succeed);
2300     membar(AnyAny);
2301   } else {
2302     Label retry_load, nope;
2303     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2304       prfm(Address(addr), PSTL1STRM);
2305     bind(retry_load);
2306     // flush and load exclusive from the memory location
2307     // and fail if it is not what we expect
2308     ldaxr(tmp, addr);
2309     cmp(tmp, oldv);
2310     br(Assembler::NE, nope);
2311     // if we store+flush with no intervening write tmp will be zero
2312     stlxr(tmp, newv, addr);
2313     cbzw(tmp, succeed);
2314     // retry so we only ever return after a load fails to compare
2315     // ensures we don't return a stale value after a failed write.
2316     b(retry_load);
2317     // if the memory word differs we return it in oldv and signal a fail
2318     bind(nope);
2319     membar(AnyAny);
2320     mov(oldv, tmp);
2321   }
2322   if (fail)
2323     b(*fail);
2324 }
2325 
2326 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2327                                         Label &succeed, Label *fail) {
2328   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2329   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2330 }
2331 
2332 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2333                                 Label &succeed, Label *fail) {
2334   // oldv holds comparison value
2335   // newv holds value to write in exchange
2336   // addr identifies memory word to compare against/update
2337   // tmp returns 0/1 for success/failure
2338   if (UseLSE) {
2339     mov(tmp, oldv);
2340     casal(Assembler::word, oldv, newv, addr);
2341     cmp(tmp, oldv);
2342     br(Assembler::EQ, succeed);
2343     membar(AnyAny);
2344   } else {
2345     Label retry_load, nope;
2346     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2347       prfm(Address(addr), PSTL1STRM);
2348     bind(retry_load);
2349     // flush and load exclusive from the memory location
2350     // and fail if it is not what we expect
2351     ldaxrw(tmp, addr);
2352     cmp(tmp, oldv);
2353     br(Assembler::NE, nope);
2354     // if we store+flush with no intervening write tmp will be zero
2355     stlxrw(tmp, newv, addr);
2356     cbzw(tmp, succeed);
2357     // retry so we only ever return after a load fails to compare
2358     // ensures we don't return a stale value after a failed write.
2359     b(retry_load);
2360     // if the memory word differs we return it in oldv and signal a fail
2361     bind(nope);
2362     membar(AnyAny);
2363     mov(oldv, tmp);
2364   }
2365   if (fail)
2366     b(*fail);
2367 }
2368 
2369 // A generic CAS; success or failure is in the EQ flag.  A weak CAS
2370 // doesn't retry and may fail spuriously.  If the oldval is wanted,
2371 // pass a register for the result, otherwise pass noreg.
2372 
2373 // Clobbers rscratch1
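     // On return the EQ flag is set iff the exchange succeeded; result (or
     // rscratch1 when result == noreg) holds the value observed at addr,
     // which equals expected on success.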
2374 void MacroAssembler::cmpxchg(Register addr, Register expected,
2375                              Register new_val,
2376                              enum operand_size size,
2377                              bool acquire, bool release,
2378                              bool weak,
2379                              Register result) {
2380   if (result == noreg)  result = rscratch1;
2381   BLOCK_COMMENT("cmpxchg {");
2382   if (UseLSE) {
2383     mov(result, expected);
2384     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2385     compare_eq(result, expected, size);
2386   } else {
2387     Label retry_load, done;
2388     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2389       prfm(Address(addr), PSTL1STRM);
2390     bind(retry_load);
2391     load_exclusive(result, addr, size, acquire);
2392     compare_eq(result, expected, size);
2393     br(Assembler::NE, done);
2394     store_exclusive(rscratch1, new_val, addr, size, release);
2395     if (weak) {
2396       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2397     } else {
2398       cbnzw(rscratch1, retry_load);
2399     }
2400     bind(done);
2401   }
2402   BLOCK_COMMENT("} cmpxchg");
2403 }
2404 
2405 // A generic comparison. Only compares for equality, clobbers rscratch1.
2406 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2407   if (size == xword) {
2408     cmp(rm, rn);
2409   } else if (size == word) {
2410     cmpw(rm, rn);
2411   } else if (size == halfword) {
2412     eorw(rscratch1, rm, rn);
2413     ands(zr, rscratch1, 0xffff);
2414   } else if (size == byte) {
2415     eorw(rscratch1, rm, rn);
2416     ands(zr, rscratch1, 0xff);
2417   } else {
2418     ShouldNotReachHere();
2419   }
2420 }
2421 
2422 
2423 static bool different(Register a, RegisterOrConstant b, Register c) {
2424   if (b.is_constant())
2425     return a != c;
2426   else
2427     return a != b.as_register() && a != c && b.as_register() != c;
2428 }
2429 
2430 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2431 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2432   if (UseLSE) {                                                         \
2433     prev = prev->is_valid() ? prev : zr;                                \
2434     if (incr.is_register()) {                                           \
2435       AOP(sz, incr.as_register(), prev, addr);                          \
2436     } else {                                                            \
2437       mov(rscratch2, incr.as_constant());                               \
2438       AOP(sz, rscratch2, prev, addr);                                   \
2439     }                                                                   \
2440     return;                                                             \
2441   }                                                                     \
2442   Register result = rscratch2;                                          \
2443   if (prev->is_valid())                                                 \
2444     result = different(prev, incr, addr) ? prev : rscratch2;            \
2445                                                                         \
2446   Label retry_load;                                                     \
2447   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2448     prfm(Address(addr), PSTL1STRM);                                     \
2449   bind(retry_load);                                                     \
2450   LDXR(result, addr);                                                   \
2451   OP(rscratch1, result, incr);                                          \
2452   STXR(rscratch2, rscratch1, addr);                                     \
2453   cbnzw(rscratch2, retry_load);                                         \
2454   if (prev->is_valid() && prev != result) {                             \
2455     IOP(prev, rscratch1, incr);                                         \
2456   }                                                                     \
2457 }
2458 
2459 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2460 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2461 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2462 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
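     // Illustrative use: atomic_add(r0, 1, r2) atomically adds 1 to the
     // 64-bit word at [r2] and leaves the previous value in r0; pass noreg
     // as prev when the old value is not needed.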
2463 
2464 #undef ATOMIC_OP
2465 
2466 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2467 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2468   if (UseLSE) {                                                         \
2469     prev = prev->is_valid() ? prev : zr;                                \
2470     AOP(sz, newv, prev, addr);                                          \
2471     return;                                                             \
2472   }                                                                     \
2473   Register result = rscratch2;                                          \
2474   if (prev->is_valid())                                                 \
2475     result = different(prev, newv, addr) ? prev : rscratch2;            \
2476                                                                         \
2477   Label retry_load;                                                     \
2478   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2479     prfm(Address(addr), PSTL1STRM);                                     \
2480   bind(retry_load);                                                     \
2481   LDXR(result, addr);                                                   \
2482   STXR(rscratch1, newv, addr);                                          \
2483   cbnzw(rscratch1, retry_load);                                         \
2484   if (prev->is_valid() && prev != result)                               \
2485     mov(prev, result);                                                  \
2486 }
2487 
2488 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2489 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2490 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2491 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2492 
2493 #undef ATOMIC_XCHG
2494 
2495 #ifndef PRODUCT
2496 extern "C" void findpc(intptr_t x);
2497 #endif
2498 
2499 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2500 {
2501   // In order to get locks to work, we need to fake an in_VM state
2502   if (ShowMessageBoxOnError ) {
2503     JavaThread* thread = JavaThread::current();
2504     JavaThreadState saved_state = thread->thread_state();
2505     thread->set_thread_state(_thread_in_vm);
2506 #ifndef PRODUCT
2507     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2508       ttyLocker ttyl;
2509       BytecodeCounter::print();
2510     }
2511 #endif
2512     if (os::message_box(msg, "Execution stopped, print registers?")) {
2513       ttyLocker ttyl;
2514       tty->print_cr(" pc = 0x%016lx", pc);
2515 #ifndef PRODUCT
2516       tty->cr();
2517       findpc(pc);
2518       tty->cr();
2519 #endif
2520       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2521       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2522       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2523       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2524       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2525       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2526       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2527       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2528       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2529       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2530       tty->print_cr("r10 = 0x%016lx", regs[10]);
2531       tty->print_cr("r11 = 0x%016lx", regs[11]);
2532       tty->print_cr("r12 = 0x%016lx", regs[12]);
2533       tty->print_cr("r13 = 0x%016lx", regs[13]);
2534       tty->print_cr("r14 = 0x%016lx", regs[14]);
2535       tty->print_cr("r15 = 0x%016lx", regs[15]);
2536       tty->print_cr("r16 = 0x%016lx", regs[16]);
2537       tty->print_cr("r17 = 0x%016lx", regs[17]);
2538       tty->print_cr("r18 = 0x%016lx", regs[18]);
2539       tty->print_cr("r19 = 0x%016lx", regs[19]);
2540       tty->print_cr("r20 = 0x%016lx", regs[20]);
2541       tty->print_cr("r21 = 0x%016lx", regs[21]);
2542       tty->print_cr("r22 = 0x%016lx", regs[22]);
2543       tty->print_cr("r23 = 0x%016lx", regs[23]);
2544       tty->print_cr("r24 = 0x%016lx", regs[24]);
2545       tty->print_cr("r25 = 0x%016lx", regs[25]);
2546       tty->print_cr("r26 = 0x%016lx", regs[26]);
2547       tty->print_cr("r27 = 0x%016lx", regs[27]);
2548       tty->print_cr("r28 = 0x%016lx", regs[28]);
2549       tty->print_cr("r30 = 0x%016lx", regs[30]);
2550       tty->print_cr("r31 = 0x%016lx", regs[31]);
2551       BREAKPOINT;
2552     }
2553     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2554   } else {
2555     ttyLocker ttyl;
2556     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2557                     msg);
2558     assert(false, "DEBUG MESSAGE: %s", msg);
2559   }
2560 }
2561 
2562 #ifdef BUILTIN_SIM
2563 // routine to generate an x86 prolog for a stub function which
2564 // bootstraps into the generated ARM code that directly follows the
2565 // stub
2566 //
2567 // the argument encodes the number of general and fp registers
2568 // passed by the caller and the calling convention (currently just
2569 // the number of general registers and assumes C argument passing)
2570 
2571 extern "C" {
2572 int aarch64_stub_prolog_size();
2573 void aarch64_stub_prolog();
2574 void aarch64_prolog();
2575 }
2576 
2577 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
2578                                    address *prolog_ptr)
2579 {
2580   int calltype = (((ret_type & 0x3) << 8) |
2581                   ((fp_arg_count & 0xf) << 4) |
2582                   (gp_arg_count & 0xf));
2583 
2584   // the addresses for the x86 to ARM entry code we need to use
2585   address start = pc();
2586   // printf("start = %lx\n", start);
2587   int byteCount =  aarch64_stub_prolog_size();
2588   // printf("byteCount = %x\n", byteCount);
2589   int instructionCount = (byteCount + 3)/ 4;
2590   // printf("instructionCount = %x\n", instructionCount);
2591   for (int i = 0; i < instructionCount; i++) {
2592     nop();
2593   }
2594 
2595   memcpy(start, (void*)aarch64_stub_prolog, byteCount);
2596 
2597   // write the address of the setup routine and the call format at the
2598 // end of the copied code
2599   u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
2600   if (prolog_ptr)
2601     patch_end[-2] = (u_int64_t)prolog_ptr;
2602   patch_end[-1] = calltype;
2603 }
2604 #endif
2605 
2606 void MacroAssembler::push_call_clobbered_registers() {
2607   int step = 4 * wordSize;
2608   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2609   sub(sp, sp, step);
2610   mov(rscratch1, -step);
2611   // Push v0-v7, v16-v31.
2612   for (int i = 31; i>= 4; i -= 4) {
2613     if (i <= v7->encoding() || i >= v16->encoding())
2614       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2615           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2616   }
2617   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2618       as_FloatRegister(3), T1D, Address(sp));
2619 }
2620 
2621 void MacroAssembler::pop_call_clobbered_registers() {
2622   for (int i = 0; i < 32; i += 4) {
2623     if (i <= v7->encoding() || i >= v16->encoding())
2624       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2625           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2626   }
2627 
2628   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2629 }
2630 
2631 void MacroAssembler::push_CPU_state(bool save_vectors) {
2632   int step = (save_vectors ? 8 : 4) * wordSize;
2633   push(0x3fffffff, sp);         // integer registers except lr & sp
2634   mov(rscratch1, -step);
2635   sub(sp, sp, step);
2636   for (int i = 28; i >= 4; i -= 4) {
2637     st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2638         as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2639   }
2640   st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2641 }
2642 
2643 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2644   int step = (restore_vectors ? 8 : 4) * wordSize;
2645   for (int i = 0; i <= 28; i += 4)
2646     ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2647         as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2648   pop(0x3fffffff, sp);         // integer registers except lr & sp
2649 }
2650 
2651 /**
2652  * Helpers for multiply_to_len().
2653  */
2654 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2655                                      Register src1, Register src2) {
2656   adds(dest_lo, dest_lo, src1);
2657   adc(dest_hi, dest_hi, zr);
2658   adds(dest_lo, dest_lo, src2);
2659   adc(final_dest_hi, dest_hi, zr);
2660 }
2661 
2662 // Generate an address from (r + r1 extend offset).  "size" is the
2663 // size of the operand.  The result may be in rscratch2.
2664 Address MacroAssembler::offsetted_address(Register r, Register r1,
2665                                           Address::extend ext, int offset, int size) {
2666   if (offset || (ext.shift() % size != 0)) {
2667     lea(rscratch2, Address(r, r1, ext));
2668     return Address(rscratch2, offset);
2669   } else {
2670     return Address(r, r1, ext);
2671   }
2672 }
2673 
2674 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2675 {
2676   assert(offset >= 0, "spill to negative address?");
2677   // Offset reachable ?
2678   //   Not aligned - 9 bits signed offset
2679   //   Aligned - 12 bits unsigned offset shifted
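       // For example (illustrative), spill_address(8, 0x20010, tmp) emits
       //   add tmp, sp, #0x20000
       // and returns Address(tmp, 0x10), which an 8-byte load/store can
       // encode directly.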
2680   Register base = sp;
2681   if ((offset & (size-1)) && offset >= (1<<8)) {
2682     add(tmp, base, offset & ((1<<12)-1));
2683     base = tmp;
2684     offset &= -1u<<12;
2685   }
2686 
2687   if (offset >= (1<<12) * size) {
2688     add(tmp, base, offset & (((1<<12)-1)<<12));
2689     base = tmp;
2690     offset &= ~(((1<<12)-1)<<12);
2691   }
2692 
2693   return Address(base, offset);
2694 }
2695 
2696 // Checks whether offset is aligned.
2697 // Returns true if it is, else false.
2698 bool MacroAssembler::merge_alignment_check(Register base,
2699                                            size_t size,
2700                                            long cur_offset,
2701                                            long prev_offset) const {
2702   if (AvoidUnalignedAccesses) {
2703     if (base == sp) {
2704       // Checks whether the lower offset is aligned for a register pair.
2705       long pair_mask = size * 2 - 1;
2706       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2707       return (offset & pair_mask) == 0;
2708     } else { // If base is not sp, we can't guarantee the access is aligned.
2709       return false;
2710     }
2711   } else {
2712     long mask = size - 1;
2713     // Load/store pair instructions only support element-size-aligned offsets.
2714     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2715   }
2716 }
2717 
2718 // Checks whether current and previous loads/stores can be merged.
2719 // Returns true if it can be merged, else false.
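     // For example (illustrative), the adjacent accesses
     //   ldr x1, [sp, #16]
     //   ldr x2, [sp, #24]
     // satisfy these checks and are rewritten by merge_ldst() into
     //   ldp x1, x2, [sp, #16]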
2720 bool MacroAssembler::ldst_can_merge(Register rt,
2721                                     const Address &adr,
2722                                     size_t cur_size_in_bytes,
2723                                     bool is_store) const {
2724   address prev = pc() - NativeInstruction::instruction_size;
2725   address last = code()->last_insn();
2726 
2727   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2728     return false;
2729   }
2730 
2731   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2732     return false;
2733   }
2734 
2735   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2736   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2737 
2738   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2739   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2740 
2741   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2742     return false;
2743   }
2744 
2745   long max_offset = 63 * prev_size_in_bytes;
2746   long min_offset = -64 * prev_size_in_bytes;
2747 
2748   assert(prev_ldst->is_not_pre_post_index(), "pre-index and post-index accesses cannot be merged.");
2749 
2750   // Only same base can be merged.
2751   if (adr.base() != prev_ldst->base()) {
2752     return false;
2753   }
2754 
2755   long cur_offset = adr.offset();
2756   long prev_offset = prev_ldst->offset();
2757   size_t diff = abs(cur_offset - prev_offset);
2758   if (diff != prev_size_in_bytes) {
2759     return false;
2760   }
2761 
2762   // The following cases cannot be merged:
2763   // ldr x2, [x2, #8]
2764   // ldr x3, [x2, #16]
2765   // or:
2766   // ldr x2, [x3, #8]
2767   // ldr x2, [x3, #16]
2768   // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2769   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2770     return false;
2771   }
2772 
2773   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2774   // The low offset must be within the ldp/stp instruction's immediate range.
2775   if (low_offset > max_offset || low_offset < min_offset) {
2776     return false;
2777   }
2778 
2779   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2780     return true;
2781   }
2782 
2783   return false;
2784 }
2785 
2786 // Merge current load/store with previous load/store into ldp/stp.
2787 void MacroAssembler::merge_ldst(Register rt,
2788                                 const Address &adr,
2789                                 size_t cur_size_in_bytes,
2790                                 bool is_store) {
2791 
2792   assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "cur and prev must be able to be merged.");
2793 
2794   Register rt_low, rt_high;
2795   address prev = pc() - NativeInstruction::instruction_size;
2796   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2797 
2798   long offset;
2799 
2800   if (adr.offset() < prev_ldst->offset()) {
2801     offset = adr.offset();
2802     rt_low = rt;
2803     rt_high = prev_ldst->target();
2804   } else {
2805     offset = prev_ldst->offset();
2806     rt_low = prev_ldst->target();
2807     rt_high = rt;
2808   }
2809 
2810   Address adr_p = Address(prev_ldst->base(), offset);
2811   // Overwrite the previously generated instruction.
2812   code_section()->set_end(prev);
2813 
2814   const int sz = prev_ldst->size_in_bytes();
2815   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
2816   if (!is_store) {
2817     BLOCK_COMMENT("merged ldr pair");
2818     if (sz == 8) {
2819       ldp(rt_low, rt_high, adr_p);
2820     } else {
2821       ldpw(rt_low, rt_high, adr_p);
2822     }
2823   } else {
2824     BLOCK_COMMENT("merged str pair");
2825     if (sz == 8) {
2826       stp(rt_low, rt_high, adr_p);
2827     } else {
2828       stpw(rt_low, rt_high, adr_p);
2829     }
2830   }
2831 }
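// A minimal illustration of the merge above (registers and offsets are
// hypothetical): if the previously emitted instruction was
//   ldr x1, [sp, #16]
// and the current request is
//   ldr x2, [sp, #24]
// then the previous instruction is rewound via set_end() and the pair is
// re-emitted as
//   ldp x1, x2, [sp, #16]
// with the register that loads from the lower offset placed first.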
2832 
2833 /**
2834  * Multiply 64 bit by 64 bit first loop.
2835  */
2836 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2837                                            Register y, Register y_idx, Register z,
2838                                            Register carry, Register product,
2839                                            Register idx, Register kdx) {
2840   //
2841   //  jlong carry, x[], y[], z[];
2842   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2843   //    huge_128 product = y[idx] * x[xstart] + carry;
2844   //    z[kdx] = (jlong)product;
2845   //    carry  = (jlong)(product >>> 64);
2846   //  }
2847   //  z[xstart] = carry;
2848   //
2849 
2850   Label L_first_loop, L_first_loop_exit;
2851   Label L_one_x, L_one_y, L_multiply;
2852 
2853   subsw(xstart, xstart, 1);
2854   br(Assembler::MI, L_one_x);
2855 
2856   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2857   ldr(x_xstart, Address(rscratch1));
2858   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2859 
2860   bind(L_first_loop);
2861   subsw(idx, idx, 1);
2862   br(Assembler::MI, L_first_loop_exit);
2863   subsw(idx, idx, 1);
2864   br(Assembler::MI, L_one_y);
2865   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2866   ldr(y_idx, Address(rscratch1));
2867   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2868   bind(L_multiply);
2869 
2870   // AArch64 has a multiply-accumulate instruction that we can't use
2871   // here because it has no way to process carries, so we have to use
2872   // separate add and adc instructions.  Bah.
2873   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2874   mul(product, x_xstart, y_idx);
2875   adds(product, product, carry);
2876   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2877 
2878   subw(kdx, kdx, 2);
2879   ror(product, product, 32); // back to big-endian
2880   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2881 
2882   b(L_first_loop);
2883 
2884   bind(L_one_y);
2885   ldrw(y_idx, Address(y,  0));
2886   b(L_multiply);
2887 
2888   bind(L_one_x);
2889   ldrw(x_xstart, Address(x,  0));
2890   b(L_first_loop);
2891 
2892   bind(L_first_loop_exit);
2893 }
2894 
2895 /**
2896  * Multiply 128 bit by 128 bit. Unrolled inner loop.
2897  *
2898  */
2899 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2900                                              Register carry, Register carry2,
2901                                              Register idx, Register jdx,
2902                                              Register yz_idx1, Register yz_idx2,
2903                                              Register tmp, Register tmp3, Register tmp4,
2904                                              Register tmp6, Register product_hi) {
2905 
2906   //   jlong carry, x[], y[], z[];
2907   //   int kdx = ystart+1;
2908   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2909   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2910   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2911   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2912   //     carry  = (jlong)(tmp4 >>> 64);
2913   //     z[kdx+idx+1] = (jlong)tmp3;
2914   //     z[kdx+idx] = (jlong)tmp4;
2915   //   }
2916   //   idx += 2;
2917   //   if (idx > 0) {
2918   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2919   //     z[kdx+idx] = (jlong)yz_idx1;
2920   //     carry  = (jlong)(yz_idx1 >>> 64);
2921   //   }
2922   //
2923 
2924   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2925 
2926   lsrw(jdx, idx, 2);
2927 
2928   bind(L_third_loop);
2929 
2930   subsw(jdx, jdx, 1);
2931   br(Assembler::MI, L_third_loop_exit);
2932   subw(idx, idx, 4);
2933 
2934   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2935 
2936   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2937 
2938   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2939 
2940   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2941   ror(yz_idx2, yz_idx2, 32);
2942 
2943   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2944 
2945   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2946   umulh(tmp4, product_hi, yz_idx1);
2947 
2948   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2949   ror(rscratch2, rscratch2, 32);
2950 
2951   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2952   umulh(carry2, product_hi, yz_idx2);
2953 
2954   // propagate sum of both multiplications into carry:tmp4:tmp3
2955   adds(tmp3, tmp3, carry);
2956   adc(tmp4, tmp4, zr);
2957   adds(tmp3, tmp3, rscratch1);
2958   adcs(tmp4, tmp4, tmp);
2959   adc(carry, carry2, zr);
2960   adds(tmp4, tmp4, rscratch2);
2961   adc(carry, carry, zr);
2962 
2963   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2964   ror(tmp4, tmp4, 32);
2965   stp(tmp4, tmp3, Address(tmp6, 0));
2966 
2967   b(L_third_loop);
2968   bind (L_third_loop_exit);
2969 
2970   andw (idx, idx, 0x3);
2971   cbz(idx, L_post_third_loop_done);
2972 
2973   Label L_check_1;
2974   subsw(idx, idx, 2);
2975   br(Assembler::MI, L_check_1);
2976 
2977   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2978   ldr(yz_idx1, Address(rscratch1, 0));
2979   ror(yz_idx1, yz_idx1, 32);
2980   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2981   umulh(tmp4, product_hi, yz_idx1);
2982   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2983   ldr(yz_idx2, Address(rscratch1, 0));
2984   ror(yz_idx2, yz_idx2, 32);
2985 
2986   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2987 
2988   ror(tmp3, tmp3, 32);
2989   str(tmp3, Address(rscratch1, 0));
2990 
2991   bind (L_check_1);
2992 
2993   andw (idx, idx, 0x1);
2994   subsw(idx, idx, 1);
2995   br(Assembler::MI, L_post_third_loop_done);
2996   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2997   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
2998   umulh(carry2, tmp4, product_hi);
2999   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3000 
3001   add2_with_carry(carry2, tmp3, tmp4, carry);
3002 
3003   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3004   extr(carry, carry2, tmp3, 32);
3005 
3006   bind(L_post_third_loop_done);
3007 }
3008 
3009 /**
3010  * Code for BigInteger::multiplyToLen() intrinsic.
3011  *
3012  * r0: x
3013  * r1: xlen
3014  * r2: y
3015  * r3: ylen
3016  * r4:  z
3017  * r5: zlen
3018  * r10: tmp1
3019  * r11: tmp2
3020  * r12: tmp3
3021  * r13: tmp4
3022  * r14: tmp5
3023  * r15: tmp6
3024  * r16: tmp7
3025  *
3026  */
3027 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3028                                      Register z, Register zlen,
3029                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3030                                      Register tmp5, Register tmp6, Register product_hi) {
3031 
3032   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3033 
3034   const Register idx = tmp1;
3035   const Register kdx = tmp2;
3036   const Register xstart = tmp3;
3037 
3038   const Register y_idx = tmp4;
3039   const Register carry = tmp5;
3040   const Register product  = xlen;
3041   const Register x_xstart = zlen;  // reuse register
3042 
3043   // First Loop.
3044   //
3045   //  final static long LONG_MASK = 0xffffffffL;
3046   //  int xstart = xlen - 1;
3047   //  int ystart = ylen - 1;
3048   //  long carry = 0;
3049   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3050   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3051   //    z[kdx] = (int)product;
3052   //    carry = product >>> 32;
3053   //  }
3054   //  z[xstart] = (int)carry;
3055   //
3056 
3057   movw(idx, ylen);      // idx = ylen;
3058   movw(kdx, zlen);      // kdx = xlen+ylen;
3059   mov(carry, zr);       // carry = 0;
3060 
3061   Label L_done;
3062 
3063   movw(xstart, xlen);
3064   subsw(xstart, xstart, 1);
3065   br(Assembler::MI, L_done);
3066 
3067   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3068 
3069   Label L_second_loop;
3070   cbzw(kdx, L_second_loop);
3071 
3072   Label L_carry;
3073   subw(kdx, kdx, 1);
3074   cbzw(kdx, L_carry);
3075 
3076   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3077   lsr(carry, carry, 32);
3078   subw(kdx, kdx, 1);
3079 
3080   bind(L_carry);
3081   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3082 
3083   // Second and third (nested) loops.
3084   //
3085   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3086   //   carry = 0;
3087   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3088   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3089   //                    (z[k] & LONG_MASK) + carry;
3090   //     z[k] = (int)product;
3091   //     carry = product >>> 32;
3092   //   }
3093   //   z[i] = (int)carry;
3094   // }
3095   //
3096   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3097 
3098   const Register jdx = tmp1;
3099 
3100   bind(L_second_loop);
3101   mov(carry, zr);                // carry = 0;
3102   movw(jdx, ylen);               // j = ystart+1
3103 
3104   subsw(xstart, xstart, 1);      // i = xstart-1;
3105   br(Assembler::MI, L_done);
3106 
3107   str(z, Address(pre(sp, -4 * wordSize)));
3108 
3109   Label L_last_x;
3110   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3111   subsw(xstart, xstart, 1);       // i = xstart-1;
3112   br(Assembler::MI, L_last_x);
3113 
3114   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3115   ldr(product_hi, Address(rscratch1));
3116   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3117 
3118   Label L_third_loop_prologue;
3119   bind(L_third_loop_prologue);
3120 
3121   str(ylen, Address(sp, wordSize));
3122   stp(x, xstart, Address(sp, 2 * wordSize));
3123   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3124                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3125   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3126   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3127 
3128   addw(tmp3, xlen, 1);
3129   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3130   subsw(tmp3, tmp3, 1);
3131   br(Assembler::MI, L_done);
3132 
3133   lsr(carry, carry, 32);
3134   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3135   b(L_second_loop);
3136 
3137   // The following infrequent code is moved outside the loops.
3138   bind(L_last_x);
3139   ldrw(product_hi, Address(x,  0));
3140   b(L_third_loop_prologue);
3141 
3142   bind(L_done);
3143 }
3144 
3145 // Code for BigInteger::mulAdd intrinsic
3146 // out     = r0
3147 // in      = r1
3148 // offset  = r2  (already out.length-offset)
3149 // len     = r3
3150 // k       = r4
3151 //
3152 // pseudo code from java implementation:
3153 // carry = 0;
3154 // offset = out.length-offset - 1;
3155 // for (int j=len-1; j >= 0; j--) {
3156 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3157 //     out[offset--] = (int)product;
3158 //     carry = product >>> 32;
3159 // }
3160 // return (int)carry;
3161 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3162       Register len, Register k) {
3163     Label LOOP, END;
3164     // pre-loop
3165     cmp(len, zr); // cmp, not cbz/cbnz: the condition is used twice => fewer branches
3166     csel(out, zr, out, Assembler::EQ);
3167     br(Assembler::EQ, END);
3168     add(in, in, len, LSL, 2); // in[j+1] address
3169     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3170     mov(out, zr); // used to keep carry now
3171     BIND(LOOP);
3172     ldrw(rscratch1, Address(pre(in, -4)));
3173     madd(rscratch1, rscratch1, k, out);
3174     ldrw(rscratch2, Address(pre(offset, -4)));
3175     add(rscratch1, rscratch1, rscratch2);
3176     strw(rscratch1, Address(offset));
3177     lsr(out, rscratch1, 32);
3178     subs(len, len, 1);
3179     br(Assembler::NE, LOOP);
3180     BIND(END);
3181 }
3182 
3183 /**
3184  * Emits code to update CRC-32 with a byte value according to constants in table
3185  *
3186  * @param [in,out]crc   Register containing the crc.
3187  * @param [in]val       Register containing the byte to fold into the CRC.
3188  * @param [in]table     Register containing the table of crc constants.
3189  *
3190  * uint32_t crc;
3191  * val = crc_table[(val ^ crc) & 0xFF];
3192  * crc = val ^ (crc >> 8);
3193  *
3194  */
3195 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3196   eor(val, val, crc);
3197   andr(val, val, 0xff);
3198   ldrw(val, Address(table, val, Address::lsl(2)));
3199   eor(crc, val, crc, Assembler::LSR, 8);
3200 }
3201 
3202 /**
3203  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3204  *
3205  * @param [in,out]crc   Register containing the crc.
3206  * @param [in]v         Register containing the 32-bit value to fold into the CRC.
3207  * @param [in]table0    Register containing table 0 of crc constants.
3208  * @param [in]table1    Register containing table 1 of crc constants.
3209  * @param [in]table2    Register containing table 2 of crc constants.
3210  * @param [in]table3    Register containing table 3 of crc constants.
3211  *
3212  * uint32_t crc;
3213  *   v = crc ^ v
3214  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3215  *
3216  */
3217 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3218         Register table0, Register table1, Register table2, Register table3,
3219         bool upper) {
3220   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3221   uxtb(tmp, v);
3222   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3223   ubfx(tmp, v, 8, 8);
3224   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3225   eor(crc, crc, tmp);
3226   ubfx(tmp, v, 16, 8);
3227   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3228   eor(crc, crc, tmp);
3229   ubfx(tmp, v, 24, 8);
3230   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3231   eor(crc, crc, tmp);
3232 }
3233 
3234 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3235         Register len, Register tmp0, Register tmp1, Register tmp2,
3236         Register tmp3) {
3237     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3238     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3239 
3240     mvnw(crc, crc);
3241 
3242     subs(len, len, 128);
3243     br(Assembler::GE, CRC_by64_pre);
3244   BIND(CRC_less64);
3245     adds(len, len, 128-32);
3246     br(Assembler::GE, CRC_by32_loop);
3247   BIND(CRC_less32);
3248     adds(len, len, 32-4);
3249     br(Assembler::GE, CRC_by4_loop);
3250     adds(len, len, 4);
3251     br(Assembler::GT, CRC_by1_loop);
3252     b(L_exit);
3253 
3254   BIND(CRC_by32_loop);
3255     ldp(tmp0, tmp1, Address(post(buf, 16)));
3256     subs(len, len, 32);
3257     crc32x(crc, crc, tmp0);
3258     ldr(tmp2, Address(post(buf, 8)));
3259     crc32x(crc, crc, tmp1);
3260     ldr(tmp3, Address(post(buf, 8)));
3261     crc32x(crc, crc, tmp2);
3262     crc32x(crc, crc, tmp3);
3263     br(Assembler::GE, CRC_by32_loop);
3264     cmn(len, 32);
3265     br(Assembler::NE, CRC_less32);
3266     b(L_exit);
3267 
3268   BIND(CRC_by4_loop);
3269     ldrw(tmp0, Address(post(buf, 4)));
3270     subs(len, len, 4);
3271     crc32w(crc, crc, tmp0);
3272     br(Assembler::GE, CRC_by4_loop);
3273     adds(len, len, 4);
3274     br(Assembler::LE, L_exit);
3275   BIND(CRC_by1_loop);
3276     ldrb(tmp0, Address(post(buf, 1)));
3277     subs(len, len, 1);
3278     crc32b(crc, crc, tmp0);
3279     br(Assembler::GT, CRC_by1_loop);
3280     b(L_exit);
3281 
3282   BIND(CRC_by64_pre);
3283     sub(buf, buf, 8);
3284     ldp(tmp0, tmp1, Address(buf, 8));
3285     crc32x(crc, crc, tmp0);
3286     ldr(tmp2, Address(buf, 24));
3287     crc32x(crc, crc, tmp1);
3288     ldr(tmp3, Address(buf, 32));
3289     crc32x(crc, crc, tmp2);
3290     ldr(tmp0, Address(buf, 40));
3291     crc32x(crc, crc, tmp3);
3292     ldr(tmp1, Address(buf, 48));
3293     crc32x(crc, crc, tmp0);
3294     ldr(tmp2, Address(buf, 56));
3295     crc32x(crc, crc, tmp1);
3296     ldr(tmp3, Address(pre(buf, 64)));
3297 
3298     b(CRC_by64_loop);
3299 
3300     align(CodeEntryAlignment);
3301   BIND(CRC_by64_loop);
3302     subs(len, len, 64);
3303     crc32x(crc, crc, tmp2);
3304     ldr(tmp0, Address(buf, 8));
3305     crc32x(crc, crc, tmp3);
3306     ldr(tmp1, Address(buf, 16));
3307     crc32x(crc, crc, tmp0);
3308     ldr(tmp2, Address(buf, 24));
3309     crc32x(crc, crc, tmp1);
3310     ldr(tmp3, Address(buf, 32));
3311     crc32x(crc, crc, tmp2);
3312     ldr(tmp0, Address(buf, 40));
3313     crc32x(crc, crc, tmp3);
3314     ldr(tmp1, Address(buf, 48));
3315     crc32x(crc, crc, tmp0);
3316     ldr(tmp2, Address(buf, 56));
3317     crc32x(crc, crc, tmp1);
3318     ldr(tmp3, Address(pre(buf, 64)));
3319     br(Assembler::GE, CRC_by64_loop);
3320 
3321     // post-loop
3322     crc32x(crc, crc, tmp2);
3323     crc32x(crc, crc, tmp3);
3324 
3325     sub(len, len, 64);
3326     add(buf, buf, 8);
3327     cmn(len, 128);
3328     br(Assembler::NE, CRC_less64);
3329   BIND(L_exit);
3330     mvnw(crc, crc);
3331 }
3332 
3333 /**
3334  * @param crc   register containing existing CRC (32-bit)
3335  * @param buf   register pointing to input byte buffer (byte*)
3336  * @param len   register containing number of bytes
3337  * @param table0..table3 registers that will contain the addresses of the CRC tables
3338  * @param tmp, tmp2, tmp3 scratch registers
3339  */
3340 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3341         Register table0, Register table1, Register table2, Register table3,
3342         Register tmp, Register tmp2, Register tmp3) {
3343   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3344   unsigned long offset;
3345 
3346   if (UseCRC32) {
3347       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3348       return;
3349   }
3350 
3351     mvnw(crc, crc);
3352 
3353     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3354     if (offset) add(table0, table0, offset);
3355     add(table1, table0, 1*256*sizeof(juint));
3356     add(table2, table0, 2*256*sizeof(juint));
3357     add(table3, table0, 3*256*sizeof(juint));
3358 
3359   if (UseNeon) {
3360       cmp(len, (u1)64);
3361       br(Assembler::LT, L_by16);
3362       eor(v16, T16B, v16, v16);
3363 
3364     Label L_fold;
3365 
3366       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3367 
3368       ld1(v0, v1, T2D, post(buf, 32));
3369       ld1r(v4, T2D, post(tmp, 8));
3370       ld1r(v5, T2D, post(tmp, 8));
3371       ld1r(v6, T2D, post(tmp, 8));
3372       ld1r(v7, T2D, post(tmp, 8));
3373       mov(v16, T4S, 0, crc);
3374 
3375       eor(v0, T16B, v0, v16);
3376       sub(len, len, 64);
3377 
3378     BIND(L_fold);
3379       pmull(v22, T8H, v0, v5, T8B);
3380       pmull(v20, T8H, v0, v7, T8B);
3381       pmull(v23, T8H, v0, v4, T8B);
3382       pmull(v21, T8H, v0, v6, T8B);
3383 
3384       pmull2(v18, T8H, v0, v5, T16B);
3385       pmull2(v16, T8H, v0, v7, T16B);
3386       pmull2(v19, T8H, v0, v4, T16B);
3387       pmull2(v17, T8H, v0, v6, T16B);
3388 
3389       uzp1(v24, T8H, v20, v22);
3390       uzp2(v25, T8H, v20, v22);
3391       eor(v20, T16B, v24, v25);
3392 
3393       uzp1(v26, T8H, v16, v18);
3394       uzp2(v27, T8H, v16, v18);
3395       eor(v16, T16B, v26, v27);
3396 
3397       ushll2(v22, T4S, v20, T8H, 8);
3398       ushll(v20, T4S, v20, T4H, 8);
3399 
3400       ushll2(v18, T4S, v16, T8H, 8);
3401       ushll(v16, T4S, v16, T4H, 8);
3402 
3403       eor(v22, T16B, v23, v22);
3404       eor(v18, T16B, v19, v18);
3405       eor(v20, T16B, v21, v20);
3406       eor(v16, T16B, v17, v16);
3407 
3408       uzp1(v17, T2D, v16, v20);
3409       uzp2(v21, T2D, v16, v20);
3410       eor(v17, T16B, v17, v21);
3411 
3412       ushll2(v20, T2D, v17, T4S, 16);
3413       ushll(v16, T2D, v17, T2S, 16);
3414 
3415       eor(v20, T16B, v20, v22);
3416       eor(v16, T16B, v16, v18);
3417 
3418       uzp1(v17, T2D, v20, v16);
3419       uzp2(v21, T2D, v20, v16);
3420       eor(v28, T16B, v17, v21);
3421 
3422       pmull(v22, T8H, v1, v5, T8B);
3423       pmull(v20, T8H, v1, v7, T8B);
3424       pmull(v23, T8H, v1, v4, T8B);
3425       pmull(v21, T8H, v1, v6, T8B);
3426 
3427       pmull2(v18, T8H, v1, v5, T16B);
3428       pmull2(v16, T8H, v1, v7, T16B);
3429       pmull2(v19, T8H, v1, v4, T16B);
3430       pmull2(v17, T8H, v1, v6, T16B);
3431 
3432       ld1(v0, v1, T2D, post(buf, 32));
3433 
3434       uzp1(v24, T8H, v20, v22);
3435       uzp2(v25, T8H, v20, v22);
3436       eor(v20, T16B, v24, v25);
3437 
3438       uzp1(v26, T8H, v16, v18);
3439       uzp2(v27, T8H, v16, v18);
3440       eor(v16, T16B, v26, v27);
3441 
3442       ushll2(v22, T4S, v20, T8H, 8);
3443       ushll(v20, T4S, v20, T4H, 8);
3444 
3445       ushll2(v18, T4S, v16, T8H, 8);
3446       ushll(v16, T4S, v16, T4H, 8);
3447 
3448       eor(v22, T16B, v23, v22);
3449       eor(v18, T16B, v19, v18);
3450       eor(v20, T16B, v21, v20);
3451       eor(v16, T16B, v17, v16);
3452 
3453       uzp1(v17, T2D, v16, v20);
3454       uzp2(v21, T2D, v16, v20);
3455       eor(v16, T16B, v17, v21);
3456 
3457       ushll2(v20, T2D, v16, T4S, 16);
3458       ushll(v16, T2D, v16, T2S, 16);
3459 
3460       eor(v20, T16B, v22, v20);
3461       eor(v16, T16B, v16, v18);
3462 
3463       uzp1(v17, T2D, v20, v16);
3464       uzp2(v21, T2D, v20, v16);
3465       eor(v20, T16B, v17, v21);
3466 
3467       shl(v16, T2D, v28, 1);
3468       shl(v17, T2D, v20, 1);
3469 
3470       eor(v0, T16B, v0, v16);
3471       eor(v1, T16B, v1, v17);
3472 
3473       subs(len, len, 32);
3474       br(Assembler::GE, L_fold);
3475 
3476       mov(crc, 0);
3477       mov(tmp, v0, T1D, 0);
3478       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3479       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3480       mov(tmp, v0, T1D, 1);
3481       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3482       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3483       mov(tmp, v1, T1D, 0);
3484       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3485       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3486       mov(tmp, v1, T1D, 1);
3487       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3488       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3489 
3490       add(len, len, 32);
3491   }
3492 
3493   BIND(L_by16);
3494     subs(len, len, 16);
3495     br(Assembler::GE, L_by16_loop);
3496     adds(len, len, 16-4);
3497     br(Assembler::GE, L_by4_loop);
3498     adds(len, len, 4);
3499     br(Assembler::GT, L_by1_loop);
3500     b(L_exit);
3501 
3502   BIND(L_by4_loop);
3503     ldrw(tmp, Address(post(buf, 4)));
3504     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3505     subs(len, len, 4);
3506     br(Assembler::GE, L_by4_loop);
3507     adds(len, len, 4);
3508     br(Assembler::LE, L_exit);
3509   BIND(L_by1_loop);
3510     subs(len, len, 1);
3511     ldrb(tmp, Address(post(buf, 1)));
3512     update_byte_crc32(crc, tmp, table0);
3513     br(Assembler::GT, L_by1_loop);
3514     b(L_exit);
3515 
3516     align(CodeEntryAlignment);
3517   BIND(L_by16_loop);
3518     subs(len, len, 16);
3519     ldp(tmp, tmp3, Address(post(buf, 16)));
3520     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3521     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3522     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3523     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3524     br(Assembler::GE, L_by16_loop);
3525     adds(len, len, 16-4);
3526     br(Assembler::GE, L_by4_loop);
3527     adds(len, len, 4);
3528     br(Assembler::GT, L_by1_loop);
3529   BIND(L_exit);
3530     mvnw(crc, crc);
3531 }
3532 
3533 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3534         Register len, Register tmp0, Register tmp1, Register tmp2,
3535         Register tmp3) {
3536     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3537     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3538 
3539     subs(len, len, 128);
3540     br(Assembler::GE, CRC_by64_pre);
3541   BIND(CRC_less64);
3542     adds(len, len, 128-32);
3543     br(Assembler::GE, CRC_by32_loop);
3544   BIND(CRC_less32);
3545     adds(len, len, 32-4);
3546     br(Assembler::GE, CRC_by4_loop);
3547     adds(len, len, 4);
3548     br(Assembler::GT, CRC_by1_loop);
3549     b(L_exit);
3550 
3551   BIND(CRC_by32_loop);
3552     ldp(tmp0, tmp1, Address(post(buf, 16)));
3553     subs(len, len, 32);
3554     crc32cx(crc, crc, tmp0);
3555     ldr(tmp2, Address(post(buf, 8)));
3556     crc32cx(crc, crc, tmp1);
3557     ldr(tmp3, Address(post(buf, 8)));
3558     crc32cx(crc, crc, tmp2);
3559     crc32cx(crc, crc, tmp3);
3560     br(Assembler::GE, CRC_by32_loop);
3561     cmn(len, 32);
3562     br(Assembler::NE, CRC_less32);
3563     b(L_exit);
3564 
3565   BIND(CRC_by4_loop);
3566     ldrw(tmp0, Address(post(buf, 4)));
3567     subs(len, len, 4);
3568     crc32cw(crc, crc, tmp0);
3569     br(Assembler::GE, CRC_by4_loop);
3570     adds(len, len, 4);
3571     br(Assembler::LE, L_exit);
3572   BIND(CRC_by1_loop);
3573     ldrb(tmp0, Address(post(buf, 1)));
3574     subs(len, len, 1);
3575     crc32cb(crc, crc, tmp0);
3576     br(Assembler::GT, CRC_by1_loop);
3577     b(L_exit);
3578 
3579   BIND(CRC_by64_pre);
3580     sub(buf, buf, 8);
3581     ldp(tmp0, tmp1, Address(buf, 8));
3582     crc32cx(crc, crc, tmp0);
3583     ldr(tmp2, Address(buf, 24));
3584     crc32cx(crc, crc, tmp1);
3585     ldr(tmp3, Address(buf, 32));
3586     crc32cx(crc, crc, tmp2);
3587     ldr(tmp0, Address(buf, 40));
3588     crc32cx(crc, crc, tmp3);
3589     ldr(tmp1, Address(buf, 48));
3590     crc32cx(crc, crc, tmp0);
3591     ldr(tmp2, Address(buf, 56));
3592     crc32cx(crc, crc, tmp1);
3593     ldr(tmp3, Address(pre(buf, 64)));
3594 
3595     b(CRC_by64_loop);
3596 
3597     align(CodeEntryAlignment);
3598   BIND(CRC_by64_loop);
3599     subs(len, len, 64);
3600     crc32cx(crc, crc, tmp2);
3601     ldr(tmp0, Address(buf, 8));
3602     crc32cx(crc, crc, tmp3);
3603     ldr(tmp1, Address(buf, 16));
3604     crc32cx(crc, crc, tmp0);
3605     ldr(tmp2, Address(buf, 24));
3606     crc32cx(crc, crc, tmp1);
3607     ldr(tmp3, Address(buf, 32));
3608     crc32cx(crc, crc, tmp2);
3609     ldr(tmp0, Address(buf, 40));
3610     crc32cx(crc, crc, tmp3);
3611     ldr(tmp1, Address(buf, 48));
3612     crc32cx(crc, crc, tmp0);
3613     ldr(tmp2, Address(buf, 56));
3614     crc32cx(crc, crc, tmp1);
3615     ldr(tmp3, Address(pre(buf, 64)));
3616     br(Assembler::GE, CRC_by64_loop);
3617 
3618     // post-loop
3619     crc32cx(crc, crc, tmp2);
3620     crc32cx(crc, crc, tmp3);
3621 
3622     sub(len, len, 64);
3623     add(buf, buf, 8);
3624     cmn(len, 128);
3625     br(Assembler::NE, CRC_less64);
3626   BIND(L_exit);
3627 }
3628 
3629 /**
3630  * @param crc   register containing existing CRC (32-bit)
3631  * @param buf   register pointing to input byte buffer (byte*)
3632  * @param len   register containing number of bytes
3633  * @param table0..table3 registers that will contain the addresses of the CRC tables
3634  * @param tmp, tmp2, tmp3 scratch registers
3635  */
3636 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3637         Register table0, Register table1, Register table2, Register table3,
3638         Register tmp, Register tmp2, Register tmp3) {
3639   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3640 }
3641 
3642 
3643 SkipIfEqual::SkipIfEqual(
3644     MacroAssembler* masm, const bool* flag_addr, bool value) {
3645   _masm = masm;
3646   unsigned long offset;
3647   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3648   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3649   _masm->cbzw(rscratch1, _label);
3650 }
3651 
3652 SkipIfEqual::~SkipIfEqual() {
3653   _masm->bind(_label);
3654 }
3655 
3656 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3657   Address adr;
3658   switch(dst.getMode()) {
3659   case Address::base_plus_offset:
3660     // This is the expected mode, although we allow all the other
3661     // forms below.
3662     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3663     break;
3664   default:
3665     lea(rscratch2, dst);
3666     adr = Address(rscratch2);
3667     break;
3668   }
3669   ldr(rscratch1, adr);
3670   add(rscratch1, rscratch1, src);
3671   str(rscratch1, adr);
3672 }
3673 
3674 void MacroAssembler::cmpptr(Register src1, Address src2) {
3675   unsigned long offset;
3676   adrp(rscratch1, src2, offset);
3677   ldr(rscratch1, Address(rscratch1, offset));
3678   cmp(src1, rscratch1);
3679 }
3680 
3681 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3682   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3683   bs->obj_equals(this, obj1, obj2);
3684 }
3685 
3686 void MacroAssembler::load_klass(Register dst, Register src) {
3687   if (UseCompressedClassPointers) {
3688     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3689     decode_klass_not_null(dst);
3690   } else {
3691     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3692   }
3693 }
3694 
3695 // ((OopHandle)result).resolve();
3696 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3697   // OopHandle::resolve is an indirection.
3698   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3699 }
3700 
3701 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3702   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3703   ldr(dst, Address(rmethod, Method::const_offset()));
3704   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3705   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3706   ldr(dst, Address(dst, mirror_offset));
3707   resolve_oop_handle(dst, tmp);
3708 }
3709 
3710 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3711   if (UseCompressedClassPointers) {
3712     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3713     if (CompressedKlassPointers::base() == NULL) {
3714       cmp(trial_klass, tmp, LSL, CompressedKlassPointers::shift());
3715       return;
3716     } else if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
3717                && CompressedKlassPointers::shift() == 0) {
3718       // Only the bottom 32 bits matter
3719       cmpw(trial_klass, tmp);
3720       return;
3721     }
3722     decode_klass_not_null(tmp);
3723   } else {
3724     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3725   }
3726   cmp(trial_klass, tmp);
3727 }
3728 
3729 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3730   load_klass(dst, src);
3731   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3732 }
3733 
3734 void MacroAssembler::store_klass(Register dst, Register src) {
3735   // FIXME: Should this be a store release?  Concurrent GCs assume the
3736   // klass length is valid if the klass field is not null.
3737   if (UseCompressedClassPointers) {
3738     encode_klass_not_null(src);
3739     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3740   } else {
3741     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3742   }
3743 }
3744 
3745 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3746   if (UseCompressedClassPointers) {
3747     // Store to klass gap in destination
3748     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3749   }
3750 }
3751 
3752 // Algorithm must match CompressedOops::encode.
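// As a rough sketch (not the authoritative definition, which lives in
// CompressedOops::encode):
//   narrow = (oop == NULL) ? 0 : (oop - base) >> shift
// With a NULL base the subtraction disappears; otherwise the NULL case is
// handled by the csel below, which forces the result to zero whenever the
// subtraction borrows.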
3753 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3754 #ifdef ASSERT
3755   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3756 #endif
3757   verify_oop(s, "broken oop in encode_heap_oop");
3758   if (CompressedOops::base() == NULL) {
3759     if (CompressedOops::shift() != 0) {
3760       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3761       lsr(d, s, LogMinObjAlignmentInBytes);
3762     } else {
3763       mov(d, s);
3764     }
3765   } else {
3766     subs(d, s, rheapbase);
3767     csel(d, d, zr, Assembler::HS);
3768     lsr(d, d, LogMinObjAlignmentInBytes);
3769 
3770     /*  Old algorithm: is this any worse?
3771     Label nonnull;
3772     cbnz(r, nonnull);
3773     sub(r, r, rheapbase);
3774     bind(nonnull);
3775     lsr(r, r, LogMinObjAlignmentInBytes);
3776     */
3777   }
3778 }
3779 
3780 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3781 #ifdef ASSERT
3782   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3783   if (CheckCompressedOops) {
3784     Label ok;
3785     cbnz(r, ok);
3786     stop("null oop passed to encode_heap_oop_not_null");
3787     bind(ok);
3788   }
3789 #endif
3790   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3791   if (CompressedOops::base() != NULL) {
3792     sub(r, r, rheapbase);
3793   }
3794   if (CompressedOops::shift() != 0) {
3795     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3796     lsr(r, r, LogMinObjAlignmentInBytes);
3797   }
3798 }
3799 
3800 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3801 #ifdef ASSERT
3802   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3803   if (CheckCompressedOops) {
3804     Label ok;
3805     cbnz(src, ok);
3806     stop("null oop passed to encode_heap_oop_not_null2");
3807     bind(ok);
3808   }
3809 #endif
3810   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3811 
3812   Register data = src;
3813   if (CompressedOops::base() != NULL) {
3814     sub(dst, src, rheapbase);
3815     data = dst;
3816   }
3817   if (CompressedOops::shift() != 0) {
3818     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3819     lsr(dst, data, LogMinObjAlignmentInBytes);
3820     data = dst;
3821   }
3822   if (data == src)
3823     mov(dst, src);
3824 }
3825 
3826 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3827 #ifdef ASSERT
3828   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3829 #endif
3830   if (CompressedOops::base() == NULL) {
3831     if (CompressedOops::shift() != 0 || d != s) {
3832       lsl(d, s, CompressedOops::shift());
3833     }
3834   } else {
3835     Label done;
3836     if (d != s)
3837       mov(d, s);
3838     cbz(s, done);
3839     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3840     bind(done);
3841   }
3842   verify_oop(d, "broken oop in decode_heap_oop");
3843 }
3844 
3845 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3846   assert (UseCompressedOops, "should only be used for compressed headers");
3847   assert (Universe::heap() != NULL, "java heap should be initialized");
3848   // Cannot assert, unverified entry point counts instructions (see .ad file)
3849   // vtableStubs also counts instructions in pd_code_size_limit.
3850   // Also do not verify_oop as this is called by verify_oop.
3851   if (CompressedOops::shift() != 0) {
3852     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3853     if (CompressedOops::base() != NULL) {
3854       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3855     } else {
3856       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3857     }
3858   } else {
3859     assert (CompressedOops::base() == NULL, "sanity");
3860   }
3861 }
3862 
3863 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3864   assert (UseCompressedOops, "should only be used for compressed headers");
3865   assert (Universe::heap() != NULL, "java heap should be initialized");
3866   // Cannot assert, unverified entry point counts instructions (see .ad file)
3867   // vtableStubs also counts instructions in pd_code_size_limit.
3868   // Also do not verify_oop as this is called by verify_oop.
3869   if (CompressedOops::shift() != 0) {
3870     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3871     if (CompressedOops::base() != NULL) {
3872       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3873     } else {
3874       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3875     }
3876   } else {
3877     assert (CompressedOops::base() == NULL, "sanity");
3878     if (dst != src) {
3879       mov(dst, src);
3880     }
3881   }
3882 }
3883 
3884 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3885   if (CompressedKlassPointers::base() == NULL) {
3886     if (CompressedKlassPointers::shift() != 0) {
3887       assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
3888       lsr(dst, src, LogKlassAlignmentInBytes);
3889     } else {
3890       if (dst != src) mov(dst, src);
3891     }
3892     return;
3893   }
3894 
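  // The XOR variant below assumes the class space is mapped so that the bits
  // of CompressedKlassPointers::base() never overlap the bits of any offset
  // into the class space (a property established where
  // use_XOR_for_compressed_class_base is computed).  Under that assumption a
  // klass pointer is simply base | offset, so eor with the base recovers the
  // offset.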
3895   if (use_XOR_for_compressed_class_base) {
3896     if (CompressedKlassPointers::shift() != 0) {
3897       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
3898       lsr(dst, dst, LogKlassAlignmentInBytes);
3899     } else {
3900       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
3901     }
3902     return;
3903   }
3904 
3905   if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
3906       && CompressedKlassPointers::shift() == 0) {
3907     movw(dst, src);
3908     return;
3909   }
3910 
3911 #ifdef ASSERT
3912   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3913 #endif
3914 
3915   Register rbase = dst;
3916   if (dst == src) rbase = rheapbase;
3917   mov(rbase, (uint64_t)CompressedKlassPointers::base());
3918   sub(dst, src, rbase);
3919   if (CompressedKlassPointers::shift() != 0) {
3920     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
3921     lsr(dst, dst, LogKlassAlignmentInBytes);
3922   }
3923   if (dst == src) reinit_heapbase();
3924 }
3925 
3926 void MacroAssembler::encode_klass_not_null(Register r) {
3927   encode_klass_not_null(r, r);
3928 }
3929 
3930 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3931   Register rbase = dst;
3932   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3933 
3934   if (CompressedKlassPointers::base() == NULL) {
3935     if (CompressedKlassPointers::shift() != 0) {
3936       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
3937       lsl(dst, src, LogKlassAlignmentInBytes);
3938     } else {
3939       if (dst != src) mov(dst, src);
3940     }
3941     return;
3942   }
3943 
3944   if (use_XOR_for_compressed_class_base) {
3945     if (CompressedKlassPointers::shift() != 0) {
3946       lsl(dst, src, LogKlassAlignmentInBytes);
3947       eor(dst, dst, (uint64_t)CompressedKlassPointers::base());
3948     } else {
3949       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
3950     }
3951     return;
3952   }
3953 
3954   if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
3955       && CompressedKlassPointers::shift() == 0) {
3956     if (dst != src)
3957       movw(dst, src);
3958     movk(dst, (uint64_t)CompressedKlassPointers::base() >> 32, 32);
3959     return;
3960   }
3961 
3962   // Cannot assert, unverified entry point counts instructions (see .ad file)
3963   // vtableStubs also counts instructions in pd_code_size_limit.
3964   // Also do not verify_oop as this is called by verify_oop.
3965   if (dst == src) rbase = rheapbase;
3966   mov(rbase, (uint64_t)CompressedKlassPointers::base());
3967   if (CompressedKlassPointers::shift() != 0) {
3968     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
3969     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
3970   } else {
3971     add(dst, rbase, src);
3972   }
3973   if (dst == src) reinit_heapbase();
3974 }
3975 
3976 void  MacroAssembler::decode_klass_not_null(Register r) {
3977   decode_klass_not_null(r, r);
3978 }
3979 
3980 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3981 #ifdef ASSERT
3982   {
3983     ThreadInVMfromUnknown tiv;
3984     assert (UseCompressedOops, "should only be used for compressed oops");
3985     assert (Universe::heap() != NULL, "java heap should be initialized");
3986     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3987     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3988   }
3989 #endif
3990   int oop_index = oop_recorder()->find_index(obj);
3991   InstructionMark im(this);
3992   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3993   code_section()->relocate(inst_mark(), rspec);
3994   movz(dst, 0xDEAD, 16);
3995   movk(dst, 0xBEEF);
3996 }
3997 
3998 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3999   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4000   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4001   int index = oop_recorder()->find_index(k);
4002   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
4003 
4004   InstructionMark im(this);
4005   RelocationHolder rspec = metadata_Relocation::spec(index);
4006   code_section()->relocate(inst_mark(), rspec);
4007   narrowKlass nk = CompressedKlassPointers::encode(k);
4008   movz(dst, (nk >> 16), 16);
4009   movk(dst, nk & 0xffff);
4010 }
4011 
4012 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4013                                     Register dst, Address src,
4014                                     Register tmp1, Register thread_tmp) {
4015   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4016   decorators = AccessInternal::decorator_fixup(decorators);
4017   bool as_raw = (decorators & AS_RAW) != 0;
4018   if (as_raw) {
4019     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4020   } else {
4021     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4022   }
4023 }
4024 
4025 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4026                                      Address dst, Register src,
4027                                      Register tmp1, Register thread_tmp) {
4028   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4029   decorators = AccessInternal::decorator_fixup(decorators);
4030   bool as_raw = (decorators & AS_RAW) != 0;
4031   if (as_raw) {
4032     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4033   } else {
4034     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4035   }
4036 }
4037 
4038 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
4039   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
4040   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
4041     decorators |= ACCESS_READ | ACCESS_WRITE;
4042   }
4043   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4044   return bs->resolve(this, decorators, obj);
4045 }
4046 
4047 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4048                                    Register thread_tmp, DecoratorSet decorators) {
4049   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4050 }
4051 
4052 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4053                                             Register thread_tmp, DecoratorSet decorators) {
4054   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4055 }
4056 
4057 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4058                                     Register thread_tmp, DecoratorSet decorators) {
4059   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4060 }
4061 
4062 // Used for storing NULLs.
4063 void MacroAssembler::store_heap_oop_null(Address dst) {
4064   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4065 }
4066 
4067 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4068   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4069   int index = oop_recorder()->allocate_metadata_index(obj);
4070   RelocationHolder rspec = metadata_Relocation::spec(index);
4071   return Address((address)obj, rspec);
4072 }
4073 
4074 // Move an oop into a register.  immediate is true if we want
4075 // immediate instructions, i.e. we are not going to patch this
4076 // instruction while the code is being executed by another thread.  In
4077 // that case we can use move immediates rather than the constant pool.
4078 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4079   int oop_index;
4080   if (obj == NULL) {
4081     oop_index = oop_recorder()->allocate_oop_index(obj);
4082   } else {
4083 #ifdef ASSERT
4084     {
4085       ThreadInVMfromUnknown tiv;
4086       assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
4087     }
4088 #endif
4089     oop_index = oop_recorder()->find_index(obj);
4090   }
4091   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4092   if (! immediate) {
4093     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4094     ldr_constant(dst, Address(dummy, rspec));
4095   } else
4096     mov(dst, Address((address)obj, rspec));
4097 }
4098 
4099 // Move a metadata address into a register.
4100 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4101   int oop_index;
4102   if (obj == NULL) {
4103     oop_index = oop_recorder()->allocate_metadata_index(obj);
4104   } else {
4105     oop_index = oop_recorder()->find_index(obj);
4106   }
4107   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4108   mov(dst, Address((address)obj, rspec));
4109 }
4110 
4111 Address MacroAssembler::constant_oop_address(jobject obj) {
4112 #ifdef ASSERT
4113   {
4114     ThreadInVMfromUnknown tiv;
4115     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4116     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
4117   }
4118 #endif
4119   int oop_index = oop_recorder()->find_index(obj);
4120   return Address((address)obj, oop_Relocation::spec(oop_index));
4121 }
4122 
4123 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4124 void MacroAssembler::tlab_allocate(Register obj,
4125                                    Register var_size_in_bytes,
4126                                    int con_size_in_bytes,
4127                                    Register t1,
4128                                    Register t2,
4129                                    Label& slow_case) {
4130   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4131   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4132 }
4133 
4134 // Defines obj, preserves var_size_in_bytes
4135 void MacroAssembler::eden_allocate(Register obj,
4136                                    Register var_size_in_bytes,
4137                                    int con_size_in_bytes,
4138                                    Register t1,
4139                                    Label& slow_case) {
4140   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4141   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4142 }
4143 
4144 // Zero words; len is in bytes
4145 // Destroys all registers except addr
4146 // len must be a nonzero multiple of wordSize
4147 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4148   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4149 
4150 #ifdef ASSERT
4151   { Label L;
4152     tst(len, BytesPerWord - 1);
4153     br(Assembler::EQ, L);
4154     stop("len is not a multiple of BytesPerWord");
4155     bind(L);
4156   }
4157 #endif
4158 
4159 #ifndef PRODUCT
4160   block_comment("zero memory");
4161 #endif
4162 
4163   Label loop;
4164   Label entry;
4165 
4166 //  Algorithm:
4167 //
4168 //    scratch1 = cnt & 7;
4169 //    cnt -= scratch1;
4170 //    p += scratch1;
4171 //    switch (scratch1) {
4172 //      do {
4173 //        cnt -= 8;
4174 //          p[-8] = 0;
4175 //        case 7:
4176 //          p[-7] = 0;
4177 //        case 6:
4178 //          p[-6] = 0;
4179 //          // ...
4180 //        case 1:
4181 //          p[-1] = 0;
4182 //        case 0:
4183 //          p += 8;
4184 //      } while (cnt);
4185 //    }
4186 
4187   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4188 
4189   lsr(len, len, LogBytesPerWord);
4190   andr(rscratch1, len, unroll - 1);  // scratch1 = cnt % unroll
4191   sub(len, len, rscratch1);      // cnt -= (cnt % unroll)
4192   // t1 always points to the end of the region we're about to zero
4193   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
4194   adr(rscratch2, entry);
4195   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4196   br(rscratch2);
4197   bind(loop);
4198   sub(len, len, unroll);
4199   for (int i = -unroll; i < 0; i++)
4200     Assembler::str(zr, Address(t1, i * wordSize));
4201   bind(entry);
4202   add(t1, t1, unroll * wordSize);
4203   cbnz(len, loop);
4204 }
4205 
4206 void MacroAssembler::verify_tlab() {
4207 #ifdef ASSERT
4208   if (UseTLAB && VerifyOops) {
4209     Label next, ok;
4210 
4211     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4212 
4213     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4214     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4215     cmp(rscratch2, rscratch1);
4216     br(Assembler::HS, next);
4217     STOP("assert(top >= start)");
4218     should_not_reach_here();
4219 
4220     bind(next);
4221     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4222     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4223     cmp(rscratch2, rscratch1);
4224     br(Assembler::HS, ok);
4225     STOP("assert(top <= end)");
4226     should_not_reach_here();
4227 
4228     bind(ok);
4229     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4230   }
4231 #endif
4232 }
4233 
4234 // Writes to successive stack pages, one page at a time, until the given size
4235 // is reached, to check for stack overflow + shadow pages.  This clobbers tmp.
4236 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4237   assert_different_registers(tmp, size, rscratch1);
4238   mov(tmp, sp);
4239   // Bang stack for total size given plus shadow page size.
4240   // Bang one page at a time because large size can bang beyond yellow and
4241   // red zones.
4242   Label loop;
4243   mov(rscratch1, os::vm_page_size());
4244   bind(loop);
4245   lea(tmp, Address(tmp, -os::vm_page_size()));
4246   subsw(size, size, rscratch1);
4247   str(size, Address(tmp));
4248   br(Assembler::GT, loop);
4249 
4250   // Bang down shadow pages too.
4251   // At this point, (tmp-0) is the last address touched, so don't
4252   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4253   // was post-decremented.)  Skip this address by starting at i=1, and
4254   // touch a few more pages below.  N.B.  It is important to touch all
4255   // the way down to and including i=StackShadowPages.
4256   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
4257     // This could be any sized move, but since it can serve as a debugging
4258     // crumb, the bigger the better.
4259     lea(tmp, Address(tmp, -os::vm_page_size()));
4260     str(size, Address(tmp));
4261   }
4262 }
4263 
4264 
4265 // Move the address of the polling page into dest.
4266 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4267   if (SafepointMechanism::uses_thread_local_poll()) {
4268     ldr(dest, Address(rthread, Thread::polling_page_offset()));
4269   } else {
4270     unsigned long off;
4271     adrp(dest, Address(page, rtype), off);
4272     assert(off == 0, "polling page must be page aligned");
4273   }
4274 }
4275 
4276 // Move the address of the polling page into r, then read the polling
4277 // page.
4278 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4279   get_polling_page(r, page, rtype);
4280   return read_polling_page(r, rtype);
4281 }
4282 
4283 // Read the polling page.  The address of the polling page must
4284 // already be in r.
4285 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4286   InstructionMark im(this);
4287   code_section()->relocate(inst_mark(), rtype);
4288   ldrw(zr, Address(r, 0));
4289   return inst_mark();
4290 }
4291 
4292 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4293   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4294   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4295   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4296   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4297   long offset_low = dest_page - low_page;
4298   long offset_high = dest_page - high_page;
4299 
4300   assert(is_valid_AArch64_address(dest.target()), "bad address");
4301   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4302 
4303   InstructionMark im(this);
4304   code_section()->relocate(inst_mark(), dest.rspec());
4305   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4306   // the code cache so that if it is relocated we know it will still reach
4307   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4308     _adrp(reg1, dest.target());
4309   } else {
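    // adrp can only reach +/- 4GB from the PC.  When the target page is out of
    // range we instead form an address whose low 32 bits are the target's and
    // whose bits 32..47 are copied from the current PC, so the adrp below is
    // always in range; the movk then rewrites bits 32..47 to the target's
    // value.  The low 12 bits are returned via byte_offset as usual.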
4310     unsigned long target = (unsigned long)dest.target();
4311     unsigned long adrp_target
4312       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4313 
4314     _adrp(reg1, (address)adrp_target);
4315     movk(reg1, target >> 32, 32);
4316   }
4317   byte_offset = (unsigned long)dest.target() & 0xfff;
4318 }
4319 
4320 void MacroAssembler::load_byte_map_base(Register reg) {
4321   CardTable::CardValue* byte_map_base =
4322     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4323 
4324   if (is_valid_AArch64_address((address)byte_map_base)) {
4325     // Strictly speaking the byte_map_base isn't an address at all,
4326     // and it might even be negative.
4327     unsigned long offset;
4328     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4329     // We expect offset to be zero with most collectors.
4330     if (offset != 0) {
4331       add(reg, reg, offset);
4332     }
4333   } else {
4334     mov(reg, (uint64_t)byte_map_base);
4335   }
4336 }
4337 
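// Set up a stack frame of the given byte size: save rfp and lr and drop sp.
// Small frames can address the saved pair with stp's scaled immediate; larger
// frames store the pair first and then lower sp, using an immediate when the
// amount fits the 12-bit add/sub encoding and rscratch1 otherwise.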
4338 void MacroAssembler::build_frame(int framesize) {
4339   assert(framesize > 0, "framesize must be > 0");
4340   if (framesize < ((1 << 9) + 2 * wordSize)) {
4341     sub(sp, sp, framesize);
4342     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4343     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4344   } else {
4345     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4346     if (PreserveFramePointer) mov(rfp, sp);
4347     if (framesize < ((1 << 12) + 2 * wordSize))
4348       sub(sp, sp, framesize - 2 * wordSize);
4349     else {
4350       mov(rscratch1, framesize - 2 * wordSize);
4351       sub(sp, sp, rscratch1);
4352     }
4353   }
4354 }
4355 
4356 void MacroAssembler::remove_frame(int framesize) {
4357   assert(framesize > 0, "framesize must be > 0");
4358   if (framesize < ((1 << 9) + 2 * wordSize)) {
4359     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4360     add(sp, sp, framesize);
4361   } else {
4362     if (framesize < ((1 << 12) + 2 * wordSize))
4363       add(sp, sp, framesize - 2 * wordSize);
4364     else {
4365       mov(rscratch1, framesize - 2 * wordSize);
4366       add(sp, sp, rscratch1);
4367     }
4368     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4369   }
4370 }
4371 
4372 #ifdef COMPILER2
4373 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4374 
4375 // Search for str1 in str2 and return index or -1
4376 void MacroAssembler::string_indexof(Register str2, Register str1,
4377                                     Register cnt2, Register cnt1,
4378                                     Register tmp1, Register tmp2,
4379                                     Register tmp3, Register tmp4,
4380                                     Register tmp5, Register tmp6,
4381                                     int icnt1, Register result, int ae) {
4382   // NOTE: tmp5, tmp6 can be zr depending on specific method version
4383   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4384 
4385   Register ch1 = rscratch1;
4386   Register ch2 = rscratch2;
4387   Register cnt1tmp = tmp1;
4388   Register cnt2tmp = tmp2;
4389   Register cnt1_neg = cnt1;
4390   Register cnt2_neg = cnt2;
4391   Register result_tmp = tmp4;
4392 
4393   bool isL = ae == StrIntrinsicNode::LL;
4394 
4395   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4396   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4397   int str1_chr_shift = str1_isL ? 0:1;
4398   int str2_chr_shift = str2_isL ? 0:1;
4399   int str1_chr_size = str1_isL ? 1:2;
4400   int str2_chr_size = str2_isL ? 1:2;
4401   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4402                                       (chr_insn)&MacroAssembler::ldrh;
4403   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4404                                       (chr_insn)&MacroAssembler::ldrh;
4405   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4406   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4407 
4408   // Note, inline_string_indexOf() generates checks:
4409   // if (substr.count > string.count) return -1;
4410   // if (substr.count == 0) return 0;
4411 
  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the first occurrence of the pattern in the source or return -1.

  // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
  // With a small pattern and source we use a linear scan.
4417 
4418   if (icnt1 == -1) {
4419     sub(result_tmp, cnt2, cnt1);
4420     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4421     br(LT, LINEARSEARCH);
4422     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
4423     subs(zr, cnt1, 256);
4424     lsr(tmp1, cnt2, 2);
4425     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
4426     br(GE, LINEARSTUB);
4427   }
4428 
// The Boyer-Moore algorithm is based on the description here:
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few java-specific optimizations.
4447 //
4448 // #define ASIZE 256
4449 //
4450 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4451 //       int i, j;
4452 //       unsigned c;
4453 //       unsigned char bc[ASIZE];
4454 //
4455 //       /* Preprocessing */
4456 //       for (i = 0; i < ASIZE; ++i)
4457 //          bc[i] = m;
4458 //       for (i = 0; i < m - 1; ) {
4459 //          c = x[i];
4460 //          ++i;
4461 //          // c < 256 for Latin1 string, so, no need for branch
4462 //          #ifdef PATTERN_STRING_IS_LATIN1
4463 //          bc[c] = m - i;
4464 //          #else
4465 //          if (c < ASIZE) bc[c] = m - i;
4466 //          #endif
4467 //       }
4468 //
4469 //       /* Searching */
4470 //       j = 0;
4471 //       while (j <= n - m) {
//          c = y[j+m-1];
4473 //          if (x[m-1] == c)
4474 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
4475 //          if (i < 0) return j;
4476 //          // c < 256 for Latin1 string, so, no need for branch
4477 //          #ifdef SOURCE_STRING_IS_LATIN1
4478 //          // LL case: (c< 256) always true. Remove branch
4479 //          j += bc[y[j+m-1]];
4480 //          #endif
4481 //          #ifndef PATTERN_STRING_IS_UTF
4482 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
4483 //          if (c < ASIZE)
4484 //            j += bc[y[j+m-1]];
4485 //          else
4486 //            j += 1
4487 //          #endif
4488 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
4489 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
4490 //          if (c < ASIZE)
4491 //            j += bc[y[j+m-1]];
4492 //          else
4493 //            j += m
4494 //          #endif
4495 //       }
4496 //    }
4497 
4498   if (icnt1 == -1) {
4499     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4500         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4501     Register cnt1end = tmp2;
4502     Register str2end = cnt2;
4503     Register skipch = tmp2;
4504 
    // str1 length is >= 8, so we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU chars) and half a
    // register for the UL case. We'll re-read the last character in the inner
    // pre-loop code to have a single outer pre-loop load.
4509     const int firstStep = isL ? 7 : 3;
4510 
4511     const int ASIZE = 256;
4512     const int STORED_BYTES = 32; // amount of bytes stored per instruction
4513     sub(sp, sp, ASIZE);
4514     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4515     mov(ch1, sp);
4516     BIND(BM_INIT_LOOP);
4517       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4518       subs(tmp5, tmp5, 1);
4519       br(GT, BM_INIT_LOOP);
4520 
4521       sub(cnt1tmp, cnt1, 1);
4522       mov(tmp5, str2);
4523       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4524       sub(ch2, cnt1, 1);
4525       mov(tmp3, str1);
4526     BIND(BCLOOP);
4527       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4528       if (!str1_isL) {
4529         subs(zr, ch1, ASIZE);
4530         br(HS, BCSKIP);
4531       }
4532       strb(ch2, Address(sp, ch1));
4533     BIND(BCSKIP);
4534       subs(ch2, ch2, 1);
4535       br(GT, BCLOOP);
4536 
4537       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4538       if (str1_isL == str2_isL) {
4539         // load last 8 bytes (8LL/4UU symbols)
4540         ldr(tmp6, Address(tmp6, -wordSize));
4541       } else {
4542         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
4543         // convert Latin1 to UTF. We'll have to wait until load completed, but
4544         // it's still faster than per-character loads+checks
4545         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4546         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4547         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4548         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4549         orr(ch2, ch1, ch2, LSL, 16);
4550         orr(tmp6, tmp6, tmp3, LSL, 48);
4551         orr(tmp6, tmp6, ch2, LSL, 16);
4552       }
4553     BIND(BMLOOPSTR2);
4554       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4555       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4556       if (str1_isL == str2_isL) {
        // re-init tmp3. It's free because it's executed in parallel with the
        // load above. The alternative is to initialize it before the loop, but
        // that would hurt performance on in-order systems with 2 or more ld/st pipelines.
4560         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4561       }
4562       if (!isL) { // UU/UL case
4563         lsl(ch2, cnt1tmp, 1); // offset in bytes
4564       }
4565       cmp(tmp3, skipch);
4566       br(NE, BMSKIP);
4567       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4568       mov(ch1, tmp6);
4569       if (isL) {
4570         b(BMLOOPSTR1_AFTER_LOAD);
4571       } else {
4572         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4573         b(BMLOOPSTR1_CMP);
4574       }
4575     BIND(BMLOOPSTR1);
4576       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4577       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4578     BIND(BMLOOPSTR1_AFTER_LOAD);
4579       subs(cnt1tmp, cnt1tmp, 1);
4580       br(LT, BMLOOPSTR1_LASTCMP);
4581     BIND(BMLOOPSTR1_CMP);
4582       cmp(ch1, ch2);
4583       br(EQ, BMLOOPSTR1);
4584     BIND(BMSKIP);
4585       if (!isL) {
        // if we've met a UTF symbol while searching for a Latin1 pattern, then
        // we can skip cnt1 symbols
4588         if (str1_isL != str2_isL) {
4589           mov(result_tmp, cnt1);
4590         } else {
4591           mov(result_tmp, 1);
4592         }
4593         subs(zr, skipch, ASIZE);
4594         br(HS, BMADV);
4595       }
4596       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4597     BIND(BMADV);
4598       sub(cnt1tmp, cnt1, 1);
4599       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4600       cmp(str2, str2end);
4601       br(LE, BMLOOPSTR2);
4602       add(sp, sp, ASIZE);
4603       b(NOMATCH);
4604     BIND(BMLOOPSTR1_LASTCMP);
4605       cmp(ch1, ch2);
4606       br(NE, BMSKIP);
4607     BIND(BMMATCH);
4608       sub(result, str2, tmp5);
4609       if (!str2_isL) lsr(result, result, 1);
4610       add(sp, sp, ASIZE);
4611       b(DONE);
4612 
4613     BIND(LINEARSTUB);
4614     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
4615     br(LT, LINEAR_MEDIUM);
4616     mov(result, zr);
4617     RuntimeAddress stub = NULL;
4618     if (isL) {
4619       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4620       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4621     } else if (str1_isL) {
4622       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4624     } else {
4625       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4626       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4627     }
4628     trampoline_call(stub);
4629     b(DONE);
4630   }
4631 
4632   BIND(LINEARSEARCH);
4633   {
4634     Label DO1, DO2, DO3;
4635 
4636     Register str2tmp = tmp2;
4637     Register first = tmp3;
4638 
4639     if (icnt1 == -1)
4640     {
4641         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4642 
4643         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
4644         br(LT, DOSHORT);
4645       BIND(LINEAR_MEDIUM);
4646         (this->*str1_load_1chr)(first, Address(str1));
4647         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4648         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4649         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4650         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4651 
4652       BIND(FIRST_LOOP);
4653         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4654         cmp(first, ch2);
4655         br(EQ, STR1_LOOP);
4656       BIND(STR2_NEXT);
4657         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4658         br(LE, FIRST_LOOP);
4659         b(NOMATCH);
4660 
4661       BIND(STR1_LOOP);
4662         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4663         add(cnt2tmp, cnt2_neg, str2_chr_size);
4664         br(GE, MATCH);
4665 
4666       BIND(STR1_NEXT);
4667         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4668         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4669         cmp(ch1, ch2);
4670         br(NE, STR2_NEXT);
4671         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4672         add(cnt2tmp, cnt2tmp, str2_chr_size);
4673         br(LT, STR1_NEXT);
4674         b(MATCH);
4675 
4676       BIND(DOSHORT);
4677       if (str1_isL == str2_isL) {
4678         cmp(cnt1, (u1)2);
4679         br(LT, DO1);
4680         br(GT, DO3);
4681       }
4682     }
4683 
4684     if (icnt1 == 4) {
4685       Label CH1_LOOP;
4686 
4687         (this->*load_4chr)(ch1, str1);
4688         sub(result_tmp, cnt2, 4);
4689         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4690         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4691 
4692       BIND(CH1_LOOP);
4693         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4694         cmp(ch1, ch2);
4695         br(EQ, MATCH);
4696         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4697         br(LE, CH1_LOOP);
4698         b(NOMATCH);
4699       }
4700 
4701     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4702       Label CH1_LOOP;
4703 
4704       BIND(DO2);
4705         (this->*load_2chr)(ch1, str1);
4706         if (icnt1 == 2) {
4707           sub(result_tmp, cnt2, 2);
4708         }
4709         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4710         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4711       BIND(CH1_LOOP);
4712         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4713         cmp(ch1, ch2);
4714         br(EQ, MATCH);
4715         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4716         br(LE, CH1_LOOP);
4717         b(NOMATCH);
4718     }
4719 
4720     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4721       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4722 
4723       BIND(DO3);
4724         (this->*load_2chr)(first, str1);
4725         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4726         if (icnt1 == 3) {
4727           sub(result_tmp, cnt2, 3);
4728         }
4729         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4730         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4731       BIND(FIRST_LOOP);
4732         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4733         cmpw(first, ch2);
4734         br(EQ, STR1_LOOP);
4735       BIND(STR2_NEXT);
4736         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4737         br(LE, FIRST_LOOP);
4738         b(NOMATCH);
4739 
4740       BIND(STR1_LOOP);
4741         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4742         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4743         cmp(ch1, ch2);
4744         br(NE, STR2_NEXT);
4745         b(MATCH);
4746     }
4747 
4748     if (icnt1 == -1 || icnt1 == 1) {
4749       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4750 
4751       BIND(DO1);
4752         (this->*str1_load_1chr)(ch1, str1);
4753         cmp(cnt2, (u1)8);
4754         br(LT, DO1_SHORT);
4755 
4756         sub(result_tmp, cnt2, 8/str2_chr_size);
4757         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4758         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4759         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4760 
4761         if (str2_isL) {
4762           orr(ch1, ch1, ch1, LSL, 8);
4763         }
4764         orr(ch1, ch1, ch1, LSL, 16);
4765         orr(ch1, ch1, ch1, LSL, 32);
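        // The loop below uses the classic SWAR "zero lane" test: ch1 holds the
        // pattern character replicated into every byte (or halfword) lane.
        // After the eor, a lane that matched becomes zero, and
        // (x - 0x01..01) & ~(x | 0x7f..7f) is non-zero exactly when some lane
        // of x is zero, so a single bics detects a match anywhere in the word.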
4766       BIND(CH1_LOOP);
4767         ldr(ch2, Address(str2, cnt2_neg));
4768         eor(ch2, ch1, ch2);
4769         sub(tmp1, ch2, tmp3);
4770         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4771         bics(tmp1, tmp1, tmp2);
4772         br(NE, HAS_ZERO);
4773         adds(cnt2_neg, cnt2_neg, 8);
4774         br(LT, CH1_LOOP);
4775 
4776         cmp(cnt2_neg, (u1)8);
4777         mov(cnt2_neg, 0);
4778         br(LT, CH1_LOOP);
4779         b(NOMATCH);
4780 
4781       BIND(HAS_ZERO);
4782         rev(tmp1, tmp1);
4783         clz(tmp1, tmp1);
4784         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4785         b(MATCH);
4786 
4787       BIND(DO1_SHORT);
4788         mov(result_tmp, cnt2);
4789         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4790         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4791       BIND(DO1_LOOP);
4792         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4793         cmpw(ch1, ch2);
4794         br(EQ, MATCH);
4795         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4796         br(LT, DO1_LOOP);
4797     }
4798   }
4799   BIND(NOMATCH);
4800     mov(result, -1);
4801     b(DONE);
4802   BIND(MATCH);
4803     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4804   BIND(DONE);
4805 }
4806 
4807 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4808 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4809 
4810 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4811                                          Register ch, Register result,
4812                                          Register tmp1, Register tmp2, Register tmp3)
4813 {
4814   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4815   Register cnt1_neg = cnt1;
4816   Register ch1 = rscratch1;
4817   Register result_tmp = rscratch2;
4818 
4819   cmp(cnt1, (u1)4);
4820   br(LT, DO1_SHORT);
4821 
4822   orr(ch, ch, ch, LSL, 16);
4823   orr(ch, ch, ch, LSL, 32);
4824 
4825   sub(cnt1, cnt1, 4);
4826   mov(result_tmp, cnt1);
4827   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4828   sub(cnt1_neg, zr, cnt1, LSL, 1);
4829 
4830   mov(tmp3, 0x0001000100010001);
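  // Same SWAR trick as in string_indexof: the eor makes matching halfword
  // lanes zero, and (x - 0x0001..) & ~(x | 0x7fff..) is non-zero exactly when
  // some lane is zero, so the bics below detects a match anywhere in the word.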
4831 
4832   BIND(CH1_LOOP);
4833     ldr(ch1, Address(str1, cnt1_neg));
4834     eor(ch1, ch, ch1);
4835     sub(tmp1, ch1, tmp3);
4836     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4837     bics(tmp1, tmp1, tmp2);
4838     br(NE, HAS_ZERO);
4839     adds(cnt1_neg, cnt1_neg, 8);
4840     br(LT, CH1_LOOP);
4841 
4842     cmp(cnt1_neg, (u1)8);
4843     mov(cnt1_neg, 0);
4844     br(LT, CH1_LOOP);
4845     b(NOMATCH);
4846 
4847   BIND(HAS_ZERO);
4848     rev(tmp1, tmp1);
4849     clz(tmp1, tmp1);
4850     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4851     b(MATCH);
4852 
4853   BIND(DO1_SHORT);
4854     mov(result_tmp, cnt1);
4855     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4856     sub(cnt1_neg, zr, cnt1, LSL, 1);
4857   BIND(DO1_LOOP);
4858     ldrh(ch1, Address(str1, cnt1_neg));
4859     cmpw(ch, ch1);
4860     br(EQ, MATCH);
4861     adds(cnt1_neg, cnt1_neg, 2);
4862     br(LT, DO1_LOOP);
4863   BIND(NOMATCH);
4864     mov(result, -1);
4865     b(DONE);
4866   BIND(MATCH);
4867     add(result, result_tmp, cnt1_neg, ASR, 1);
4868   BIND(DONE);
4869 }
4870 
4871 // Compare strings.
4872 void MacroAssembler::string_compare(Register str1, Register str2,
4873     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4874     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4875   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4876       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4877       SHORT_LOOP_START, TAIL_CHECK;
4878 
4879   const u1 STUB_THRESHOLD = 64 + 8;
4880   bool isLL = ae == StrIntrinsicNode::LL;
4881   bool isLU = ae == StrIntrinsicNode::LU;
4882   bool isUL = ae == StrIntrinsicNode::UL;
4883 
4884   bool str1_isL = isLL || isLU;
4885   bool str2_isL = isLL || isUL;
4886 
4887   int str1_chr_shift = str1_isL ? 0 : 1;
4888   int str2_chr_shift = str2_isL ? 0 : 1;
4889   int str1_chr_size = str1_isL ? 1 : 2;
4890   int str2_chr_size = str2_isL ? 1 : 2;
4891   int minCharsInWord = isLL ? wordSize : wordSize/2;
4892 
4893   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4894   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4895                                       (chr_insn)&MacroAssembler::ldrh;
4896   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4897                                       (chr_insn)&MacroAssembler::ldrh;
4898   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4899                             (uxt_insn)&MacroAssembler::uxthw;
4900 
4901   BLOCK_COMMENT("string_compare {");
4902 
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
4905   if (!str1_isL) asrw(cnt1, cnt1, 1);
4906   if (!str2_isL) asrw(cnt2, cnt2, 1);
4907 
4908   // Compute the minimum of the string lengths and save the difference.
4909   subsw(result, cnt1, cnt2);
4910   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4911 
4912   // A very short string
4913   cmpw(cnt2, minCharsInWord);
4914   br(Assembler::LE, SHORT_STRING);
4915 
4916   // Compare longwords
4917   // load first parts of strings and finish initialization while loading
4918   {
4919     if (str1_isL == str2_isL) { // LL or UU
4920       ldr(tmp1, Address(str1));
4921       cmp(str1, str2);
4922       br(Assembler::EQ, DONE);
4923       ldr(tmp2, Address(str2));
4924       cmp(cnt2, STUB_THRESHOLD);
4925       br(GE, STUB);
4926       subsw(cnt2, cnt2, minCharsInWord);
4927       br(EQ, TAIL_CHECK);
4928       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4929       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4930       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4931     } else if (isLU) {
4932       ldrs(vtmp, Address(str1));
4933       cmp(str1, str2);
4934       br(Assembler::EQ, DONE);
4935       ldr(tmp2, Address(str2));
4936       cmp(cnt2, STUB_THRESHOLD);
4937       br(GE, STUB);
4938       subw(cnt2, cnt2, 4);
4939       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4940       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4941       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4942       zip1(vtmp, T8B, vtmp, vtmpZ);
4943       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4944       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4945       add(cnt1, cnt1, 4);
4946       fmovd(tmp1, vtmp);
4947     } else { // UL case
4948       ldr(tmp1, Address(str1));
4949       cmp(str1, str2);
4950       br(Assembler::EQ, DONE);
4951       ldrs(vtmp, Address(str2));
4952       cmp(cnt2, STUB_THRESHOLD);
4953       br(GE, STUB);
4954       subw(cnt2, cnt2, 4);
4955       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4956       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4957       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4958       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4959       zip1(vtmp, T8B, vtmp, vtmpZ);
4960       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4961       add(cnt1, cnt1, 8);
4962       fmovd(tmp2, vtmp);
4963     }
4964     adds(cnt2, cnt2, isUL ? 4 : 8);
4965     br(GE, TAIL);
4966     eor(rscratch2, tmp1, tmp2);
4967     cbnz(rscratch2, DIFFERENCE);
4968     // main loop
4969     bind(NEXT_WORD);
4970     if (str1_isL == str2_isL) {
4971       ldr(tmp1, Address(str1, cnt2));
4972       ldr(tmp2, Address(str2, cnt2));
4973       adds(cnt2, cnt2, 8);
4974     } else if (isLU) {
4975       ldrs(vtmp, Address(str1, cnt1));
4976       ldr(tmp2, Address(str2, cnt2));
4977       add(cnt1, cnt1, 4);
4978       zip1(vtmp, T8B, vtmp, vtmpZ);
4979       fmovd(tmp1, vtmp);
4980       adds(cnt2, cnt2, 8);
4981     } else { // UL
4982       ldrs(vtmp, Address(str2, cnt2));
4983       ldr(tmp1, Address(str1, cnt1));
4984       zip1(vtmp, T8B, vtmp, vtmpZ);
4985       add(cnt1, cnt1, 8);
4986       fmovd(tmp2, vtmp);
4987       adds(cnt2, cnt2, 4);
4988     }
4989     br(GE, TAIL);
4990 
4991     eor(rscratch2, tmp1, tmp2);
4992     cbz(rscratch2, NEXT_WORD);
4993     b(DIFFERENCE);
4994     bind(TAIL);
4995     eor(rscratch2, tmp1, tmp2);
4996     cbnz(rscratch2, DIFFERENCE);
4997     // Last longword.  In the case where length == 4 we compare the
4998     // same longword twice, but that's still faster than another
4999     // conditional branch.
5000     if (str1_isL == str2_isL) {
5001       ldr(tmp1, Address(str1));
5002       ldr(tmp2, Address(str2));
5003     } else if (isLU) {
5004       ldrs(vtmp, Address(str1));
5005       ldr(tmp2, Address(str2));
5006       zip1(vtmp, T8B, vtmp, vtmpZ);
5007       fmovd(tmp1, vtmp);
5008     } else { // UL
5009       ldrs(vtmp, Address(str2));
5010       ldr(tmp1, Address(str1));
5011       zip1(vtmp, T8B, vtmp, vtmpZ);
5012       fmovd(tmp2, vtmp);
5013     }
5014     bind(TAIL_CHECK);
5015     eor(rscratch2, tmp1, tmp2);
5016     cbz(rscratch2, DONE);
5017 
5018     // Find the first different characters in the longwords and
5019     // compute their difference.
5020     bind(DIFFERENCE);
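    // rscratch2 = tmp1 ^ tmp2.  Byte-reverse it and count leading zeros to get
    // the bit offset of the first (lowest-addressed) differing byte, round
    // that down to a character boundary, then shift both words right so the
    // differing characters land in the low bits before subtracting them.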
5021     rev(rscratch2, rscratch2);
5022     clz(rscratch2, rscratch2);
5023     andr(rscratch2, rscratch2, isLL ? -8 : -16);
5024     lsrv(tmp1, tmp1, rscratch2);
5025     (this->*ext_chr)(tmp1, tmp1);
5026     lsrv(tmp2, tmp2, rscratch2);
5027     (this->*ext_chr)(tmp2, tmp2);
5028     subw(result, tmp1, tmp2);
5029     b(DONE);
5030   }
5031 
5032   bind(STUB);
5033     RuntimeAddress stub = NULL;
5034     switch(ae) {
5035       case StrIntrinsicNode::LL:
5036         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
5037         break;
5038       case StrIntrinsicNode::UU:
5039         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
5040         break;
5041       case StrIntrinsicNode::LU:
5042         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
5043         break;
5044       case StrIntrinsicNode::UL:
5045         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
5046         break;
5047       default:
5048         ShouldNotReachHere();
5049      }
5050     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
5051     trampoline_call(stub);
5052     b(DONE);
5053 
5054   bind(SHORT_STRING);
5055   // Is the minimum length zero?
5056   cbz(cnt2, DONE);
  // Arrange the code to do most branches while loading, and to load the next
  // characters while comparing the previous ones.
5059   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5060   subs(cnt2, cnt2, 1);
5061   br(EQ, SHORT_LAST_INIT);
5062   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5063   b(SHORT_LOOP_START);
5064   bind(SHORT_LOOP);
5065   subs(cnt2, cnt2, 1);
5066   br(EQ, SHORT_LAST);
5067   bind(SHORT_LOOP_START);
5068   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
5069   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
5070   cmp(tmp1, cnt1);
5071   br(NE, SHORT_LOOP_TAIL);
5072   subs(cnt2, cnt2, 1);
5073   br(EQ, SHORT_LAST2);
5074   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5075   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5076   cmp(tmp2, rscratch1);
5077   br(EQ, SHORT_LOOP);
5078   sub(result, tmp2, rscratch1);
5079   b(DONE);
5080   bind(SHORT_LOOP_TAIL);
5081   sub(result, tmp1, cnt1);
5082   b(DONE);
5083   bind(SHORT_LAST2);
5084   cmp(tmp2, rscratch1);
5085   br(EQ, DONE);
5086   sub(result, tmp2, rscratch1);
5087 
5088   b(DONE);
5089   bind(SHORT_LAST_INIT);
5090   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5091   bind(SHORT_LAST);
5092   cmp(tmp1, cnt1);
5093   br(EQ, DONE);
5094   sub(result, tmp1, cnt1);
5095 
5096   bind(DONE);
5097 
5098   BLOCK_COMMENT("} string_compare");
5099 }
5100 #endif // COMPILER2
5101 
// This method checks whether the provided byte array contains a byte with the highest bit set.
5103 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
    // The simple and most common case of a small aligned array that is not at
    // the end of a memory page is handled here. All other cases go to the stub.
5106     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
5107     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5108     assert_different_registers(ary1, len, result);
5109 
5110     cmpw(len, 0);
5111     br(LE, SET_RESULT);
5112     cmpw(len, 4 * wordSize);
5113     br(GE, STUB_LONG); // size > 32 then go to stub
5114 
5115     int shift = 64 - exact_log2(os::vm_page_size());
5116     lsl(rscratch1, ary1, shift);
5117     mov(rscratch2, (size_t)(4 * wordSize) << shift);
5118     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
5119     br(CS, STUB); // at the end of page then go to stub
5120     subs(len, len, wordSize);
5121     br(LT, END);
5122 
5123   BIND(LOOP);
5124     ldr(rscratch1, Address(post(ary1, wordSize)));
5125     tst(rscratch1, UPPER_BIT_MASK);
5126     br(NE, SET_RESULT);
5127     subs(len, len, wordSize);
5128     br(GE, LOOP);
5129     cmpw(len, -wordSize);
5130     br(EQ, SET_RESULT);
5131 
5132   BIND(END);
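    // Handle the last partial word: len is negative here (remaining - wordSize),
    // so load a full 8 bytes and shift left by -len * 8 bits; the bytes lying
    // beyond the array fall off the top and only valid bytes reach the tst.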
5133     ldr(result, Address(ary1));
5134     sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5135     lslv(result, result, len);
5136     tst(result, UPPER_BIT_MASK);
5137     b(SET_RESULT);
5138 
5139   BIND(STUB);
5140     RuntimeAddress has_neg =  RuntimeAddress(StubRoutines::aarch64::has_negatives());
5141     assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5142     trampoline_call(has_neg);
5143     b(DONE);
5144 
5145   BIND(STUB_LONG);
5146     RuntimeAddress has_neg_long =  RuntimeAddress(
5147             StubRoutines::aarch64::has_negatives_long());
5148     assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
5149     trampoline_call(has_neg_long);
5150     b(DONE);
5151 
5152   BIND(SET_RESULT);
5153     cset(result, NE); // set true or false
5154 
5155   BIND(DONE);
5156 }
5157 
5158 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5159                                    Register tmp4, Register tmp5, Register result,
5160                                    Register cnt1, int elem_size) {
5161   Label DONE, SAME;
5162   Register tmp1 = rscratch1;
5163   Register tmp2 = rscratch2;
5164   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5165   int elem_per_word = wordSize/elem_size;
5166   int log_elem_size = exact_log2(elem_size);
5167   int length_offset = arrayOopDesc::length_offset_in_bytes();
5168   int base_offset
5169     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5170   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5171 
5172   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5173   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5174 
5175 #ifndef PRODUCT
5176   {
5177     const char kind = (elem_size == 2) ? 'U' : 'L';
5178     char comment[64];
5179     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5180     BLOCK_COMMENT(comment);
5181   }
5182 #endif
5183 
5184   // if (a1 == a2)
5185   //     return true;
5186   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5187   br(EQ, SAME);
5188 
5189   if (UseSimpleArrayEquals) {
5190     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5191     // if (a1 == null || a2 == null)
5192     //     return false;
    // (a1 & a2) == 0 means that some pointer is null or that we have hit
    // very-rare-or-even-probably-impossible pointer values,
    // so we can save one branch in most cases
5196     tst(a1, a2);
5197     mov(result, false);
5198     br(EQ, A_MIGHT_BE_NULL);
5199     // if (a1.length != a2.length)
5200     //      return false;
5201     bind(A_IS_NOT_NULL);
5202     ldrw(cnt1, Address(a1, length_offset));
5203     ldrw(cnt2, Address(a2, length_offset));
5204     eorw(tmp5, cnt1, cnt2);
5205     cbnzw(tmp5, DONE);
5206     lea(a1, Address(a1, base_offset));
5207     lea(a2, Address(a2, base_offset));
5208     // Check for short strings, i.e. smaller than wordSize.
5209     subs(cnt1, cnt1, elem_per_word);
5210     br(Assembler::LT, SHORT);
5211     // Main 8 byte comparison loop.
5212     bind(NEXT_WORD); {
5213       ldr(tmp1, Address(post(a1, wordSize)));
5214       ldr(tmp2, Address(post(a2, wordSize)));
5215       subs(cnt1, cnt1, elem_per_word);
5216       eor(tmp5, tmp1, tmp2);
5217       cbnz(tmp5, DONE);
5218     } br(GT, NEXT_WORD);
5219     // Last longword.  In the case where length == 4 we compare the
5220     // same longword twice, but that's still faster than another
5221     // conditional branch.
5222     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5223     // length == 4.
5224     if (log_elem_size > 0)
5225       lsl(cnt1, cnt1, log_elem_size);
5226     ldr(tmp3, Address(a1, cnt1));
5227     ldr(tmp4, Address(a2, cnt1));
5228     eor(tmp5, tmp3, tmp4);
5229     cbnz(tmp5, DONE);
5230     b(SAME);
5231     bind(A_MIGHT_BE_NULL);
5232     // in case both a1 and a2 are not-null, proceed with loads
5233     cbz(a1, DONE);
5234     cbz(a2, DONE);
5235     b(A_IS_NOT_NULL);
5236     bind(SHORT);
5237 
5238     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5239     {
5240       ldrw(tmp1, Address(post(a1, 4)));
5241       ldrw(tmp2, Address(post(a2, 4)));
5242       eorw(tmp5, tmp1, tmp2);
5243       cbnzw(tmp5, DONE);
5244     }
5245     bind(TAIL03);
5246     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5247     {
5248       ldrh(tmp3, Address(post(a1, 2)));
5249       ldrh(tmp4, Address(post(a2, 2)));
5250       eorw(tmp5, tmp3, tmp4);
5251       cbnzw(tmp5, DONE);
5252     }
5253     bind(TAIL01);
5254     if (elem_size == 1) { // Only needed when comparing byte arrays.
5255       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5256       {
5257         ldrb(tmp1, a1);
5258         ldrb(tmp2, a2);
5259         eorw(tmp5, tmp1, tmp2);
5260         cbnzw(tmp5, DONE);
5261       }
5262     }
5263   } else {
5264     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
5265         CSET_EQ, LAST_CHECK;
5266     mov(result, false);
5267     cbz(a1, DONE);
5268     ldrw(cnt1, Address(a1, length_offset));
5269     cbz(a2, DONE);
5270     ldrw(cnt2, Address(a2, length_offset));
    // On most CPUs a2 is still "locked" (surprisingly) by the ldrw above, so
    // it's faster to perform another branch before comparing a1 and a2.
5273     cmp(cnt1, (u1)elem_per_word);
5274     br(LE, SHORT); // short or same
5275     ldr(tmp3, Address(pre(a1, base_offset)));
5276     subs(zr, cnt1, stubBytesThreshold);
5277     br(GE, STUB);
5278     ldr(tmp4, Address(pre(a2, base_offset)));
5279     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5280     cmp(cnt2, cnt1);
5281     br(NE, DONE);
5282 
5283     // Main 16 byte comparison loop with 2 exits
5284     bind(NEXT_DWORD); {
5285       ldr(tmp1, Address(pre(a1, wordSize)));
5286       ldr(tmp2, Address(pre(a2, wordSize)));
5287       subs(cnt1, cnt1, 2 * elem_per_word);
5288       br(LE, TAIL);
5289       eor(tmp4, tmp3, tmp4);
5290       cbnz(tmp4, DONE);
5291       ldr(tmp3, Address(pre(a1, wordSize)));
5292       ldr(tmp4, Address(pre(a2, wordSize)));
5293       cmp(cnt1, (u1)elem_per_word);
5294       br(LE, TAIL2);
5295       cmp(tmp1, tmp2);
5296     } br(EQ, NEXT_DWORD);
5297     b(DONE);
5298 
5299     bind(TAIL);
5300     eor(tmp4, tmp3, tmp4);
5301     eor(tmp2, tmp1, tmp2);
5302     lslv(tmp2, tmp2, tmp5);
5303     orr(tmp5, tmp4, tmp2);
5304     cmp(tmp5, zr);
5305     b(CSET_EQ);
5306 
5307     bind(TAIL2);
5308     eor(tmp2, tmp1, tmp2);
5309     cbnz(tmp2, DONE);
5310     b(LAST_CHECK);
5311 
5312     bind(STUB);
5313     ldr(tmp4, Address(pre(a2, base_offset)));
5314     cmp(cnt2, cnt1);
5315     br(NE, DONE);
5316     if (elem_size == 2) { // convert to byte counter
5317       lsl(cnt1, cnt1, 1);
5318     }
5319     eor(tmp5, tmp3, tmp4);
5320     cbnz(tmp5, DONE);
5321     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5322     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5323     trampoline_call(stub);
5324     b(DONE);
5325 
5326     bind(EARLY_OUT);
    // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2),
    // so if a2 == null we return false (0), else true; either way we can return a2.
5329     mov(result, a2);
5330     b(DONE);
5331     bind(SHORT);
5332     cmp(cnt2, cnt1);
5333     br(NE, DONE);
5334     cbz(cnt1, SAME);
5335     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5336     ldr(tmp3, Address(a1, base_offset));
5337     ldr(tmp4, Address(a2, base_offset));
5338     bind(LAST_CHECK);
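    // tmp5 holds the negated array length in bits; lslv uses the shift amount
    // modulo 64, so shifting the xor left by tmp5 discards the bytes of the
    // last loaded word that lie beyond the arrays' contents, leaving only
    // in-range data for the final equality test.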
5339     eor(tmp4, tmp3, tmp4);
5340     lslv(tmp5, tmp4, tmp5);
5341     cmp(tmp5, zr);
5342     bind(CSET_EQ);
5343     cset(result, EQ);
5344     b(DONE);
5345   }
5346 
5347   bind(SAME);
5348   mov(result, true);
5349   // That's it.
5350   bind(DONE);
5351 
5352   BLOCK_COMMENT("} array_equals");
5353 }
5354 
5355 // Compare Strings
5356 
5357 // For Strings we're passed the address of the first characters in a1
5358 // and a2 and the length in cnt1.
5359 // elem_size is the element size in bytes: either 1 or 2.
5360 // There are two implementations.  For arrays >= 8 bytes, all
5361 // comparisons (including the final one, which may overlap) are
// performed 8 bytes at a time.  For strings < 8 bytes, we compare a
// word, then a halfword, and then a byte.
5364 
5365 void MacroAssembler::string_equals(Register a1, Register a2,
5366                                    Register result, Register cnt1, int elem_size)
5367 {
5368   Label SAME, DONE, SHORT, NEXT_WORD;
5369   Register tmp1 = rscratch1;
5370   Register tmp2 = rscratch2;
5371   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5372 
5373   assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
5374   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5375 
5376 #ifndef PRODUCT
5377   {
5378     const char kind = (elem_size == 2) ? 'U' : 'L';
5379     char comment[64];
5380     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5381     BLOCK_COMMENT(comment);
5382   }
5383 #endif
5384 
5385   mov(result, false);
5386 
5387   // Check for short strings, i.e. smaller than wordSize.
5388   subs(cnt1, cnt1, wordSize);
5389   br(Assembler::LT, SHORT);
5390   // Main 8 byte comparison loop.
5391   bind(NEXT_WORD); {
5392     ldr(tmp1, Address(post(a1, wordSize)));
5393     ldr(tmp2, Address(post(a2, wordSize)));
5394     subs(cnt1, cnt1, wordSize);
5395     eor(tmp1, tmp1, tmp2);
5396     cbnz(tmp1, DONE);
5397   } br(GT, NEXT_WORD);
5398   // Last longword.  In the case where length == 4 we compare the
5399   // same longword twice, but that's still faster than another
5400   // conditional branch.
5401   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5402   // length == 4.
5403   ldr(tmp1, Address(a1, cnt1));
5404   ldr(tmp2, Address(a2, cnt1));
5405   eor(tmp2, tmp1, tmp2);
5406   cbnz(tmp2, DONE);
5407   b(SAME);
5408 
5409   bind(SHORT);
5410   Label TAIL03, TAIL01;
5411 
5412   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5413   {
5414     ldrw(tmp1, Address(post(a1, 4)));
5415     ldrw(tmp2, Address(post(a2, 4)));
5416     eorw(tmp1, tmp1, tmp2);
5417     cbnzw(tmp1, DONE);
5418   }
5419   bind(TAIL03);
5420   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5421   {
5422     ldrh(tmp1, Address(post(a1, 2)));
5423     ldrh(tmp2, Address(post(a2, 2)));
5424     eorw(tmp1, tmp1, tmp2);
5425     cbnzw(tmp1, DONE);
5426   }
5427   bind(TAIL01);
5428   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5429     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5430     {
5431       ldrb(tmp1, a1);
5432       ldrb(tmp2, a2);
5433       eorw(tmp1, tmp1, tmp2);
5434       cbnzw(tmp1, DONE);
5435     }
5436   }
5437   // Arrays are equal.
5438   bind(SAME);
5439   mov(result, true);
5440 
5441   // That's it.
5442   bind(DONE);
5443   BLOCK_COMMENT("} string_equals");
5444 }
5445 
5446 
5447 // The size of the blocks erased by the zero_blocks stub.  We must
5448 // handle anything smaller than this ourselves in zero_words().
5449 const int MacroAssembler::zero_words_block_size = 8;
5450 
5451 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5452 // possible, handling small word counts locally and delegating
5453 // anything larger to the zero_blocks stub.  It is expanded many times
5454 // in compiled code, so it is important to keep it short.
5455 
5456 // ptr:   Address of a buffer to be zeroed.
5457 // cnt:   Count in HeapWords.
5458 //
5459 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5460 void MacroAssembler::zero_words(Register ptr, Register cnt)
5461 {
5462   assert(is_power_of_2(zero_words_block_size), "adjust this");
5463   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5464 
5465   BLOCK_COMMENT("zero_words {");
5466   cmp(cnt, (u1)zero_words_block_size);
5467   Label around;
5468   br(LO, around);
5469   {
5470     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5471     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5472     if (StubRoutines::aarch64::complete()) {
5473       trampoline_call(zero_blocks);
5474     } else {
5475       bl(zero_blocks);
5476     }
5477   }
5478   bind(around);
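  // At most zero_words_block_size - 1 words remain.  Handle them by testing
  // the low bits of cnt: bit 2 selects a run of four words, bit 1 a run of
  // two, and bit 0 a trailing single word.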
5479   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5480     Label l;
5481     tbz(cnt, exact_log2(i), l);
5482     for (int j = 0; j < i; j += 2) {
5483       stp(zr, zr, post(ptr, 16));
5484     }
5485     bind(l);
5486   }
5487   {
5488     Label l;
5489     tbz(cnt, 0, l);
5490     str(zr, Address(ptr));
5491     bind(l);
5492   }
5493   BLOCK_COMMENT("} zero_words");
5494 }
5495 
5496 // base:         Address of a buffer to be zeroed, 8 bytes aligned.
5497 // cnt:          Immediate count in HeapWords.
5498 #define SmallArraySize (18 * BytesPerLong)
5499 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
5500 {
5501   BLOCK_COMMENT("zero_words {");
5502   int i = cnt & 1;  // store any odd word to start
5503   if (i) str(zr, Address(base));
5504 
5505   if (cnt <= SmallArraySize / BytesPerLong) {
5506     for (; i < (int)cnt; i += 2)
5507       stp(zr, zr, Address(base, i * wordSize));
5508   } else {
5509     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
5510     int remainder = cnt % (2 * unroll);
5511     for (; i < remainder; i += 2)
5512       stp(zr, zr, Address(base, i * wordSize));
5513 
5514     Label loop;
5515     Register cnt_reg = rscratch1;
5516     Register loop_base = rscratch2;
5517     cnt = cnt - remainder;
5518     mov(cnt_reg, cnt);
5519     // adjust base and prebias by -2 * wordSize so we can pre-increment
5520     add(loop_base, base, (remainder - 2) * wordSize);
5521     bind(loop);
5522     sub(cnt_reg, cnt_reg, 2 * unroll);
5523     for (i = 1; i < unroll; i++)
5524       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
5525     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
5526     cbnz(cnt_reg, loop);
5527   }
5528   BLOCK_COMMENT("} zero_words");
5529 }
5530 
5531 // Zero blocks of memory by using DC ZVA.
5532 //
// Aligns the base address first sufficiently for DC ZVA, then uses
5534 // DC ZVA repeatedly for every full block.  cnt is the size to be
5535 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5536 // in cnt.
5537 //
5538 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5539 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
5540 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5541   Register tmp = rscratch1;
5542   Register tmp2 = rscratch2;
5543   int zva_length = VM_Version::zva_length();
5544   Label initial_table_end, loop_zva;
5545   Label fini;
5546 
  // Base must be 16-byte aligned. If not, just return and let the caller handle it.
5548   tst(base, 0x0f);
5549   br(Assembler::NE, fini);
5550   // Align base with ZVA length.
5551   neg(tmp, base);
5552   andr(tmp, tmp, zva_length - 1);
5553 
5554   // tmp: the number of bytes to be filled to align the base with ZVA length.
5555   add(base, base, tmp);
5556   sub(cnt, cnt, tmp, Assembler::ASR, 3);
5557   adr(tmp2, initial_table_end);
5558   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5559   br(tmp2);
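  // The computed branch above lands part-way into the stp table below: each
  // stp zeroes 16 bytes and occupies 4 bytes of code, so backing off from
  // initial_table_end by tmp >> 2 bytes executes exactly tmp / 16 stores.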
5560 
5561   for (int i = -zva_length + 16; i < 0; i += 16)
5562     stp(zr, zr, Address(base, i));
5563   bind(initial_table_end);
5564 
5565   sub(cnt, cnt, zva_length >> 3);
5566   bind(loop_zva);
5567   dc(Assembler::ZVA, base);
5568   subs(cnt, cnt, zva_length >> 3);
5569   add(base, base, zva_length);
5570   br(Assembler::GE, loop_zva);
5571   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5572   bind(fini);
5573 }
5574 
5575 // base:   Address of a buffer to be filled, 8 bytes aligned.
5576 // cnt:    Count in 8-byte unit.
5577 // value:  Value to be filled with.
5578 // base will point to the end of the buffer after filling.
5579 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5580 {
5581 //  Algorithm:
5582 //
5583 //    scratch1 = cnt & 7;
5584 //    cnt -= scratch1;
5585 //    p += scratch1;
5586 //    switch (scratch1) {
5587 //      do {
5588 //        cnt -= 8;
5589 //          p[-8] = v;
5590 //        case 7:
5591 //          p[-7] = v;
5592 //        case 6:
5593 //          p[-6] = v;
5594 //          // ...
5595 //        case 1:
5596 //          p[-1] = v;
5597 //        case 0:
5598 //          p += 8;
5599 //      } while (cnt);
5600 //    }
5601 
5602   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5603 
5604   Label fini, skip, entry, loop;
5605   const int unroll = 8; // Number of stp instructions we'll unroll
5606 
5607   cbz(cnt, fini);
5608   tbz(base, 3, skip);
5609   str(value, Address(post(base, 8)));
5610   sub(cnt, cnt, 1);
5611   bind(skip);
5612 
5613   andr(rscratch1, cnt, (unroll-1) * 2);
5614   sub(cnt, cnt, rscratch1);
5615   add(base, base, rscratch1, Assembler::LSL, 3);
5616   adr(rscratch2, entry);
5617   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5618   br(rscratch2);
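  // Computed branch into the unrolled loop below (a Duff's device): each stp
  // stores two words and is 4 bytes of code, so branching to
  // entry - rscratch1 * 2 executes exactly rscratch1 / 2 of the trailing stp
  // instructions on the first pass.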
5619 
5620   bind(loop);
5621   add(base, base, unroll * 16);
5622   for (int i = -unroll; i < 0; i++)
5623     stp(value, value, Address(base, i * 16));
5624   bind(entry);
5625   subs(cnt, cnt, unroll * 2);
5626   br(Assembler::GE, loop);
5627 
5628   tbz(cnt, 0, fini);
5629   str(value, Address(post(base, 8)));
5630   bind(fini);
5631 }
5632 
5633 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5634 // java/lang/StringUTF16.compress.
5635 void MacroAssembler::encode_iso_array(Register src, Register dst,
5636                       Register len, Register result,
5637                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5638                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5639 {
5640     Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
5641         NEXT_32_START, NEXT_32_PRFM_START;
5642     Register tmp1 = rscratch1, tmp2 = rscratch2;
5643 
5644       mov(result, len); // Save initial len
5645 
5646 #ifndef BUILTIN_SIM
5647       cmp(len, (u1)8); // handle shortest strings first
5648       br(LT, LOOP_1);
5649       cmp(len, (u1)32);
5650       br(LT, NEXT_8);
5651       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
5652       // to convert chars to bytes
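      // uzp1 keeps the even-numbered bytes of each char (the low, Latin-1
      // halves), while uzp2 gathers the odd-numbered (high) halves; the high
      // halves are or-ed together and must all be zero for the 32-char block
      // to be encodable as ISO-8859-1.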
5653       if (SoftwarePrefetchHintDistance >= 0) {
5654         ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5655         subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5656         br(LE, NEXT_32_START);
5657         b(NEXT_32_PRFM_START);
5658         BIND(NEXT_32_PRFM);
5659           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5660         BIND(NEXT_32_PRFM_START);
5661           prfm(Address(src, SoftwarePrefetchHintDistance));
5662           orr(v4, T16B, Vtmp1, Vtmp2);
5663           orr(v5, T16B, Vtmp3, Vtmp4);
5664           uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
5665           uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
5666           uzp2(v5, T16B, v4, v5); // high bytes
5667           umov(tmp2, v5, D, 1);
5668           fmovd(tmp1, v5);
5669           orr(tmp1, tmp1, tmp2);
5670           cbnz(tmp1, LOOP_8);
5671           stpq(Vtmp1, Vtmp3, dst);
5672           sub(len, len, 32);
5673           add(dst, dst, 32);
5674           add(src, src, 64);
5675           subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5676           br(GE, NEXT_32_PRFM);
5677           cmp(len, (u1)32);
5678           br(LT, LOOP_8);
5679         BIND(NEXT_32);
5680           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5681         BIND(NEXT_32_START);
5682       } else {
5683         BIND(NEXT_32);
5684           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5685       }
5686       prfm(Address(src, SoftwarePrefetchHintDistance));
5687       uzp1(v4, T16B, Vtmp1, Vtmp2);
5688       uzp1(v5, T16B, Vtmp3, Vtmp4);
5689       orr(Vtmp1, T16B, Vtmp1, Vtmp2);
5690       orr(Vtmp3, T16B, Vtmp3, Vtmp4);
5691       uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
5692       umov(tmp2, Vtmp1, D, 1);
5693       fmovd(tmp1, Vtmp1);
5694       orr(tmp1, tmp1, tmp2);
5695       cbnz(tmp1, LOOP_8);
5696       stpq(v4, v5, dst);
5697       sub(len, len, 32);
5698       add(dst, dst, 32);
5699       add(src, src, 64);
5700       cmp(len, (u1)32);
5701       br(GE, NEXT_32);
5702       cbz(len, DONE);
5703 
5704     BIND(LOOP_8);
5705       cmp(len, (u1)8);
5706       br(LT, LOOP_1);
5707     BIND(NEXT_8);
5708       ld1(Vtmp1, T8H, src);
5709       uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
5710       uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
5711       fmovd(tmp1, Vtmp3);
5712       cbnz(tmp1, NEXT_1);
5713       strd(Vtmp2, dst);
5714 
5715       sub(len, len, 8);
5716       add(dst, dst, 8);
5717       add(src, src, 16);
5718       cmp(len, (u1)8);
5719       br(GE, NEXT_8);
5720 
5721     BIND(LOOP_1);
5722 #endif
5723     cbz(len, DONE);
5724     BIND(NEXT_1);
5725       ldrh(tmp1, Address(post(src, 2)));
5726       tst(tmp1, 0xff00);
5727       br(NE, SET_RESULT);
5728       strb(tmp1, Address(post(dst, 1)));
5729       subs(len, len, 1);
5730       br(GT, NEXT_1);
5731 
5732     BIND(SET_RESULT);
5733       sub(result, result, len); // Return index where we stopped
5734                                 // Return len == 0 if we processed all
5735                                 // characters
5736     BIND(DONE);
5737 }
5738 
5739 
5740 // Inflate byte[] array to char[].
5741 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5742                                         FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
5743                                         Register tmp4) {
5744   Label big, done, after_init, to_stub;
5745 
5746   assert_different_registers(src, dst, len, tmp4, rscratch1);
5747 
5748   fmovd(vtmp1, zr);
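  // vtmp1 (zeroed above) supplies the zero bytes that zip1 interleaves with
  // the source bytes, zero-extending each Latin-1 byte into a 16-bit char.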
5749   lsrw(tmp4, len, 3);
5750   bind(after_init);
5751   cbnzw(tmp4, big);
5752   // Short string: less than 8 bytes.
5753   {
5754     Label loop, tiny;
5755 
5756     cmpw(len, 4);
5757     br(LT, tiny);
5758     // Use SIMD to do 4 bytes.
5759     ldrs(vtmp2, post(src, 4));
5760     zip1(vtmp3, T8B, vtmp2, vtmp1);
5761     subw(len, len, 4);
5762     strd(vtmp3, post(dst, 8));
5763 
5764     cbzw(len, done);
5765 
5766     // Do the remaining bytes by steam.
5767     bind(loop);
5768     ldrb(tmp4, post(src, 1));
5769     strh(tmp4, post(dst, 2));
5770     subw(len, len, 1);
5771 
5772     bind(tiny);
5773     cbnz(len, loop);
5774 
5775     b(done);
5776   }
5777 
5778   if (SoftwarePrefetchHintDistance >= 0) {
5779     bind(to_stub);
5780       RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5781       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5782       trampoline_call(stub);
5783       b(after_init);
5784   }
5785 
5786   // Unpack the bytes 8 at a time.
5787   bind(big);
5788   {
5789     Label loop, around, loop_last, loop_start;
5790 
5791     if (SoftwarePrefetchHintDistance >= 0) {
5792       const int large_loop_threshold = (64 + 16)/8;
5793       ldrd(vtmp2, post(src, 8));
5794       andw(len, len, 7);
5795       cmp(tmp4, (u1)large_loop_threshold);
5796       br(GE, to_stub);
5797       b(loop_start);
5798 
5799       bind(loop);
5800       ldrd(vtmp2, post(src, 8));
5801       bind(loop_start);
5802       subs(tmp4, tmp4, 1);
5803       br(EQ, loop_last);
5804       zip1(vtmp2, T16B, vtmp2, vtmp1);
5805       ldrd(vtmp3, post(src, 8));
5806       st1(vtmp2, T8H, post(dst, 16));
5807       subs(tmp4, tmp4, 1);
5808       zip1(vtmp3, T16B, vtmp3, vtmp1);
5809       st1(vtmp3, T8H, post(dst, 16));
5810       br(NE, loop);
5811       b(around);
5812       bind(loop_last);
5813       zip1(vtmp2, T16B, vtmp2, vtmp1);
5814       st1(vtmp2, T8H, post(dst, 16));
5815       bind(around);
5816       cbz(len, done);
5817     } else {
5818       andw(len, len, 7);
5819       bind(loop);
5820       ldrd(vtmp2, post(src, 8));
5821       sub(tmp4, tmp4, 1);
5822       zip1(vtmp3, T16B, vtmp2, vtmp1);
5823       st1(vtmp3, T8H, post(dst, 16));
5824       cbnz(tmp4, loop);
5825     }
5826   }
5827 
5828   // Do the tail of up to 8 bytes.
5829   add(src, src, len);
5830   ldrd(vtmp3, Address(src, -8));
5831   add(dst, dst, len, ext::uxtw, 1);
5832   zip1(vtmp3, T16B, vtmp3, vtmp1);
5833   strq(vtmp3, Address(dst, -16));
5834 
5835   bind(done);
5836 }
5837 
5838 // Compress char[] array to byte[].
5839 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5840                                          FloatRegister tmp1Reg, FloatRegister tmp2Reg,
5841                                          FloatRegister tmp3Reg, FloatRegister tmp4Reg,
5842                                          Register result) {
5843   encode_iso_array(src, dst, len, result,
5844                    tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
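  // encode_iso_array leaves the number of unprocessed chars in len; a non-zero
  // value means a char with a high byte was found, and compress must then
  // signal failure by returning 0.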
5845   cmp(len, zr);
5846   csel(result, result, zr, EQ);
5847 }
5848 
5849 // get_thread() can be called anywhere inside generated code so we
5850 // need to save whatever non-callee save context might get clobbered
5851 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5852 // the call setup code.
5853 //
5854 // aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5855 //
5856 void MacroAssembler::get_thread(Register dst) {
5857   RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
5858   push(saved_regs, sp);
5859 
5860   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5861   blrt(lr, 1, 0, 1);
5862   if (dst != c_rarg0) {
5863     mov(dst, c_rarg0);
5864   }
5865 
5866   pop(saved_regs, sp);
5867 }