/*
 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target - branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                    21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                   Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

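// Patch the movz/movk sequence at insn_addr so that it materializes
// the oop o, and return the number of bytes patched.  Narrow oops
// take two instructions, wide (48-bit) oops three.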
int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

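// Decode the target address encoded by the instruction (or instruction
// sequence) at insn_addr.  This is the inverse of
// pd_patch_instruction_size above.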
address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
               Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

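// Full memory barrier; the thread and tmp arguments are unused on
// AArch64.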
void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  dsb(Assembler::SY);
}

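// Poll for a safepoint.  With thread-local polling this tests the
// poll bit in the thread's polling word; otherwise it loads and tests
// the global SafepointSynchronize state.  Branches to slow_path when a
// safepoint is pending.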
void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling_page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp and sp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {
  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  assert(last_java_pc != NULL, "must provide a valid PC");

  adr(scratch, last_java_pc);
  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
  }
}

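// Emit a call to a destination that may be beyond the range of a
// direct branch.  When far_branches() is true the address is
// materialized into tmp with adrp/add and called indirectly;
// otherwise a plain bl suffices.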
void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4GB.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4GB.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

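// If the stack pointer has dropped below the reserved-zone watermark,
// call into the runtime to re-enable the reserved zone and then throw
// a delayed StackOverflowError; otherwise fall through.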
void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

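// Try to bias the lock in obj_reg toward the current thread.
// swap_reg and tmp_reg are scratch; on success control transfers to
// done, otherwise it either branches to slow_case or falls through so
// that the caller can emit the CAS-based locking scheme.  Returns the
// code offset of the instruction that may null-check obj_reg, or -1
// if the caller already loaded the mark word into swap_reg.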
int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}

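// Move an argument into the corresponding C calling-convention
// register, unless it is already there.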
static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

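// Common code for the call_VM family: set up the last Java frame,
// call entry_point with rthread as the first C argument, then check
// for pending exceptions and fetch any oop result.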
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result, "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small,
// trampolines won't be emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    bool in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}

// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  address stub = start_a_stub(Compile::MAX_stubs_size/2);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call target from the data word emitted below
  // - branch to it
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

void MacroAssembler::emit_static_call_stub() {
  // CompiledDirectStaticCall::set_to_interpreted knows the
  // exact layout of this stub.

  isb();
  mov_metadata(rmethod, (Metadata*)NULL);

  // Jump to the entry point of the i2c stub.
  movptr(rscratch1, 0);
  br(rscratch1);
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: we must only look at the least-significant byte of x,
  //       since C-style booleans are stored in a single byte (this
  //       was once a bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

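// Pad with nops until the current code offset is a multiple of modulus.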
void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// Virtual method calling: load the target method from the receiver
// klass's vtable at the given index.
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

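// Branch to L_success if sub_klass is a subtype of super_klass;
// otherwise fall through.  Combines the fast-path and slow-path
// checks that follow.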
void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


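// Fast subtype check via the supertype display.  Each of L_success,
// L_failure and L_slow_path may be NULL, meaning fall through; at most
// one of them may be NULL at a time.  A secondary-super lookup is
// deferred to L_slow_path (see check_klass_subtype_slow_path below).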
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr); // load displayed supertype
  cmp(super_klass, rscratch1);

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    cmp(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// Scans count pointer-sized words at [addr] for an occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// Scans count 4-byte words at [addr] for an occurrence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

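// Slow subtype check: linearly scan sub_klass's secondary supers
// array for super_klass and, on a hit, cache it in the secondary
// super cache before branching to L_success.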
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


1293 void MacroAssembler::verify_oop(Register reg, const char* s) {
1294   if (!VerifyOops) return;
1295 
1296   // Pass the oop to be verified and an identifying message to verify_oop_subroutine
1297   const char* b = NULL;
1298   {
1299     ResourceMark rm;
1300     stringStream ss;
1301     ss.print("verify_oop: %s: %s", reg->name(), s);
1302     b = code_string(ss.as_string());
1303   }
1304   BLOCK_COMMENT("verify_oop {");
1305 
1306   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1307   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1308 
1309   mov(r0, reg);
1310   movptr(rscratch1, (uintptr_t)(address)b);
1311 
1312   // call indirectly to solve generation ordering problem
1313   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1314   ldr(rscratch2, Address(rscratch2));
1315   blr(rscratch2);
1316 
1317   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1318   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1319 
1320   BLOCK_COMMENT("} verify_oop");
1321 }
1322 
1323 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1324   if (!VerifyOops) return;
1325 
1326   const char* b = NULL;
1327   {
1328     ResourceMark rm;
1329     stringStream ss;
1330     ss.print("verify_oop_addr: %s", s);
1331     b = code_string(ss.as_string());
1332   }
1333   BLOCK_COMMENT("verify_oop_addr {");
1334 
1335   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1336   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1337 
1338   // addr may contain sp so we will have to adjust it based on the
1339   // pushes that we just did.
1340   if (addr.uses(sp)) {
1341     lea(r0, addr);
1342     ldr(r0, Address(r0, 4 * wordSize));
1343   } else {
1344     ldr(r0, addr);
1345   }
1346   movptr(rscratch1, (uintptr_t)(address)b);
1347 
1348   // call indirectly to solve generation ordering problem
1349   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1350   ldr(rscratch2, Address(rscratch2));
1351   blr(rscratch2);
1352 
1353   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1354   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1355 
1356   BLOCK_COMMENT("} verify_oop_addr");
1357 }
1358 
1359 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1360                                          int extra_slot_offset) {
1361   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1362   int stackElementSize = Interpreter::stackElementSize;
1363   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1364 #ifdef ASSERT
1365   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1366   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1367 #endif
1368   if (arg_slot.is_constant()) {
1369     return Address(esp, arg_slot.as_constant() * stackElementSize
1370                    + offset);
1371   } else {
1372     add(rscratch1, esp, arg_slot.as_register(),
1373         ext::uxtx, exact_log2(stackElementSize));
1374     return Address(rscratch1, offset);
1375   }
1376 }
1377 
1378 void MacroAssembler::call_VM_leaf_base(address entry_point,
1379                                        int number_of_arguments,
1380                                        Label *retaddr) {
1381   Label E, L;
1382 
1383   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1384 
1385   mov(rscratch1, entry_point);
1386   blr(rscratch1);
1387   if (retaddr)
1388     bind(*retaddr);
1389 
1390   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1391   maybe_isb();
1392 }
1393 
1394 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1395   call_VM_leaf_base(entry_point, number_of_arguments);
1396 }
1397 
1398 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1399   pass_arg0(this, arg_0);
1400   call_VM_leaf_base(entry_point, 1);
1401 }
1402 
1403 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1404   pass_arg0(this, arg_0);
1405   pass_arg1(this, arg_1);
1406   call_VM_leaf_base(entry_point, 2);
1407 }
1408 
1409 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1410                                   Register arg_1, Register arg_2) {
1411   pass_arg0(this, arg_0);
1412   pass_arg1(this, arg_1);
1413   pass_arg2(this, arg_2);
1414   call_VM_leaf_base(entry_point, 3);
1415 }
1416 
1417 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1418   pass_arg0(this, arg_0);
1419   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1420 }
1421 
1422 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1423 
1424   assert(arg_0 != c_rarg1, "smashed arg");
1425   pass_arg1(this, arg_1);
1426   pass_arg0(this, arg_0);
1427   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1428 }
1429 
1430 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1431   assert(arg_0 != c_rarg2, "smashed arg");
1432   assert(arg_1 != c_rarg2, "smashed arg");
1433   pass_arg2(this, arg_2);
1434   assert(arg_0 != c_rarg1, "smashed arg");
1435   pass_arg1(this, arg_1);
1436   pass_arg0(this, arg_0);
1437   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1438 }
1439 
1440 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1441   assert(arg_0 != c_rarg3, "smashed arg");
1442   assert(arg_1 != c_rarg3, "smashed arg");
1443   assert(arg_2 != c_rarg3, "smashed arg");
1444   pass_arg3(this, arg_3);
1445   assert(arg_0 != c_rarg2, "smashed arg");
1446   assert(arg_1 != c_rarg2, "smashed arg");
1447   pass_arg2(this, arg_2);
1448   assert(arg_0 != c_rarg1, "smashed arg");
1449   pass_arg1(this, arg_1);
1450   pass_arg0(this, arg_0);
1451   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1452 }
1453 
1454 void MacroAssembler::null_check(Register reg, int offset) {
1455   if (needs_explicit_null_check(offset)) {
1456     // provoke OS NULL exception if reg = NULL by
1457     // accessing M[reg] w/o changing any registers
1458     // NOTE: this is plenty to provoke a segv
1459     ldr(zr, Address(reg));
1460   } else {
1461     // nothing to do, (later) access of M[reg + offset]
1462     // will provoke OS NULL exception if reg = NULL
1463   }
1464 }
1465 
1466 // MacroAssembler protected routines needed to implement
1467 // public methods
1468 
1469 void MacroAssembler::mov(Register r, Address dest) {
1470   code_section()->relocate(pc(), dest.rspec());
1471   u_int64_t imm64 = (u_int64_t)dest.target();
1472   movptr(r, imm64);
1473 }
1474 
1475 // Move a constant pointer into r.  In AArch64 mode the virtual
1476 // address space is 48 bits in size, so we only need three
1477 // instructions to create a patchable instruction sequence that can
1478 // reach anywhere.
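// For example (a sketch, not verified against assembler output),
// movptr(r0, 0x1234567890ab) would emit:
//   movz r0, #0x90ab
//   movk r0, #0x5678, lsl #16
//   movk r0, #0x1234, lsl #32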
1479 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1480 #ifndef PRODUCT
1481   {
1482     char buffer[64];
1483     snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64);
1484     block_comment(buffer);
1485   }
1486 #endif
1487   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1488   movz(r, imm64 & 0xffff);
1489   imm64 >>= 16;
1490   movk(r, imm64 & 0xffff, 16);
1491   imm64 >>= 16;
1492   movk(r, imm64 & 0xffff, 32);
1493 }
1494 
1495 // Macro to mov replicated immediate to vector register.
1496 //  Vd will get the following values for different arrangements in T
1497 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1498 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1499 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1500 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1501 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1502 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1503 //   T1D/T2D: invalid
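//  Example (a sketch): mov(Vd, T4S, 0x00ff0000) has a single non-zero
//  byte, so one "movi Vd.4s, #0xff, lsl #16" is emitted; the inverted
//  value 0xff00ffff would need three steps, so the MOVI form wins here.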
1504 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1505   assert(T != T1D && T != T2D, "invalid arrangement");
1506   if (T == T8B || T == T16B) {
1507     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1508     movi(Vd, T, imm32 & 0xff, 0);
1509     return;
1510   }
1511   u_int32_t nimm32 = ~imm32;
1512   if (T == T4H || T == T8H) {
1513     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1514     imm32 &= 0xffff;
1515     nimm32 &= 0xffff;
1516   }
1517   u_int32_t x = imm32;
1518   int movi_cnt = 0;
1519   int movn_cnt = 0;
1520   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1521   x = nimm32;
1522   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1523   if (movn_cnt < movi_cnt) imm32 = nimm32;
1524   unsigned lsl = 0;
1525   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1526   if (movn_cnt < movi_cnt)
1527     mvni(Vd, T, imm32 & 0xff, lsl);
1528   else
1529     movi(Vd, T, imm32 & 0xff, lsl);
1530   imm32 >>= 8; lsl += 8;
1531   while (imm32) {
1532     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1533     if (movn_cnt < movi_cnt)
1534       bici(Vd, T, imm32 & 0xff, lsl);
1535     else
1536       orri(Vd, T, imm32 & 0xff, lsl);
1537     lsl += 8; imm32 >>= 8;
1538   }
1539 }
1540 
1541 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1542 {
1543 #ifndef PRODUCT
1544   {
1545     char buffer[64];
1546     snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64);
1547     block_comment(buffer);
1548   }
1549 #endif
1550   if (operand_valid_for_logical_immediate(false, imm64)) {
1551     orr(dst, zr, imm64);
1552   } else {
1553     // we can use a combination of MOVZ or MOVN with
1554     // MOVK to build up the constant
1555     u_int64_t imm_h[4];
1556     int zero_count = 0;
1557     int neg_count = 0;
1558     int i;
1559     for (i = 0; i < 4; i++) {
1560       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1561       if (imm_h[i] == 0) {
1562         zero_count++;
1563       } else if (imm_h[i] == 0xffffL) {
1564         neg_count++;
1565       }
1566     }
1567     if (zero_count == 4) {
1568       // one MOVZ will do
1569       movz(dst, 0);
1570     } else if (neg_count == 4) {
1571       // one MOVN will do
1572       movn(dst, 0);
1573     } else if (zero_count == 3) {
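      // one MOVZ will do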
1574       for (i = 0; i < 4; i++) {
1575         if (imm_h[i] != 0L) {
1576           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1577           break;
1578         }
1579       }
1580     } else if (neg_count == 3) {
1581       // one MOVN will do
1582       for (int i = 0; i < 4; i++) {
1583         if (imm_h[i] != 0xffffL) {
1584           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1585           break;
1586         }
1587       }
1588     } else if (zero_count == 2) {
1589       // one MOVZ and one MOVK will do
1590       for (i = 0; i < 3; i++) {
1591         if (imm_h[i] != 0L) {
1592           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1593           i++;
1594           break;
1595         }
1596       }
1597       for (;i < 4; i++) {
1598         if (imm_h[i] != 0L) {
1599           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1600         }
1601       }
1602     } else if (neg_count == 2) {
1603       // one MOVN and one MOVK will do
1604       for (i = 0; i < 4; i++) {
1605         if (imm_h[i] != 0xffffL) {
1606           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1607           i++;
1608           break;
1609         }
1610       }
1611       for (;i < 4; i++) {
1612         if (imm_h[i] != 0xffffL) {
1613           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1614         }
1615       }
1616     } else if (zero_count == 1) {
1617       // one MOVZ and two MOVKs will do
1618       for (i = 0; i < 4; i++) {
1619         if (imm_h[i] != 0L) {
1620           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1621           i++;
1622           break;
1623         }
1624       }
1625       for (;i < 4; i++) {
1626         if (imm_h[i] != 0x0L) {
1627           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1628         }
1629       }
1630     } else if (neg_count == 1) {
1631       // one MOVN and two MOVKs will do
1632       for (i = 0; i < 4; i++) {
1633         if (imm_h[i] != 0xffffL) {
1634           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1635           i++;
1636           break;
1637         }
1638       }
1639       for (;i < 4; i++) {
1640         if (imm_h[i] != 0xffffL) {
1641           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1642         }
1643       }
1644     } else {
1645       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1646       movz(dst, (u_int32_t)imm_h[0], 0);
1647       for (i = 1; i < 4; i++) {
1648         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1649       }
1650     }
1651   }
1652 }
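
// Worked examples of the half-word counting above (sketches):
//   0x0000dead00000000 has three zero half-words, so a single
//   "movz dst, #0xdead, lsl #32" suffices;
//   0xffffffffffff1234 has three 0xffff half-words, so a single
//   "movn dst, #0xedcb" (the bitwise NOT of 0x1234) suffices.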
1653 
1654 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1655 {
1656 #ifndef PRODUCT
1657     {
1658       char buffer[64];
1659       snprintf(buffer, sizeof(buffer), "0x%"PRIX32, imm32);
1660       block_comment(buffer);
1661     }
1662 #endif
1663   if (operand_valid_for_logical_immediate(true, imm32)) {
1664     orrw(dst, zr, imm32);
1665   } else {
1666     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1667     // constant
1668     u_int32_t imm_h[2];
1669     imm_h[0] = imm32 & 0xffff;
1670     imm_h[1] = ((imm32 >> 16) & 0xffff);
1671     if (imm_h[0] == 0) {
1672       movzw(dst, imm_h[1], 16);
1673     } else if (imm_h[0] == 0xffff) {
1674       movnw(dst, imm_h[1] ^ 0xffff, 16);
1675     } else if (imm_h[1] == 0) {
1676       movzw(dst, imm_h[0], 0);
1677     } else if (imm_h[1] == 0xffff) {
1678       movnw(dst, imm_h[0] ^ 0xffff, 0);
1679     } else {
1680       // use a MOVZ and MOVK (makes it easier to debug)
1681       movzw(dst, imm_h[0], 0);
1682       movkw(dst, imm_h[1], 16);
1683     }
1684   }
1685 }
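
// For instance (a sketch), mov_immediate32(dst, 0xffff1234) takes the
// imm_h[1] == 0xffff branch and emits a single "movnw dst, #0xedcb".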
1686 
1687 // Form an address from base + offset in Rd.  Rd may or may
1688 // not actually be used: you must use the Address that is returned.
1689 // It is up to you to ensure that the shift provided matches the size
1690 // of your data.
1691 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1692   if (Address::offset_ok_for_immed(byte_offset, shift))
1693     // It fits; no need for any heroics
1694     return Address(base, byte_offset);
1695 
1696   // Don't do anything clever with negative or misaligned offsets
1697   unsigned mask = (1 << shift) - 1;
1698   if (byte_offset < 0 || byte_offset & mask) {
1699     mov(Rd, byte_offset);
1700     add(Rd, base, Rd);
1701     return Address(Rd);
1702   }
1703 
1704   // See if we can do this with two 12-bit offsets
1705   {
1706     unsigned long word_offset = byte_offset >> shift;
1707     unsigned long masked_offset = word_offset & 0xfff000;
1708     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1709         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1710       add(Rd, base, masked_offset << shift);
1711       word_offset -= masked_offset;
1712       return Address(Rd, word_offset << shift);
1713     }
1714   }
1715 
1716   // Do it the hard way
1717   mov(Rd, byte_offset);
1718   add(Rd, base, Rd);
1719   return Address(Rd);
1720 }
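
// Example (a sketch, shift == 3): form_address(Rd, base, 0x123008, 3)
// splits the offset as "add Rd, base, #0x120000" followed by
// Address(Rd, 0x3008), where 0x3008 fits the scaled 12-bit immediate.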
1721 
1722 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1723   if (UseLSE) {
1724     mov(tmp, 1);
1725     ldadd(Assembler::word, tmp, zr, counter_addr);
1726     return;
1727   }
1728   Label retry_load;
1729   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1730     prfm(Address(counter_addr), PSTL1STRM);
1731   bind(retry_load);
1732   // flush and load exclusive from the memory location
1733   ldxrw(tmp, counter_addr);
1734   addw(tmp, tmp, 1);
1735   // if we store+flush with no intervening write tmp2 will be zero
1736   stxrw(tmp2, tmp, counter_addr);
1737   cbnzw(tmp2, retry_load);
1738 }
1739 
1740 
1741 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1742                                     bool want_remainder, Register scratch)
1743 {
1744   // Full implementation of Java idiv and irem.  The function
1745   // returns the (pc) offset of the div instruction - may be needed
1746   // for implicit exceptions.
1747   //
1748   // constraint : ra/rb =/= scratch
1749   //         normal case
1750   //
1751   // input : ra: dividend
1752   //         rb: divisor
1753   //
1754   // result: either
1755   //         quotient  (= ra idiv rb)
1756   //         remainder (= ra irem rb)
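  //
  // A sketch of the remainder path: msubw computes
  //   result = ra - (ra / rb) * rb
  // which matches Java irem semantics for the signed divide.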
1757 
1758   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1759 
1760   int idivl_offset = offset();
1761   if (! want_remainder) {
1762     sdivw(result, ra, rb);
1763   } else {
1764     sdivw(scratch, ra, rb);
1765     Assembler::msubw(result, scratch, rb, ra);
1766   }
1767 
1768   return idivl_offset;
1769 }
1770 
1771 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1772                                     bool want_remainder, Register scratch)
1773 {
1774   // Full implementation of Java ldiv and lrem.  The function
1775   // returns the (pc) offset of the div instruction - may be needed
1776   // for implicit exceptions.
1777   //
1778   // constraint : ra/rb =/= scratch
1779   //         normal case
1780   //
1781   // input : ra: dividend
1782   //         rb: divisor
1783   //
1784   // result: either
1785   //         quotient  (= ra idiv rb)
1786   //         remainder (= ra irem rb)
1787 
1788   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1789 
1790   int idivq_offset = offset();
1791   if (! want_remainder) {
1792     sdiv(result, ra, rb);
1793   } else {
1794     sdiv(scratch, ra, rb);
1795     Assembler::msub(result, scratch, rb, ra);
1796   }
1797 
1798   return idivq_offset;
1799 }
1800 
1801 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1802   address prev = pc() - NativeMembar::instruction_size;
1803   address last = code()->last_insn();
1804   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1805     NativeMembar *bar = NativeMembar_at(prev);
1806     // We are merging two memory barrier instructions.  On AArch64 we
1807     // can do this simply by ORing them together.
1808     bar->set_kind(bar->get_kind() | order_constraint);
1809     BLOCK_COMMENT("merged membar");
1810   } else {
1811     code()->set_last_insn(pc());
1812     dmb(Assembler::barrier(order_constraint));
1813   }
1814 }
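
// For example (a sketch), emitting membar(LoadLoad) immediately followed
// by membar(StoreStore) patches the first barrier's kind to the OR of the
// two constraints instead of issuing a second dmb.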
1815 
1816 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1817   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1818     merge_ldst(rt, adr, size_in_bytes, is_store);
1819     code()->clear_last_insn();
1820     return true;
1821   } else {
1822     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8-byte or 4-byte loads/stores are supported.");
1823     const unsigned mask = size_in_bytes - 1;
1824     if (adr.getMode() == Address::base_plus_offset &&
1825         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1826       code()->set_last_insn(pc());
1827     }
1828     return false;
1829   }
1830 }
1831 
1832 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1833   // We always try to merge two adjacent loads into one ldp.
1834   if (!try_merge_ldst(Rx, adr, 8, false)) {
1835     Assembler::ldr(Rx, adr);
1836   }
1837 }
1838 
1839 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1840   // We always try to merge two adjacent loads into one ldp.
1841   if (!try_merge_ldst(Rw, adr, 4, false)) {
1842     Assembler::ldrw(Rw, adr);
1843   }
1844 }
1845 
1846 void MacroAssembler::str(Register Rx, const Address &adr) {
1847   // We always try to merge two adjacent stores into one stp.
1848   if (!try_merge_ldst(Rx, adr, 8, true)) {
1849     Assembler::str(Rx, adr);
1850   }
1851 }
1852 
1853 void MacroAssembler::strw(Register Rw, const Address &adr) {
1854   // We always try to merge two adjacent stores into one stp.
1855   if (!try_merge_ldst(Rw, adr, 4, true)) {
1856     Assembler::strw(Rw, adr);
1857   }
1858 }
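
// For example (a sketch, assuming AvoidUnalignedAccesses is off):
//   ldr(r0, Address(sp, 8));
//   ldr(r1, Address(sp, 16));
// is rewritten by the second call into a single "ldp r0, r1, [sp, #8]".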
1859 
1860 // MacroAssembler routines actually found to be needed
1861 
1862 void MacroAssembler::push(Register src)
1863 {
1864   str(src, Address(pre(esp, -1 * wordSize)));
1865 }
1866 
1867 void MacroAssembler::pop(Register dst)
1868 {
1869   ldr(dst, Address(post(esp, 1 * wordSize)));
1870 }
1871 
1872 // Note: load_unsigned_short used to be called load_unsigned_word.
1873 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1874   int off = offset();
1875   ldrh(dst, src);
1876   return off;
1877 }
1878 
1879 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1880   int off = offset();
1881   ldrb(dst, src);
1882   return off;
1883 }
1884 
1885 int MacroAssembler::load_signed_short(Register dst, Address src) {
1886   int off = offset();
1887   ldrsh(dst, src);
1888   return off;
1889 }
1890 
1891 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1892   int off = offset();
1893   ldrsb(dst, src);
1894   return off;
1895 }
1896 
1897 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1898   int off = offset();
1899   ldrshw(dst, src);
1900   return off;
1901 }
1902 
1903 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1904   int off = offset();
1905   ldrsbw(dst, src);
1906   return off;
1907 }
1908 
1909 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1910   switch (size_in_bytes) {
1911   case  8:  ldr(dst, src); break;
1912   case  4:  ldrw(dst, src); break;
1913   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1914   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1915   default:  ShouldNotReachHere();
1916   }
1917 }
1918 
1919 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1920   switch (size_in_bytes) {
1921   case  8:  str(src, dst); break;
1922   case  4:  strw(src, dst); break;
1923   case  2:  strh(src, dst); break;
1924   case  1:  strb(src, dst); break;
1925   default:  ShouldNotReachHere();
1926   }
1927 }
1928 
1929 void MacroAssembler::decrementw(Register reg, int value)
1930 {
1931   if (value < 0)  { incrementw(reg, -value);      return; }
1932   if (value == 0) {                               return; }
1933   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1934   /* else */ {
1935     guarantee(reg != rscratch2, "invalid dst for register decrement");
1936     movw(rscratch2, (unsigned)value);
1937     subw(reg, reg, rscratch2);
1938   }
1939 }
1940 
1941 void MacroAssembler::decrement(Register reg, int value)
1942 {
1943   if (value < 0)  { increment(reg, -value);      return; }
1944   if (value == 0) {                              return; }
1945   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1946   /* else */ {
1947     assert(reg != rscratch2, "invalid dst for register decrement");
1948     mov(rscratch2, (unsigned long)value);
1949     sub(reg, reg, rscratch2);
1950   }
1951 }
1952 
1953 void MacroAssembler::decrementw(Address dst, int value)
1954 {
1955   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1956   if (dst.getMode() == Address::literal) {
1957     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1958     lea(rscratch2, dst);
1959     dst = Address(rscratch2);
1960   }
1961   ldrw(rscratch1, dst);
1962   decrementw(rscratch1, value);
1963   strw(rscratch1, dst);
1964 }
1965 
1966 void MacroAssembler::decrement(Address dst, int value)
1967 {
1968   assert(!dst.uses(rscratch1), "invalid address for decrement");
1969   if (dst.getMode() == Address::literal) {
1970     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1971     lea(rscratch2, dst);
1972     dst = Address(rscratch2);
1973   }
1974   ldr(rscratch1, dst);
1975   decrement(rscratch1, value);
1976   str(rscratch1, dst);
1977 }
1978 
1979 void MacroAssembler::incrementw(Register reg, int value)
1980 {
1981   if (value < 0)  { decrementw(reg, -value);      return; }
1982   if (value == 0) {                               return; }
1983   if (value < (1 << 12)) { addw(reg, reg, value); return; }
1984   /* else */ {
1985     assert(reg != rscratch2, "invalid dst for register increment");
1986     movw(rscratch2, (unsigned)value);
1987     addw(reg, reg, rscratch2);
1988   }
1989 }
1990 
1991 void MacroAssembler::increment(Register reg, int value)
1992 {
1993   if (value < 0)  { decrement(reg, -value);      return; }
1994   if (value == 0) {                              return; }
1995   if (value < (1 << 12)) { add(reg, reg, value); return; }
1996   /* else */ {
1997     assert(reg != rscratch2, "invalid dst for register increment");
1998     movw(rscratch2, (unsigned)value);
1999     add(reg, reg, rscratch2);
2000   }
2001 }
2002 
2003 void MacroAssembler::incrementw(Address dst, int value)
2004 {
2005   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2006   if (dst.getMode() == Address::literal) {
2007     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2008     lea(rscratch2, dst);
2009     dst = Address(rscratch2);
2010   }
2011   ldrw(rscratch1, dst);
2012   incrementw(rscratch1, value);
2013   strw(rscratch1, dst);
2014 }
2015 
2016 void MacroAssembler::increment(Address dst, int value)
2017 {
2018   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2019   if (dst.getMode() == Address::literal) {
2020     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2021     lea(rscratch2, dst);
2022     dst = Address(rscratch2);
2023   }
2024   ldr(rscratch1, dst);
2025   increment(rscratch1, value);
2026   str(rscratch1, dst);
2027 }
2028 
2029 
2030 void MacroAssembler::pusha() {
2031   push(0x7fffffff, sp);
2032 }
2033 
2034 void MacroAssembler::popa() {
2035   pop(0x7fffffff, sp);
2036 }
2037 
2038 // Push lots of registers in the bit set supplied.  Don't push sp.
2039 // Return the number of words pushed
2040 int MacroAssembler::push(unsigned int bitset, Register stack) {
2041   int words_pushed = 0;
2042 
2043   // Scan bitset to accumulate register pairs
2044   unsigned char regs[32];
2045   int count = 0;
2046   for (int reg = 0; reg <= 30; reg++) {
2047     if (1 & bitset)
2048       regs[count++] = reg;
2049     bitset >>= 1;
2050   }
2051   regs[count++] = zr->encoding_nocheck();
2052   count &= ~1;  // Only push an even number of regs
2053 
2054   if (count) {
2055     stp(as_Register(regs[0]), as_Register(regs[1]),
2056        Address(pre(stack, -count * wordSize)));
2057     words_pushed += 2;
2058   }
2059   for (int i = 2; i < count; i += 2) {
2060     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2061        Address(stack, i * wordSize));
2062     words_pushed += 2;
2063   }
2064 
2065   assert(words_pushed == count, "oops, pushed != count");
2066 
2067   return count;
2068 }
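
// For example (a sketch), push(RegSet::of(r0, r1, r2), sp) stores r0/r1
// with one stp and pads r2 with zr so that an even number of registers
// is pushed and the stack stays 16-byte aligned.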
2069 
2070 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2071   int words_pushed = 0;
2072 
2073   // Scan bitset to accumulate register pairs
2074   unsigned char regs[32];
2075   int count = 0;
2076   for (int reg = 0; reg <= 30; reg++) {
2077     if (1 & bitset)
2078       regs[count++] = reg;
2079     bitset >>= 1;
2080   }
2081   regs[count++] = zr->encoding_nocheck();
2082   count &= ~1;
2083 
2084   for (int i = 2; i < count; i += 2) {
2085     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2086        Address(stack, i * wordSize));
2087     words_pushed += 2;
2088   }
2089   if (count) {
2090     ldp(as_Register(regs[0]), as_Register(regs[1]),
2091        Address(post(stack, count * wordSize)));
2092     words_pushed += 2;
2093   }
2094 
2095   assert(words_pushed == count, "oops, pushed != count");
2096 
2097   return count;
2098 }
2099 #ifdef ASSERT
2100 void MacroAssembler::verify_heapbase(const char* msg) {
2101 #if 0
2102   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2103   assert (Universe::heap() != NULL, "java heap should be initialized");
2104   if (CheckCompressedOops) {
2105     Label ok;
2106     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2107     cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2108     br(Assembler::EQ, ok);
2109     stop(msg);
2110     bind(ok);
2111     pop(1 << rscratch1->encoding(), sp);
2112   }
2113 #endif
2114 }
2115 #endif
2116 
2117 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2118   Label done, not_weak;
2119   cbz(value, done);           // Use NULL as-is.
2120 
2121   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
2122   tbz(value, 0, not_weak);    // Test for jweak tag.
2123 
2124   // Resolve jweak.
2125   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2126                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2127   verify_oop(value);
2128   b(done);
2129 
2130   bind(not_weak);
2131   // Resolve (untagged) jobject.
2132   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2133   verify_oop(value);
2134   bind(done);
2135 }
2136 
2137 void MacroAssembler::stop(const char* msg) {
2138   address ip = pc();
2139   pusha();
2140   movptr(c_rarg0, (uintptr_t)(address)msg);
2141   movptr(c_rarg1, (uintptr_t)(address)ip);
2142   mov(c_rarg2, sp);
2143   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2144   blr(c_rarg3);
2145   hlt(0);
2146 }
2147 
2148 void MacroAssembler::warn(const char* msg) {
2149   pusha();
2150   mov(c_rarg0, (address)msg);
2151   mov(lr, CAST_FROM_FN_PTR(address, warning));
2152   blr(lr);
2153   popa();
2154 }
2155 
2156 void MacroAssembler::unimplemented(const char* what) {
2157   const char* buf = NULL;
2158   {
2159     ResourceMark rm;
2160     stringStream ss;
2161     ss.print("unimplemented: %s", what);
2162     buf = code_string(ss.as_string());
2163   }
2164   stop(buf);
2165 }
2166 
2167 // If a constant does not fit in an immediate field, generate some
2168 // number of MOV instructions and then perform the operation.
2169 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2170                                            add_sub_imm_insn insn1,
2171                                            add_sub_reg_insn insn2) {
2172   assert(Rd != zr, "Rd = zr and not setting flags?");
2173   if (operand_valid_for_add_sub_immediate((int)imm)) {
2174     (this->*insn1)(Rd, Rn, imm);
2175   } else {
2176     if (uabs(imm) < (1 << 24)) {
2177        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2178        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2179     } else {
2180        assert_different_registers(Rd, Rn);
2181        mov(Rd, (uint64_t)imm);
2182        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2183     }
2184   }
2185 }
2186 
2187 // Separate version which sets the flags. Optimisations are more
2188 // restricted because we must set the flags correctly.
2189 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2190                                            add_sub_imm_insn insn1,
2191                                            add_sub_reg_insn insn2) {
2192   if (operand_valid_for_add_sub_immediate((int)imm)) {
2193     (this->*insn1)(Rd, Rn, imm);
2194   } else {
2195     assert_different_registers(Rd, Rn);
2196     assert(Rd != zr, "overflow in immediate operand");
2197     mov(Rd, (uint64_t)imm);
2198     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2199   }
2200 }
2201 
2202 
2203 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2204   if (increment.is_register()) {
2205     add(Rd, Rn, increment.as_register());
2206   } else {
2207     add(Rd, Rn, increment.as_constant());
2208   }
2209 }
2210 
2211 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2212   if (increment.is_register()) {
2213     addw(Rd, Rn, increment.as_register());
2214   } else {
2215     addw(Rd, Rn, increment.as_constant());
2216   }
2217 }
2218 
2219 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2220   if (decrement.is_register()) {
2221     sub(Rd, Rn, decrement.as_register());
2222   } else {
2223     sub(Rd, Rn, decrement.as_constant());
2224   }
2225 }
2226 
2227 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2228   if (decrement.is_register()) {
2229     subw(Rd, Rn, decrement.as_register());
2230   } else {
2231     subw(Rd, Rn, decrement.as_constant());
2232   }
2233 }
2234 
2235 void MacroAssembler::reinit_heapbase()
2236 {
2237   if (UseCompressedOops) {
2238     if (Universe::is_fully_initialized()) {
2239       mov(rheapbase, Universe::narrow_ptrs_base());
2240     } else {
2241       lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2242       ldr(rheapbase, Address(rheapbase));
2243     }
2244   }
2245 }
2246 
2247 // this simulates the behaviour of the x86 cmpxchg instruction using a
2248 // load linked/store conditional pair. we use the acquire/release
2249 // versions of these instructions so that we flush pending writes as
2250 // per Java semantics.
2251 
2252 // n.b. the x86 version assumes the old value to be compared against is
2253 // in rax and updates rax with the value located in memory if the
2254 // cmpxchg fails. we supply a register for the old value explicitly
2255 
2256 // the aarch64 load linked/store conditional instructions do not
2257 // accept an offset. so, unlike x86, we must provide a plain register
2258 // to identify the memory word to be compared/exchanged rather than a
2259 // register+offset Address.
2260 
2261 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2262                                 Label &succeed, Label *fail) {
2263   // oldv holds comparison value
2264   // newv holds value to write in exchange
2265   // addr identifies memory word to compare against/update
2266   if (UseLSE) {
2267     mov(tmp, oldv);
2268     casal(Assembler::xword, oldv, newv, addr);
2269     cmp(tmp, oldv);
2270     br(Assembler::EQ, succeed);
2271     membar(AnyAny);
2272   } else {
2273     Label retry_load, nope;
2274     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2275       prfm(Address(addr), PSTL1STRM);
2276     bind(retry_load);
2277     // flush and load exclusive from the memory location
2278     // and fail if it is not what we expect
2279     ldaxr(tmp, addr);
2280     cmp(tmp, oldv);
2281     br(Assembler::NE, nope);
2282     // if we store+flush with no intervening write tmp will be zero
2283     stlxr(tmp, newv, addr);
2284     cbzw(tmp, succeed);
2285     // retry so we only ever return after a load fails to compare;
2286     // this ensures we don't return a stale value after a failed write.
2287     b(retry_load);
2288     // if the memory word differs we return it in oldv and signal a fail
2289     bind(nope);
2290     membar(AnyAny);
2291     mov(oldv, tmp);
2292   }
2293   if (fail)
2294     b(*fail);
2295 }
2296 
2297 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2298                                         Label &succeed, Label *fail) {
2299   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2300   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2301 }
2302 
2303 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2304                                 Label &succeed, Label *fail) {
2305   // oldv holds comparison value
2306   // newv holds value to write in exchange
2307   // addr identifies memory word to compare against/update
2308   // tmp returns 0/1 for success/failure
2309   if (UseLSE) {
2310     mov(tmp, oldv);
2311     casal(Assembler::word, oldv, newv, addr);
2312     cmp(tmp, oldv);
2313     br(Assembler::EQ, succeed);
2314     membar(AnyAny);
2315   } else {
2316     Label retry_load, nope;
2317     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2318       prfm(Address(addr), PSTL1STRM);
2319     bind(retry_load);
2320     // flush and load exclusive from the memory location
2321     // and fail if it is not what we expect
2322     ldaxrw(tmp, addr);
2323     cmp(tmp, oldv);
2324     br(Assembler::NE, nope);
2325     // if we store+flush with no intervening write tmp will be zero
2326     stlxrw(tmp, newv, addr);
2327     cbzw(tmp, succeed);
2328     // retry so we only ever return after a load fails to compare;
2329     // this ensures we don't return a stale value after a failed write.
2330     b(retry_load);
2331     // if the memory word differs we return it in oldv and signal a fail
2332     bind(nope);
2333     membar(AnyAny);
2334     mov(oldv, tmp);
2335   }
2336   if (fail)
2337     b(*fail);
2338 }
2339 
2340 // A generic CAS; success or failure is in the EQ flag.  A weak CAS
2341 // doesn't retry and may fail spuriously.  If the old value is wanted,
2342 // pass a register for the result; otherwise pass noreg.
2343 
2344 // Clobbers rscratch1
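//
// Typical use (a sketch): after
//   cmpxchg(addr, expected, new_val, Assembler::xword,
//           /*acquire*/ true, /*release*/ true, /*weak*/ false, noreg);
// the caller branches with br(Assembler::NE, retry) on failure.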
2345 void MacroAssembler::cmpxchg(Register addr, Register expected,
2346                              Register new_val,
2347                              enum operand_size size,
2348                              bool acquire, bool release,
2349                              bool weak,
2350                              Register result) {
2351   if (result == noreg)  result = rscratch1;
2352   BLOCK_COMMENT("cmpxchg {");
2353   if (UseLSE) {
2354     mov(result, expected);
2355     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2356     compare_eq(result, expected, size);
2357   } else {
2358     Label retry_load, done;
2359     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2360       prfm(Address(addr), PSTL1STRM);
2361     bind(retry_load);
2362     load_exclusive(result, addr, size, acquire);
2363     compare_eq(result, expected, size);
2364     br(Assembler::NE, done);
2365     store_exclusive(rscratch1, new_val, addr, size, release);
2366     if (weak) {
2367       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2368     } else {
2369       cbnzw(rscratch1, retry_load);
2370     }
2371     bind(done);
2372   }
2373   BLOCK_COMMENT("} cmpxchg");
2374 }
2375 
2376 // A generic comparison.  Only compares for equality; clobbers rscratch1 for sub-word sizes.
2377 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2378   if (size == xword) {
2379     cmp(rm, rn);
2380   } else if (size == word) {
2381     cmpw(rm, rn);
2382   } else if (size == halfword) {
2383     eorw(rscratch1, rm, rn);
2384     ands(zr, rscratch1, 0xffff);
2385   } else if (size == byte) {
2386     eorw(rscratch1, rm, rn);
2387     ands(zr, rscratch1, 0xff);
2388   } else {
2389     ShouldNotReachHere();
2390   }
2391 }
2392 
2393 
2394 static bool different(Register a, RegisterOrConstant b, Register c) {
2395   if (b.is_constant())
2396     return a != c;
2397   else
2398     return a != b.as_register() && a != c && b.as_register() != c;
2399 }
2400 
2401 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2402 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2403   if (UseLSE) {                                                         \
2404     prev = prev->is_valid() ? prev : zr;                                \
2405     if (incr.is_register()) {                                           \
2406       AOP(sz, incr.as_register(), prev, addr);                          \
2407     } else {                                                            \
2408       mov(rscratch2, incr.as_constant());                               \
2409       AOP(sz, rscratch2, prev, addr);                                   \
2410     }                                                                   \
2411     return;                                                             \
2412   }                                                                     \
2413   Register result = rscratch2;                                          \
2414   if (prev->is_valid())                                                 \
2415     result = different(prev, incr, addr) ? prev : rscratch2;            \
2416                                                                         \
2417   Label retry_load;                                                     \
2418   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2419     prfm(Address(addr), PSTL1STRM);                                     \
2420   bind(retry_load);                                                     \
2421   LDXR(result, addr);                                                   \
2422   OP(rscratch1, result, incr);                                          \
2423   STXR(rscratch2, rscratch1, addr);                                     \
2424   cbnzw(rscratch2, retry_load);                                         \
2425   if (prev->is_valid() && prev != result) {                             \
2426     IOP(prev, rscratch1, incr);                                         \
2427   }                                                                     \
2428 }
2429 
2430 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2431 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2432 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2433 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2434 
2435 #undef ATOMIC_OP
2436 
2437 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2438 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2439   if (UseLSE) {                                                         \
2440     prev = prev->is_valid() ? prev : zr;                                \
2441     AOP(sz, newv, prev, addr);                                          \
2442     return;                                                             \
2443   }                                                                     \
2444   Register result = rscratch2;                                          \
2445   if (prev->is_valid())                                                 \
2446     result = different(prev, newv, addr) ? prev : rscratch2;            \
2447                                                                         \
2448   Label retry_load;                                                     \
2449   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2450     prfm(Address(addr), PSTL1STRM);                                     \
2451   bind(retry_load);                                                     \
2452   LDXR(result, addr);                                                   \
2453   STXR(rscratch1, newv, addr);                                          \
2454   cbnzw(rscratch1, retry_load);                                         \
2455   if (prev->is_valid() && prev != result)                               \
2456     mov(prev, result);                                                  \
2457 }
2458 
2459 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2460 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2461 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2462 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2463 
2464 #undef ATOMIC_XCHG
2465 
2466 #ifndef PRODUCT
2467 extern "C" void findpc(intptr_t x);
2468 #endif
2469 
2470 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2471 {
2472   // In order to get locks to work, we need to fake an in_VM state
2473   if (ShowMessageBoxOnError) {
2474     JavaThread* thread = JavaThread::current();
2475     JavaThreadState saved_state = thread->thread_state();
2476     thread->set_thread_state(_thread_in_vm);
2477 #ifndef PRODUCT
2478     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2479       ttyLocker ttyl;
2480       BytecodeCounter::print();
2481     }
2482 #endif
2483     if (os::message_box(msg, "Execution stopped, print registers?")) {
2484       ttyLocker ttyl;
2485       tty->print_cr(" pc = 0x%016lx", pc);
2486 #ifndef PRODUCT
2487       tty->cr();
2488       findpc(pc);
2489       tty->cr();
2490 #endif
2491       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2492       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2493       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2494       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2495       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2496       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2497       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2498       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2499       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2500       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2501       tty->print_cr("r10 = 0x%016lx", regs[10]);
2502       tty->print_cr("r11 = 0x%016lx", regs[11]);
2503       tty->print_cr("r12 = 0x%016lx", regs[12]);
2504       tty->print_cr("r13 = 0x%016lx", regs[13]);
2505       tty->print_cr("r14 = 0x%016lx", regs[14]);
2506       tty->print_cr("r15 = 0x%016lx", regs[15]);
2507       tty->print_cr("r16 = 0x%016lx", regs[16]);
2508       tty->print_cr("r17 = 0x%016lx", regs[17]);
2509       tty->print_cr("r18 = 0x%016lx", regs[18]);
2510       tty->print_cr("r19 = 0x%016lx", regs[19]);
2511       tty->print_cr("r20 = 0x%016lx", regs[20]);
2512       tty->print_cr("r21 = 0x%016lx", regs[21]);
2513       tty->print_cr("r22 = 0x%016lx", regs[22]);
2514       tty->print_cr("r23 = 0x%016lx", regs[23]);
2515       tty->print_cr("r24 = 0x%016lx", regs[24]);
2516       tty->print_cr("r25 = 0x%016lx", regs[25]);
2517       tty->print_cr("r26 = 0x%016lx", regs[26]);
2518       tty->print_cr("r27 = 0x%016lx", regs[27]);
2519       tty->print_cr("r28 = 0x%016lx", regs[28]);
2520       tty->print_cr("r30 = 0x%016lx", regs[30]);
2521       tty->print_cr("r31 = 0x%016lx", regs[31]);
2522       BREAKPOINT;
2523     }
2524     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2525   } else {
2526     ttyLocker ttyl;
2527     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2528                     msg);
2529     assert(false, "DEBUG MESSAGE: %s", msg);
2530   }
2531 }
2532 
2533 void MacroAssembler::push_call_clobbered_registers() {
2534   int step = 4 * wordSize;
2535   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2536   sub(sp, sp, step);
2537   mov(rscratch1, -step);
2538   // Push v0-v7, v16-v31.
2539   for (int i = 31; i>= 4; i -= 4) {
2540     if (i <= v7->encoding() || i >= v16->encoding())
2541       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2542           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2543   }
2544   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2545       as_FloatRegister(3), T1D, Address(sp));
2546 }
2547 
2548 void MacroAssembler::pop_call_clobbered_registers() {
2549   for (int i = 0; i < 32; i += 4) {
2550     if (i <= v7->encoding() || i >= v16->encoding())
2551       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2552           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2553   }
2554 
2555   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2556 }
2557 
2558 void MacroAssembler::push_CPU_state(bool save_vectors) {
2559   int step = (save_vectors ? 8 : 4) * wordSize;
2560   push(0x3fffffff, sp);         // integer registers except lr & sp
2561   mov(rscratch1, -step);
2562   sub(sp, sp, step);
2563   for (int i = 28; i >= 4; i -= 4) {
2564     st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2565         as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2566   }
2567   st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2568 }
2569 
2570 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2571   int step = (restore_vectors ? 8 : 4) * wordSize;
2572   for (int i = 0; i <= 28; i += 4)
2573     ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2574         as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2575   pop(0x3fffffff, sp);         // integer registers except lr & sp
2576 }
2577 
2578 /**
2579  * Helpers for multiply_to_len().
2580  */
2581 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2582                                      Register src1, Register src2) {
2583   adds(dest_lo, dest_lo, src1);
2584   adc(dest_hi, dest_hi, zr);
2585   adds(dest_lo, dest_lo, src2);
2586   adc(final_dest_hi, dest_hi, zr);
2587 }
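
// Treating each pair as a 128-bit value, this computes (a sketch):
//   final_dest_hi:dest_lo = dest_hi:dest_lo + src1 + src2
// with each adc folding the carry out of the preceding adds.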
2588 
2589 // Generate an address from (r + r1 extend offset).  "size" is the
2590 // size of the operand.  The result may be in rscratch2.
2591 Address MacroAssembler::offsetted_address(Register r, Register r1,
2592                                           Address::extend ext, int offset, int size) {
2593   if (offset || (ext.shift() % size != 0)) {
2594     lea(rscratch2, Address(r, r1, ext));
2595     return Address(rscratch2, offset);
2596   } else {
2597     return Address(r, r1, ext);
2598   }
2599 }
2600 
2601 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2602 {
2603   assert(offset >= 0, "spill to negative address?");
2604   // Offset reachable ?
2605   //   Not aligned - 9 bits signed offset
2606   //   Aligned - 12 bits unsigned offset shifted
2607   Register base = sp;
2608   if ((offset & (size-1)) && offset >= (1<<8)) {
2609     add(tmp, base, offset & ((1<<12)-1));
2610     base = tmp;
2611     offset &= -1u<<12;
2612   }
2613 
2614   if (offset >= (1<<12) * size) {
2615     add(tmp, base, offset & (((1<<12)-1)<<12));
2616     base = tmp;
2617     offset &= ~(((1<<12)-1)<<12);
2618   }
2619 
2620   return Address(base, offset);
2621 }
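
// Example (a sketch, size == 8): offset 0x8010 exceeds the scaled
// unsigned 12-bit range, so this emits "add tmp, sp, #0x8000" and
// returns Address(tmp, 0x10).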
2622 
2623 // Checks whether offset is aligned.
2624 // Returns true if it is, else false.
2625 bool MacroAssembler::merge_alignment_check(Register base,
2626                                            size_t size,
2627                                            long cur_offset,
2628                                            long prev_offset) const {
2629   if (AvoidUnalignedAccesses) {
2630     if (base == sp) {
2631       // Check whether the lower offset is aligned to the size of a pair.
2632       long pair_mask = size * 2 - 1;
2633       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2634       return (offset & pair_mask) == 0;
2635     } else { // If base is not sp, we can't guarantee the access is aligned.
2636       return false;
2637     }
2638   } else {
2639     long mask = size - 1;
2640     // Load/store pair instructions only support element-size-aligned offsets.
2641     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2642   }
2643 }
2644 
2645 // Checks whether current and previous loads/stores can be merged.
2646 // Returns true if it can be merged, else false.
2647 bool MacroAssembler::ldst_can_merge(Register rt,
2648                                     const Address &adr,
2649                                     size_t cur_size_in_bytes,
2650                                     bool is_store) const {
2651   address prev = pc() - NativeInstruction::instruction_size;
2652   address last = code()->last_insn();
2653 
2654   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2655     return false;
2656   }
2657 
2658   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2659     return false;
2660   }
2661 
2662   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2663   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2664 
2665   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2666   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2667 
2668   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2669     return false;
2670   }
2671 
2672   long max_offset = 63 * prev_size_in_bytes;
2673   long min_offset = -64 * prev_size_in_bytes;
2674 
2675   assert(prev_ldst->is_not_pre_post_index(), "pre-indexed or post-indexed accesses cannot be merged.");
2676 
2677   // Only same base can be merged.
2678   if (adr.base() != prev_ldst->base()) {
2679     return false;
2680   }
2681 
2682   long cur_offset = adr.offset();
2683   long prev_offset = prev_ldst->offset();
2684   size_t diff = abs(cur_offset - prev_offset);
2685   if (diff != prev_size_in_bytes) {
2686     return false;
2687   }
2688 
2689   // The following cases cannot be merged:
2690   // ldr x2, [x2, #8]
2691   // ldr x3, [x2, #16]
2692   // or:
2693   // ldr x2, [x3, #8]
2694   // ldr x2, [x3, #16]
2695   // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2696   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2697     return false;
2698   }
2699 
2700   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2701   // Offset range must be in ldp/stp instruction's range.
2702   if (low_offset > max_offset || low_offset < min_offset) {
2703     return false;
2704   }
2705 
2706   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2707     return true;
2708   }
2709 
2710   return false;
2711 }
2712 
2713 // Merge current load/store with previous load/store into ldp/stp.
2714 void MacroAssembler::merge_ldst(Register rt,
2715                                 const Address &adr,
2716                                 size_t cur_size_in_bytes,
2717                                 bool is_store) {
2718 
2719   assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "cur and prev must be able to be merged.");
2720 
2721   Register rt_low, rt_high;
2722   address prev = pc() - NativeInstruction::instruction_size;
2723   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2724 
2725   long offset;
2726 
2727   if (adr.offset() < prev_ldst->offset()) {
2728     offset = adr.offset();
2729     rt_low = rt;
2730     rt_high = prev_ldst->target();
2731   } else {
2732     offset = prev_ldst->offset();
2733     rt_low = prev_ldst->target();
2734     rt_high = rt;
2735   }
2736 
2737   Address adr_p = Address(prev_ldst->base(), offset);
2738   // Overwrite previous generated binary.
2739   code_section()->set_end(prev);
2740 
2741   const int sz = prev_ldst->size_in_bytes();
2742   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
2743   if (!is_store) {
2744     BLOCK_COMMENT("merged ldr pair");
2745     if (sz == 8) {
2746       ldp(rt_low, rt_high, adr_p);
2747     } else {
2748       ldpw(rt_low, rt_high, adr_p);
2749     }
2750   } else {
2751     BLOCK_COMMENT("merged str pair");
2752     if (sz == 8) {
2753       stp(rt_low, rt_high, adr_p);
2754     } else {
2755       stpw(rt_low, rt_high, adr_p);
2756     }
2757   }
2758 }
2759 
2760 /**
2761  * Multiply 64 bit by 64 bit first loop.
2762  */
2763 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2764                                            Register y, Register y_idx, Register z,
2765                                            Register carry, Register product,
2766                                            Register idx, Register kdx) {
2767   //
2768   //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2770   //    huge_128 product = y[idx] * x[xstart] + carry;
2771   //    z[kdx] = (jlong)product;
2772   //    carry  = (jlong)(product >>> 64);
2773   //  }
2774   //  z[xstart] = carry;
2775   //
2776 
2777   Label L_first_loop, L_first_loop_exit;
2778   Label L_one_x, L_one_y, L_multiply;
2779 
2780   subsw(xstart, xstart, 1);
2781   br(Assembler::MI, L_one_x);
2782 
2783   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2784   ldr(x_xstart, Address(rscratch1));
2785   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2786 
2787   bind(L_first_loop);
2788   subsw(idx, idx, 1);
2789   br(Assembler::MI, L_first_loop_exit);
2790   subsw(idx, idx, 1);
2791   br(Assembler::MI, L_one_y);
2792   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2793   ldr(y_idx, Address(rscratch1));
2794   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2795   bind(L_multiply);
2796 
2797   // AArch64 has a multiply-accumulate instruction that we can't use
2798   // here because it has no way to process carries, so we have to use
2799   // separate add and adc instructions.  Bah.
2800   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2801   mul(product, x_xstart, y_idx);
2802   adds(product, product, carry);
2803   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2804 
2805   subw(kdx, kdx, 2);
2806   ror(product, product, 32); // back to big-endian
2807   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2808 
2809   b(L_first_loop);
2810 
2811   bind(L_one_y);
2812   ldrw(y_idx, Address(y,  0));
2813   b(L_multiply);
2814 
2815   bind(L_one_x);
2816   ldrw(x_xstart, Address(x,  0));
2817   b(L_first_loop);
2818 
2819   bind(L_first_loop_exit);
2820 }
2821 
2822 /**
 * Multiply 128 bit by 128 bit. Unrolled inner loop.
2824  *
2825  */
2826 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2827                                              Register carry, Register carry2,
2828                                              Register idx, Register jdx,
2829                                              Register yz_idx1, Register yz_idx2,
2830                                              Register tmp, Register tmp3, Register tmp4,
2831                                              Register tmp6, Register product_hi) {
2832 
2833   //   jlong carry, x[], y[], z[];
2834   //   int kdx = ystart+1;
2835   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2836   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2837   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2838   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2839   //     carry  = (jlong)(tmp4 >>> 64);
2840   //     z[kdx+idx+1] = (jlong)tmp3;
2841   //     z[kdx+idx] = (jlong)tmp4;
2842   //   }
2843   //   idx += 2;
2844   //   if (idx > 0) {
2845   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2846   //     z[kdx+idx] = (jlong)yz_idx1;
2847   //     carry  = (jlong)(yz_idx1 >>> 64);
2848   //   }
2849   //
2850 
2851   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2852 
2853   lsrw(jdx, idx, 2);
2854 
2855   bind(L_third_loop);
2856 
2857   subsw(jdx, jdx, 1);
2858   br(Assembler::MI, L_third_loop_exit);
2859   subw(idx, idx, 4);
2860 
2861   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2862 
2863   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2864 
2865   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2866 
2867   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2868   ror(yz_idx2, yz_idx2, 32);
2869 
2870   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2871 
2872   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2873   umulh(tmp4, product_hi, yz_idx1);
2874 
2875   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2876   ror(rscratch2, rscratch2, 32);
2877 
2878   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2879   umulh(carry2, product_hi, yz_idx2);
2880 
2881   // propagate sum of both multiplications into carry:tmp4:tmp3
2882   adds(tmp3, tmp3, carry);
2883   adc(tmp4, tmp4, zr);
2884   adds(tmp3, tmp3, rscratch1);
2885   adcs(tmp4, tmp4, tmp);
2886   adc(carry, carry2, zr);
2887   adds(tmp4, tmp4, rscratch2);
2888   adc(carry, carry, zr);
2889 
2890   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2891   ror(tmp4, tmp4, 32);
2892   stp(tmp4, tmp3, Address(tmp6, 0));
2893 
2894   b(L_third_loop);
2895   bind (L_third_loop_exit);
2896 
2897   andw (idx, idx, 0x3);
2898   cbz(idx, L_post_third_loop_done);
2899 
2900   Label L_check_1;
2901   subsw(idx, idx, 2);
2902   br(Assembler::MI, L_check_1);
2903 
2904   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2905   ldr(yz_idx1, Address(rscratch1, 0));
2906   ror(yz_idx1, yz_idx1, 32);
2907   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2908   umulh(tmp4, product_hi, yz_idx1);
2909   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2910   ldr(yz_idx2, Address(rscratch1, 0));
2911   ror(yz_idx2, yz_idx2, 32);
2912 
2913   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2914 
2915   ror(tmp3, tmp3, 32);
2916   str(tmp3, Address(rscratch1, 0));
2917 
2918   bind (L_check_1);
2919 
2920   andw (idx, idx, 0x1);
2921   subsw(idx, idx, 1);
2922   br(Assembler::MI, L_post_third_loop_done);
2923   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2924   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
2925   umulh(carry2, tmp4, product_hi);
2926   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2927 
2928   add2_with_carry(carry2, tmp3, tmp4, carry);
2929 
2930   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2931   extr(carry, carry2, tmp3, 32);
2932 
2933   bind(L_post_third_loop_done);
2934 }
2935 
2936 /**
 * Code for BigInteger::multiplyToLen() intrinsic.
2938  *
2939  * r0: x
2940  * r1: xlen
2941  * r2: y
2942  * r3: ylen
 * r4: z
2944  * r5: zlen
2945  * r10: tmp1
2946  * r11: tmp2
2947  * r12: tmp3
2948  * r13: tmp4
2949  * r14: tmp5
2950  * r15: tmp6
2951  * r16: tmp7
2952  *
2953  */
2954 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
2955                                      Register z, Register zlen,
2956                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
2957                                      Register tmp5, Register tmp6, Register product_hi) {
2958 
2959   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2960 
2961   const Register idx = tmp1;
2962   const Register kdx = tmp2;
2963   const Register xstart = tmp3;
2964 
2965   const Register y_idx = tmp4;
2966   const Register carry = tmp5;
2967   const Register product  = xlen;
2968   const Register x_xstart = zlen;  // reuse register
2969 
2970   // First Loop.
2971   //
2972   //  final static long LONG_MASK = 0xffffffffL;
2973   //  int xstart = xlen - 1;
2974   //  int ystart = ylen - 1;
2975   //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2977   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
2978   //    z[kdx] = (int)product;
2979   //    carry = product >>> 32;
2980   //  }
2981   //  z[xstart] = (int)carry;
2982   //
2983 
2984   movw(idx, ylen);      // idx = ylen;
2985   movw(kdx, zlen);      // kdx = xlen+ylen;
2986   mov(carry, zr);       // carry = 0;
2987 
2988   Label L_done;
2989 
2990   movw(xstart, xlen);
2991   subsw(xstart, xstart, 1);
2992   br(Assembler::MI, L_done);
2993 
2994   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
2995 
2996   Label L_second_loop;
2997   cbzw(kdx, L_second_loop);
2998 
2999   Label L_carry;
3000   subw(kdx, kdx, 1);
3001   cbzw(kdx, L_carry);
3002 
3003   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3004   lsr(carry, carry, 32);
3005   subw(kdx, kdx, 1);
3006 
3007   bind(L_carry);
3008   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3009 
3010   // Second and third (nested) loops.
3011   //
3012   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3013   //   carry = 0;
3014   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3015   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3016   //                    (z[k] & LONG_MASK) + carry;
3017   //     z[k] = (int)product;
3018   //     carry = product >>> 32;
3019   //   }
3020   //   z[i] = (int)carry;
3021   // }
3022   //
3023   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3024 
3025   const Register jdx = tmp1;
3026 
3027   bind(L_second_loop);
3028   mov(carry, zr);                // carry = 0;
3029   movw(jdx, ylen);               // j = ystart+1
3030 
3031   subsw(xstart, xstart, 1);      // i = xstart-1;
3032   br(Assembler::MI, L_done);
3033 
3034   str(z, Address(pre(sp, -4 * wordSize)));
3035 
3036   Label L_last_x;
3037   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3038   subsw(xstart, xstart, 1);       // i = xstart-1;
3039   br(Assembler::MI, L_last_x);
3040 
3041   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3042   ldr(product_hi, Address(rscratch1));
3043   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3044 
3045   Label L_third_loop_prologue;
3046   bind(L_third_loop_prologue);
3047 
3048   str(ylen, Address(sp, wordSize));
3049   stp(x, xstart, Address(sp, 2 * wordSize));
3050   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3051                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3052   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3053   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3054 
3055   addw(tmp3, xlen, 1);
3056   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3057   subsw(tmp3, tmp3, 1);
3058   br(Assembler::MI, L_done);
3059 
3060   lsr(carry, carry, 32);
3061   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3062   b(L_second_loop);
3063 
  // The following infrequent code is moved outside the loops.
3065   bind(L_last_x);
3066   ldrw(product_hi, Address(x,  0));
3067   b(L_third_loop_prologue);
3068 
3069   bind(L_done);
3070 }
3071 
// Code for BigInteger::mulAdd intrinsic
3073 // out     = r0
3074 // in      = r1
3075 // offset  = r2  (already out.length-offset)
3076 // len     = r3
3077 // k       = r4
3078 //
3079 // pseudo code from java implementation:
3080 // carry = 0;
3081 // offset = out.length-offset - 1;
3082 // for (int j=len-1; j >= 0; j--) {
3083 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3084 //     out[offset--] = (int)product;
3085 //     carry = product >>> 32;
3086 // }
3087 // return (int)carry;
3088 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3089       Register len, Register k) {
3090     Label LOOP, END;
3091     // pre-loop
    cmp(len, zr); // cmp, not cbz/cbnz: we use the condition twice => fewer branches
3093     csel(out, zr, out, Assembler::EQ);
3094     br(Assembler::EQ, END);
3095     add(in, in, len, LSL, 2); // in[j+1] address
3096     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3097     mov(out, zr); // used to keep carry now
3098     BIND(LOOP);
3099     ldrw(rscratch1, Address(pre(in, -4)));
3100     madd(rscratch1, rscratch1, k, out);
3101     ldrw(rscratch2, Address(pre(offset, -4)));
3102     add(rscratch1, rscratch1, rscratch2);
3103     strw(rscratch1, Address(offset));
3104     lsr(out, rscratch1, 32);
3105     subs(len, len, 1);
3106     br(Assembler::NE, LOOP);
3107     BIND(END);
3108 }
3109 
3110 /**
3111  * Emits code to update CRC-32 with a byte value according to constants in table
3112  *
3113  * @param [in,out]crc   Register containing the crc.
3114  * @param [in]val       Register containing the byte to fold into the CRC.
3115  * @param [in]table     Register containing the table of crc constants.
3116  *
3117  * uint32_t crc;
3118  * val = crc_table[(val ^ crc) & 0xFF];
3119  * crc = val ^ (crc >> 8);
3120  *
3121  */
3122 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3123   eor(val, val, crc);
3124   andr(val, val, 0xff);
3125   ldrw(val, Address(table, val, Address::lsl(2)));
3126   eor(crc, val, crc, Assembler::LSR, 8);
3127 }
3128 
3129 /**
3130  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3131  *
3132  * @param [in,out]crc   Register containing the crc.
 * @param [in]v         Register containing the 32-bit value to fold into the CRC.
3134  * @param [in]table0    Register containing table 0 of crc constants.
3135  * @param [in]table1    Register containing table 1 of crc constants.
3136  * @param [in]table2    Register containing table 2 of crc constants.
3137  * @param [in]table3    Register containing table 3 of crc constants.
3138  *
3139  * uint32_t crc;
3140  *   v = crc ^ v
3141  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3142  *
3143  */
3144 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3145         Register table0, Register table1, Register table2, Register table3,
3146         bool upper) {
3147   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3148   uxtb(tmp, v);
3149   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3150   ubfx(tmp, v, 8, 8);
3151   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3152   eor(crc, crc, tmp);
3153   ubfx(tmp, v, 16, 8);
3154   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3155   eor(crc, crc, tmp);
3156   ubfx(tmp, v, 24, 8);
3157   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3158   eor(crc, crc, tmp);
3159 }
3160 
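// CRC-32 using the ARMv8 CRC32 extension. Conceptually each crc32x
// instruction folds eight bytes into the running crc; in C this is
// roughly (a sketch, with crc32_u64 standing in for the instruction):
//
//   while (len >= 8) {
//     crc = crc32_u64(crc, *(uint64_t*)buf);
//     buf += 8; len -= 8;
//   }
//
// The code below unrolls this by 64, 32, 4 and 1 bytes and schedules
// the loads ahead of the crc32x instructions to hide load latency.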
3161 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3162         Register len, Register tmp0, Register tmp1, Register tmp2,
3163         Register tmp3) {
3164     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3165     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3166 
3167     mvnw(crc, crc);
3168 
3169     subs(len, len, 128);
3170     br(Assembler::GE, CRC_by64_pre);
3171   BIND(CRC_less64);
3172     adds(len, len, 128-32);
3173     br(Assembler::GE, CRC_by32_loop);
3174   BIND(CRC_less32);
3175     adds(len, len, 32-4);
3176     br(Assembler::GE, CRC_by4_loop);
3177     adds(len, len, 4);
3178     br(Assembler::GT, CRC_by1_loop);
3179     b(L_exit);
3180 
3181   BIND(CRC_by32_loop);
3182     ldp(tmp0, tmp1, Address(post(buf, 16)));
3183     subs(len, len, 32);
3184     crc32x(crc, crc, tmp0);
3185     ldr(tmp2, Address(post(buf, 8)));
3186     crc32x(crc, crc, tmp1);
3187     ldr(tmp3, Address(post(buf, 8)));
3188     crc32x(crc, crc, tmp2);
3189     crc32x(crc, crc, tmp3);
3190     br(Assembler::GE, CRC_by32_loop);
3191     cmn(len, 32);
3192     br(Assembler::NE, CRC_less32);
3193     b(L_exit);
3194 
3195   BIND(CRC_by4_loop);
3196     ldrw(tmp0, Address(post(buf, 4)));
3197     subs(len, len, 4);
3198     crc32w(crc, crc, tmp0);
3199     br(Assembler::GE, CRC_by4_loop);
3200     adds(len, len, 4);
3201     br(Assembler::LE, L_exit);
3202   BIND(CRC_by1_loop);
3203     ldrb(tmp0, Address(post(buf, 1)));
3204     subs(len, len, 1);
3205     crc32b(crc, crc, tmp0);
3206     br(Assembler::GT, CRC_by1_loop);
3207     b(L_exit);
3208 
3209   BIND(CRC_by64_pre);
3210     sub(buf, buf, 8);
3211     ldp(tmp0, tmp1, Address(buf, 8));
3212     crc32x(crc, crc, tmp0);
3213     ldr(tmp2, Address(buf, 24));
3214     crc32x(crc, crc, tmp1);
3215     ldr(tmp3, Address(buf, 32));
3216     crc32x(crc, crc, tmp2);
3217     ldr(tmp0, Address(buf, 40));
3218     crc32x(crc, crc, tmp3);
3219     ldr(tmp1, Address(buf, 48));
3220     crc32x(crc, crc, tmp0);
3221     ldr(tmp2, Address(buf, 56));
3222     crc32x(crc, crc, tmp1);
3223     ldr(tmp3, Address(pre(buf, 64)));
3224 
3225     b(CRC_by64_loop);
3226 
3227     align(CodeEntryAlignment);
3228   BIND(CRC_by64_loop);
3229     subs(len, len, 64);
3230     crc32x(crc, crc, tmp2);
3231     ldr(tmp0, Address(buf, 8));
3232     crc32x(crc, crc, tmp3);
3233     ldr(tmp1, Address(buf, 16));
3234     crc32x(crc, crc, tmp0);
3235     ldr(tmp2, Address(buf, 24));
3236     crc32x(crc, crc, tmp1);
3237     ldr(tmp3, Address(buf, 32));
3238     crc32x(crc, crc, tmp2);
3239     ldr(tmp0, Address(buf, 40));
3240     crc32x(crc, crc, tmp3);
3241     ldr(tmp1, Address(buf, 48));
3242     crc32x(crc, crc, tmp0);
3243     ldr(tmp2, Address(buf, 56));
3244     crc32x(crc, crc, tmp1);
3245     ldr(tmp3, Address(pre(buf, 64)));
3246     br(Assembler::GE, CRC_by64_loop);
3247 
3248     // post-loop
3249     crc32x(crc, crc, tmp2);
3250     crc32x(crc, crc, tmp3);
3251 
3252     sub(len, len, 64);
3253     add(buf, buf, 8);
3254     cmn(len, 128);
3255     br(Assembler::NE, CRC_less64);
3256   BIND(L_exit);
3257     mvnw(crc, crc);
3258 }
3259 
3260 /**
3261  * @param crc   register containing existing CRC (32-bit)
3262  * @param buf   register pointing to input byte buffer (byte*)
3263  * @param len   register containing number of bytes
 * @param table0..table3 registers that will contain the addresses of the CRC tables
 * @param tmp, tmp2, tmp3 scratch registers
3266  */
3267 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3268         Register table0, Register table1, Register table2, Register table3,
3269         Register tmp, Register tmp2, Register tmp3) {
3270   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3271   unsigned long offset;
3272 
3273   if (UseCRC32) {
3274       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3275       return;
3276   }
3277 
3278     mvnw(crc, crc);
3279 
3280     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3281     if (offset) add(table0, table0, offset);
3282     add(table1, table0, 1*256*sizeof(juint));
3283     add(table2, table0, 2*256*sizeof(juint));
3284     add(table3, table0, 3*256*sizeof(juint));
3285 
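  // When Neon is available, fold 32 bytes per iteration using
  // polynomial (carry-less) multiplies against precomputed constants,
  // then reduce the folded remainder through the lookup tables below.
  // This is the standard carry-less-multiply CRC folding technique.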
3286   if (UseNeon) {
3287       cmp(len, 64);
3288       br(Assembler::LT, L_by16);
3289       eor(v16, T16B, v16, v16);
3290 
3291     Label L_fold;
3292 
3293       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3294 
3295       ld1(v0, v1, T2D, post(buf, 32));
3296       ld1r(v4, T2D, post(tmp, 8));
3297       ld1r(v5, T2D, post(tmp, 8));
3298       ld1r(v6, T2D, post(tmp, 8));
3299       ld1r(v7, T2D, post(tmp, 8));
3300       mov(v16, T4S, 0, crc);
3301 
3302       eor(v0, T16B, v0, v16);
3303       sub(len, len, 64);
3304 
3305     BIND(L_fold);
3306       pmull(v22, T8H, v0, v5, T8B);
3307       pmull(v20, T8H, v0, v7, T8B);
3308       pmull(v23, T8H, v0, v4, T8B);
3309       pmull(v21, T8H, v0, v6, T8B);
3310 
3311       pmull2(v18, T8H, v0, v5, T16B);
3312       pmull2(v16, T8H, v0, v7, T16B);
3313       pmull2(v19, T8H, v0, v4, T16B);
3314       pmull2(v17, T8H, v0, v6, T16B);
3315 
3316       uzp1(v24, T8H, v20, v22);
3317       uzp2(v25, T8H, v20, v22);
3318       eor(v20, T16B, v24, v25);
3319 
3320       uzp1(v26, T8H, v16, v18);
3321       uzp2(v27, T8H, v16, v18);
3322       eor(v16, T16B, v26, v27);
3323 
3324       ushll2(v22, T4S, v20, T8H, 8);
3325       ushll(v20, T4S, v20, T4H, 8);
3326 
3327       ushll2(v18, T4S, v16, T8H, 8);
3328       ushll(v16, T4S, v16, T4H, 8);
3329 
3330       eor(v22, T16B, v23, v22);
3331       eor(v18, T16B, v19, v18);
3332       eor(v20, T16B, v21, v20);
3333       eor(v16, T16B, v17, v16);
3334 
3335       uzp1(v17, T2D, v16, v20);
3336       uzp2(v21, T2D, v16, v20);
3337       eor(v17, T16B, v17, v21);
3338 
3339       ushll2(v20, T2D, v17, T4S, 16);
3340       ushll(v16, T2D, v17, T2S, 16);
3341 
3342       eor(v20, T16B, v20, v22);
3343       eor(v16, T16B, v16, v18);
3344 
3345       uzp1(v17, T2D, v20, v16);
3346       uzp2(v21, T2D, v20, v16);
3347       eor(v28, T16B, v17, v21);
3348 
3349       pmull(v22, T8H, v1, v5, T8B);
3350       pmull(v20, T8H, v1, v7, T8B);
3351       pmull(v23, T8H, v1, v4, T8B);
3352       pmull(v21, T8H, v1, v6, T8B);
3353 
3354       pmull2(v18, T8H, v1, v5, T16B);
3355       pmull2(v16, T8H, v1, v7, T16B);
3356       pmull2(v19, T8H, v1, v4, T16B);
3357       pmull2(v17, T8H, v1, v6, T16B);
3358 
3359       ld1(v0, v1, T2D, post(buf, 32));
3360 
3361       uzp1(v24, T8H, v20, v22);
3362       uzp2(v25, T8H, v20, v22);
3363       eor(v20, T16B, v24, v25);
3364 
3365       uzp1(v26, T8H, v16, v18);
3366       uzp2(v27, T8H, v16, v18);
3367       eor(v16, T16B, v26, v27);
3368 
3369       ushll2(v22, T4S, v20, T8H, 8);
3370       ushll(v20, T4S, v20, T4H, 8);
3371 
3372       ushll2(v18, T4S, v16, T8H, 8);
3373       ushll(v16, T4S, v16, T4H, 8);
3374 
3375       eor(v22, T16B, v23, v22);
3376       eor(v18, T16B, v19, v18);
3377       eor(v20, T16B, v21, v20);
3378       eor(v16, T16B, v17, v16);
3379 
3380       uzp1(v17, T2D, v16, v20);
3381       uzp2(v21, T2D, v16, v20);
3382       eor(v16, T16B, v17, v21);
3383 
3384       ushll2(v20, T2D, v16, T4S, 16);
3385       ushll(v16, T2D, v16, T2S, 16);
3386 
3387       eor(v20, T16B, v22, v20);
3388       eor(v16, T16B, v16, v18);
3389 
3390       uzp1(v17, T2D, v20, v16);
3391       uzp2(v21, T2D, v20, v16);
3392       eor(v20, T16B, v17, v21);
3393 
3394       shl(v16, T2D, v28, 1);
3395       shl(v17, T2D, v20, 1);
3396 
3397       eor(v0, T16B, v0, v16);
3398       eor(v1, T16B, v1, v17);
3399 
3400       subs(len, len, 32);
3401       br(Assembler::GE, L_fold);
3402 
3403       mov(crc, 0);
3404       mov(tmp, v0, T1D, 0);
3405       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3406       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3407       mov(tmp, v0, T1D, 1);
3408       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3409       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3410       mov(tmp, v1, T1D, 0);
3411       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3412       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3413       mov(tmp, v1, T1D, 1);
3414       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3415       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3416 
3417       add(len, len, 32);
3418   }
3419 
3420   BIND(L_by16);
3421     subs(len, len, 16);
3422     br(Assembler::GE, L_by16_loop);
3423     adds(len, len, 16-4);
3424     br(Assembler::GE, L_by4_loop);
3425     adds(len, len, 4);
3426     br(Assembler::GT, L_by1_loop);
3427     b(L_exit);
3428 
3429   BIND(L_by4_loop);
3430     ldrw(tmp, Address(post(buf, 4)));
3431     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3432     subs(len, len, 4);
3433     br(Assembler::GE, L_by4_loop);
3434     adds(len, len, 4);
3435     br(Assembler::LE, L_exit);
3436   BIND(L_by1_loop);
3437     subs(len, len, 1);
3438     ldrb(tmp, Address(post(buf, 1)));
3439     update_byte_crc32(crc, tmp, table0);
3440     br(Assembler::GT, L_by1_loop);
3441     b(L_exit);
3442 
3443     align(CodeEntryAlignment);
3444   BIND(L_by16_loop);
3445     subs(len, len, 16);
3446     ldp(tmp, tmp3, Address(post(buf, 16)));
3447     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3448     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3449     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3450     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3451     br(Assembler::GE, L_by16_loop);
3452     adds(len, len, 16-4);
3453     br(Assembler::GE, L_by4_loop);
3454     adds(len, len, 4);
3455     br(Assembler::GT, L_by1_loop);
3456   BIND(L_exit);
3457     mvnw(crc, crc);
3458 }
3459 
3460 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3461         Register len, Register tmp0, Register tmp1, Register tmp2,
3462         Register tmp3) {
3463     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3464     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3465 
3466     subs(len, len, 128);
3467     br(Assembler::GE, CRC_by64_pre);
3468   BIND(CRC_less64);
3469     adds(len, len, 128-32);
3470     br(Assembler::GE, CRC_by32_loop);
3471   BIND(CRC_less32);
3472     adds(len, len, 32-4);
3473     br(Assembler::GE, CRC_by4_loop);
3474     adds(len, len, 4);
3475     br(Assembler::GT, CRC_by1_loop);
3476     b(L_exit);
3477 
3478   BIND(CRC_by32_loop);
3479     ldp(tmp0, tmp1, Address(post(buf, 16)));
3480     subs(len, len, 32);
3481     crc32cx(crc, crc, tmp0);
3482     ldr(tmp2, Address(post(buf, 8)));
3483     crc32cx(crc, crc, tmp1);
3484     ldr(tmp3, Address(post(buf, 8)));
3485     crc32cx(crc, crc, tmp2);
3486     crc32cx(crc, crc, tmp3);
3487     br(Assembler::GE, CRC_by32_loop);
3488     cmn(len, 32);
3489     br(Assembler::NE, CRC_less32);
3490     b(L_exit);
3491 
3492   BIND(CRC_by4_loop);
3493     ldrw(tmp0, Address(post(buf, 4)));
3494     subs(len, len, 4);
3495     crc32cw(crc, crc, tmp0);
3496     br(Assembler::GE, CRC_by4_loop);
3497     adds(len, len, 4);
3498     br(Assembler::LE, L_exit);
3499   BIND(CRC_by1_loop);
3500     ldrb(tmp0, Address(post(buf, 1)));
3501     subs(len, len, 1);
3502     crc32cb(crc, crc, tmp0);
3503     br(Assembler::GT, CRC_by1_loop);
3504     b(L_exit);
3505 
3506   BIND(CRC_by64_pre);
3507     sub(buf, buf, 8);
3508     ldp(tmp0, tmp1, Address(buf, 8));
3509     crc32cx(crc, crc, tmp0);
3510     ldr(tmp2, Address(buf, 24));
3511     crc32cx(crc, crc, tmp1);
3512     ldr(tmp3, Address(buf, 32));
3513     crc32cx(crc, crc, tmp2);
3514     ldr(tmp0, Address(buf, 40));
3515     crc32cx(crc, crc, tmp3);
3516     ldr(tmp1, Address(buf, 48));
3517     crc32cx(crc, crc, tmp0);
3518     ldr(tmp2, Address(buf, 56));
3519     crc32cx(crc, crc, tmp1);
3520     ldr(tmp3, Address(pre(buf, 64)));
3521 
3522     b(CRC_by64_loop);
3523 
3524     align(CodeEntryAlignment);
3525   BIND(CRC_by64_loop);
3526     subs(len, len, 64);
3527     crc32cx(crc, crc, tmp2);
3528     ldr(tmp0, Address(buf, 8));
3529     crc32cx(crc, crc, tmp3);
3530     ldr(tmp1, Address(buf, 16));
3531     crc32cx(crc, crc, tmp0);
3532     ldr(tmp2, Address(buf, 24));
3533     crc32cx(crc, crc, tmp1);
3534     ldr(tmp3, Address(buf, 32));
3535     crc32cx(crc, crc, tmp2);
3536     ldr(tmp0, Address(buf, 40));
3537     crc32cx(crc, crc, tmp3);
3538     ldr(tmp1, Address(buf, 48));
3539     crc32cx(crc, crc, tmp0);
3540     ldr(tmp2, Address(buf, 56));
3541     crc32cx(crc, crc, tmp1);
3542     ldr(tmp3, Address(pre(buf, 64)));
3543     br(Assembler::GE, CRC_by64_loop);
3544 
3545     // post-loop
3546     crc32cx(crc, crc, tmp2);
3547     crc32cx(crc, crc, tmp3);
3548 
3549     sub(len, len, 64);
3550     add(buf, buf, 8);
3551     cmn(len, 128);
3552     br(Assembler::NE, CRC_less64);
3553   BIND(L_exit);
3554 }
3555 
3556 /**
3557  * @param crc   register containing existing CRC (32-bit)
3558  * @param buf   register pointing to input byte buffer (byte*)
3559  * @param len   register containing number of bytes
 * @param table0..table3 registers that will contain the addresses of the CRC tables
 * @param tmp, tmp2, tmp3 scratch registers
3562  */
3563 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3564         Register table0, Register table1, Register table2, Register table3,
3565         Register tmp, Register tmp2, Register tmp3) {
3566   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3567 }
3568 
3569 
3570 SkipIfEqual::SkipIfEqual(
3571     MacroAssembler* masm, const bool* flag_addr, bool value) {
3572   _masm = masm;
3573   unsigned long offset;
3574   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3575   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3576   _masm->cbzw(rscratch1, _label);
3577 }
3578 
3579 SkipIfEqual::~SkipIfEqual() {
3580   _masm->bind(_label);
3581 }
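
// Typical use (a sketch; 'SomeFlag' stands for any bool VM flag):
//
//   {
//     SkipIfEqual skip(masm, &SomeFlag, false);
//     // ... code emitted here runs only when SomeFlag is true ...
//   } // the destructor binds the skip-target label here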
3582 
3583 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3584   Address adr;
3585   switch(dst.getMode()) {
3586   case Address::base_plus_offset:
3587     // This is the expected mode, although we allow all the other
3588     // forms below.
3589     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3590     break;
3591   default:
3592     lea(rscratch2, dst);
3593     adr = Address(rscratch2);
3594     break;
3595   }
3596   ldr(rscratch1, adr);
3597   add(rscratch1, rscratch1, src);
3598   str(rscratch1, adr);
3599 }
3600 
3601 void MacroAssembler::cmpptr(Register src1, Address src2) {
3602   unsigned long offset;
3603   adrp(rscratch1, src2, offset);
3604   ldr(rscratch1, Address(rscratch1, offset));
3605   cmp(src1, rscratch1);
3606 }
3607 
3608 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3609   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3610   bs->obj_equals(this, obj1, obj2);
3611 }
3612 
3613 void MacroAssembler::load_klass(Register dst, Register src) {
3614   if (UseCompressedClassPointers) {
3615     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3616     decode_klass_not_null(dst);
3617   } else {
3618     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3619   }
3620 }
3621 
3622 // ((OopHandle)result).resolve();
3623 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3624   // OopHandle::resolve is an indirection.
3625   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3626 }
3627 
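// Load the java mirror of the method's holder, i.e. (a sketch):
//   dst = method->constMethod()->constants()->pool_holder()->java_mirror()
// then resolve the resulting OopHandle.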
3628 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3629   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3630   ldr(dst, Address(rmethod, Method::const_offset()));
3631   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3632   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3633   ldr(dst, Address(dst, mirror_offset));
3634   resolve_oop_handle(dst, tmp);
3635 }
3636 
3637 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3638   if (UseCompressedClassPointers) {
3639     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3640     if (Universe::narrow_klass_base() == NULL) {
3641       cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
3642       return;
3643     } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3644                && Universe::narrow_klass_shift() == 0) {
3645       // Only the bottom 32 bits matter
3646       cmpw(trial_klass, tmp);
3647       return;
3648     }
3649     decode_klass_not_null(tmp);
3650   } else {
3651     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3652   }
3653   cmp(trial_klass, tmp);
3654 }
3655 
3656 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3657   load_klass(dst, src);
3658   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3659 }
3660 
3661 void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store release?  Concurrent GCs assume the
  // klass length is valid if the klass field is not null.
3664   if (UseCompressedClassPointers) {
3665     encode_klass_not_null(src);
3666     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3667   } else {
3668     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3669   }
3670 }
3671 
3672 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3673   if (UseCompressedClassPointers) {
3674     // Store to klass gap in destination
3675     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3676   }
3677 }
3678 
3679 // Algorithm must match CompressedOops::encode.
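// In pseudocode, with base/shift = Universe::narrow_oop_base()/shift():
//   d = (s == NULL) ? 0 : (s - base) >> shift;
// The subs/csel/lsr sequence below computes this without a branch.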
3680 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3681 #ifdef ASSERT
3682   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3683 #endif
3684   verify_oop(s, "broken oop in encode_heap_oop");
3685   if (Universe::narrow_oop_base() == NULL) {
3686     if (Universe::narrow_oop_shift() != 0) {
3687       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3688       lsr(d, s, LogMinObjAlignmentInBytes);
3689     } else {
3690       mov(d, s);
3691     }
3692   } else {
3693     subs(d, s, rheapbase);
3694     csel(d, d, zr, Assembler::HS);
3695     lsr(d, d, LogMinObjAlignmentInBytes);
3696 
3697     /*  Old algorithm: is this any worse?
3698     Label nonnull;
3699     cbnz(r, nonnull);
3700     sub(r, r, rheapbase);
3701     bind(nonnull);
3702     lsr(r, r, LogMinObjAlignmentInBytes);
3703     */
3704   }
3705 }
3706 
3707 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3708 #ifdef ASSERT
3709   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3710   if (CheckCompressedOops) {
3711     Label ok;
3712     cbnz(r, ok);
3713     stop("null oop passed to encode_heap_oop_not_null");
3714     bind(ok);
3715   }
3716 #endif
3717   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3718   if (Universe::narrow_oop_base() != NULL) {
3719     sub(r, r, rheapbase);
3720   }
3721   if (Universe::narrow_oop_shift() != 0) {
3722     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3723     lsr(r, r, LogMinObjAlignmentInBytes);
3724   }
3725 }
3726 
3727 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3728 #ifdef ASSERT
3729   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3730   if (CheckCompressedOops) {
3731     Label ok;
3732     cbnz(src, ok);
3733     stop("null oop passed to encode_heap_oop_not_null2");
3734     bind(ok);
3735   }
3736 #endif
3737   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3738 
3739   Register data = src;
3740   if (Universe::narrow_oop_base() != NULL) {
3741     sub(dst, src, rheapbase);
3742     data = dst;
3743   }
3744   if (Universe::narrow_oop_shift() != 0) {
3745     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3746     lsr(dst, data, LogMinObjAlignmentInBytes);
3747     data = dst;
3748   }
3749   if (data == src)
3750     mov(dst, src);
3751 }
3752 
3753 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3754 #ifdef ASSERT
3755   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3756 #endif
3757   if (Universe::narrow_oop_base() == NULL) {
3758     if (Universe::narrow_oop_shift() != 0 || d != s) {
3759       lsl(d, s, Universe::narrow_oop_shift());
3760     }
3761   } else {
3762     Label done;
3763     if (d != s)
3764       mov(d, s);
3765     cbz(s, done);
3766     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3767     bind(done);
3768   }
3769   verify_oop(d, "broken oop in decode_heap_oop");
3770 }
3771 
3772 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3773   assert (UseCompressedOops, "should only be used for compressed headers");
3774   assert (Universe::heap() != NULL, "java heap should be initialized");
3775   // Cannot assert, unverified entry point counts instructions (see .ad file)
3776   // vtableStubs also counts instructions in pd_code_size_limit.
3777   // Also do not verify_oop as this is called by verify_oop.
3778   if (Universe::narrow_oop_shift() != 0) {
3779     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3780     if (Universe::narrow_oop_base() != NULL) {
3781       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3782     } else {
3783       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3784     }
3785   } else {
3786     assert (Universe::narrow_oop_base() == NULL, "sanity");
3787   }
3788 }
3789 
3790 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3791   assert (UseCompressedOops, "should only be used for compressed headers");
3792   assert (Universe::heap() != NULL, "java heap should be initialized");
3793   // Cannot assert, unverified entry point counts instructions (see .ad file)
3794   // vtableStubs also counts instructions in pd_code_size_limit.
3795   // Also do not verify_oop as this is called by verify_oop.
3796   if (Universe::narrow_oop_shift() != 0) {
3797     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3798     if (Universe::narrow_oop_base() != NULL) {
3799       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3800     } else {
3801       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3802     }
3803   } else {
3804     assert (Universe::narrow_oop_base() == NULL, "sanity");
3805     if (dst != src) {
3806       mov(dst, src);
3807     }
3808   }
3809 }
3810 
3811 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3812   if (Universe::narrow_klass_base() == NULL) {
3813     if (Universe::narrow_klass_shift() != 0) {
3814       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3815       lsr(dst, src, LogKlassAlignmentInBytes);
3816     } else {
3817       if (dst != src) mov(dst, src);
3818     }
3819     return;
3820   }
3821 
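  // When the narrow-klass base occupies bit positions disjoint from the
  // klass range, (src ^ base) == (src - base) for every valid src, so a
  // single eor can replace the sub (and, on decode, the add).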
3822   if (use_XOR_for_compressed_class_base) {
3823     if (Universe::narrow_klass_shift() != 0) {
3824       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3825       lsr(dst, dst, LogKlassAlignmentInBytes);
3826     } else {
3827       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3828     }
3829     return;
3830   }
3831 
3832   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3833       && Universe::narrow_klass_shift() == 0) {
3834     movw(dst, src);
3835     return;
3836   }
3837 
3838 #ifdef ASSERT
3839   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3840 #endif
3841 
3842   Register rbase = dst;
3843   if (dst == src) rbase = rheapbase;
3844   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3845   sub(dst, src, rbase);
3846   if (Universe::narrow_klass_shift() != 0) {
3847     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3848     lsr(dst, dst, LogKlassAlignmentInBytes);
3849   }
3850   if (dst == src) reinit_heapbase();
3851 }
3852 
3853 void MacroAssembler::encode_klass_not_null(Register r) {
3854   encode_klass_not_null(r, r);
3855 }
3856 
3857 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3858   Register rbase = dst;
3859   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3860 
3861   if (Universe::narrow_klass_base() == NULL) {
3862     if (Universe::narrow_klass_shift() != 0) {
3863       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3864       lsl(dst, src, LogKlassAlignmentInBytes);
3865     } else {
3866       if (dst != src) mov(dst, src);
3867     }
3868     return;
3869   }
3870 
3871   if (use_XOR_for_compressed_class_base) {
3872     if (Universe::narrow_klass_shift() != 0) {
3873       lsl(dst, src, LogKlassAlignmentInBytes);
3874       eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
3875     } else {
3876       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3877     }
3878     return;
3879   }
3880 
3881   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3882       && Universe::narrow_klass_shift() == 0) {
3883     if (dst != src)
3884       movw(dst, src);
3885     movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
3886     return;
3887   }
3888 
3889   // Cannot assert, unverified entry point counts instructions (see .ad file)
3890   // vtableStubs also counts instructions in pd_code_size_limit.
3891   // Also do not verify_oop as this is called by verify_oop.
3892   if (dst == src) rbase = rheapbase;
3893   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3894   if (Universe::narrow_klass_shift() != 0) {
3895     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3896     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
3897   } else {
3898     add(dst, rbase, src);
3899   }
3900   if (dst == src) reinit_heapbase();
3901 }
3902 
3903 void  MacroAssembler::decode_klass_not_null(Register r) {
3904   decode_klass_not_null(r, r);
3905 }
3906 
3907 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3908 #ifdef ASSERT
3909   {
3910     ThreadInVMfromUnknown tiv;
3911     assert (UseCompressedOops, "should only be used for compressed oops");
3912     assert (Universe::heap() != NULL, "java heap should be initialized");
3913     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3914     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3915   }
3916 #endif
3917   int oop_index = oop_recorder()->find_index(obj);
3918   InstructionMark im(this);
3919   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3920   code_section()->relocate(inst_mark(), rspec);
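  // Emit a movz/movk pair with placeholder bits (0xDEADBEEF); relocation
  // processing later patches the immediate with the real narrow oop.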
3921   movz(dst, 0xDEAD, 16);
3922   movk(dst, 0xBEEF);
3923 }
3924 
3925 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3926   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3927   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3928   int index = oop_recorder()->find_index(k);
3929   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
3930 
3931   InstructionMark im(this);
3932   RelocationHolder rspec = metadata_Relocation::spec(index);
3933   code_section()->relocate(inst_mark(), rspec);
3934   narrowKlass nk = Klass::encode_klass(k);
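  // Materialize the 32-bit narrow klass with a movz/movk pair; e.g.
  // (a sketch) nk == 0x12345678 emits:
  //   movz dst, #0x1234, lsl #16
  //   movk dst, #0x5678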
3935   movz(dst, (nk >> 16), 16);
3936   movk(dst, nk & 0xffff);
3937 }
3938 
3939 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
3940                                     Register dst, Address src,
3941                                     Register tmp1, Register thread_tmp) {
3942   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3943   decorators = AccessInternal::decorator_fixup(decorators);
3944   bool as_raw = (decorators & AS_RAW) != 0;
3945   if (as_raw) {
3946     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3947   } else {
3948     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3949   }
3950 }
3951 
3952 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
3953                                      Address dst, Register src,
3954                                      Register tmp1, Register thread_tmp) {
3955   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3956   decorators = AccessInternal::decorator_fixup(decorators);
3957   bool as_raw = (decorators & AS_RAW) != 0;
3958   if (as_raw) {
3959     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3960   } else {
3961     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3962   }
3963 }
3964 
3965 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
3966                                    Register thread_tmp, DecoratorSet decorators) {
3967   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
3968 }
3969 
3970 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
3971                                             Register thread_tmp, DecoratorSet decorators) {
3972   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
3973 }
3974 
3975 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
3976                                     Register thread_tmp, DecoratorSet decorators) {
3977   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
3978 }
3979 
3980 // Used for storing NULLs.
3981 void MacroAssembler::store_heap_oop_null(Address dst) {
3982   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
3983 }
3984 
3985 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
3986   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
3987   int index = oop_recorder()->allocate_metadata_index(obj);
3988   RelocationHolder rspec = metadata_Relocation::spec(index);
3989   return Address((address)obj, rspec);
3990 }
3991 
3992 // Move an oop into a register.  immediate is true if we want
// immediate instructions, i.e. we are not going to patch this
3994 // instruction while the code is being executed by another thread.  In
3995 // that case we can use move immediates rather than the constant pool.
3996 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
3997   int oop_index;
3998   if (obj == NULL) {
3999     oop_index = oop_recorder()->allocate_oop_index(obj);
4000   } else {
4001 #ifdef ASSERT
4002     {
4003       ThreadInVMfromUnknown tiv;
4004       assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
4005     }
4006 #endif
4007     oop_index = oop_recorder()->find_index(obj);
4008   }
4009   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4010   if (! immediate) {
4011     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4012     ldr_constant(dst, Address(dummy, rspec));
4013   } else
4014     mov(dst, Address((address)obj, rspec));
4015 }
4016 
4017 // Move a metadata address into a register.
4018 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4019   int oop_index;
4020   if (obj == NULL) {
4021     oop_index = oop_recorder()->allocate_metadata_index(obj);
4022   } else {
4023     oop_index = oop_recorder()->find_index(obj);
4024   }
4025   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4026   mov(dst, Address((address)obj, rspec));
4027 }
4028 
4029 Address MacroAssembler::constant_oop_address(jobject obj) {
4030 #ifdef ASSERT
4031   {
4032     ThreadInVMfromUnknown tiv;
4033     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4034     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
4035   }
4036 #endif
4037   int oop_index = oop_recorder()->find_index(obj);
4038   return Address((address)obj, oop_Relocation::spec(oop_index));
4039 }
4040 
4041 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4042 void MacroAssembler::tlab_allocate(Register obj,
4043                                    Register var_size_in_bytes,
4044                                    int con_size_in_bytes,
4045                                    Register t1,
4046                                    Register t2,
4047                                    Label& slow_case) {
4048   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4049   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4050 }
4051 
4052 // Defines obj, preserves var_size_in_bytes
4053 void MacroAssembler::eden_allocate(Register obj,
4054                                    Register var_size_in_bytes,
4055                                    int con_size_in_bytes,
4056                                    Register t1,
4057                                    Label& slow_case) {
4058   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4059   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4060 }
4061 
4062 // Zero words; len is in bytes
4063 // Destroys all registers except addr
4064 // len must be a nonzero multiple of wordSize
4065 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4066   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4067 
4068 #ifdef ASSERT
4069   { Label L;
4070     tst(len, BytesPerWord - 1);
4071     br(Assembler::EQ, L);
4072     stop("len is not a multiple of BytesPerWord");
4073     bind(L);
4074   }
4075 #endif
4076 
4077 #ifndef PRODUCT
4078   block_comment("zero memory");
4079 #endif
4080 
4081   Label loop;
4082   Label entry;
4083 
4084 //  Algorithm:
4085 //
4086 //    scratch1 = cnt & 7;
4087 //    cnt -= scratch1;
4088 //    p += scratch1;
4089 //    switch (scratch1) {
4090 //      do {
4091 //        cnt -= 8;
4092 //          p[-8] = 0;
4093 //        case 7:
4094 //          p[-7] = 0;
4095 //        case 6:
4096 //          p[-6] = 0;
4097 //          // ...
4098 //        case 1:
4099 //          p[-1] = 0;
4100 //        case 0:
4101 //          p += 8;
4102 //      } while (cnt);
4103 //    }
4104 
4105   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4106 
4107   lsr(len, len, LogBytesPerWord);
  andr(rscratch1, len, unroll - 1);  // rscratch1 = cnt % unroll
  sub(len, len, rscratch1);          // cnt -= cnt % unroll
4110   // t1 always points to the end of the region we're about to zero
4111   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
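  // Duff's-device-style entry: branch back from 'entry' by
  // (cnt % unroll) str instructions, so the remainder words are zeroed
  // by the tail of the first, partial loop iteration.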
4112   adr(rscratch2, entry);
4113   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4114   br(rscratch2);
4115   bind(loop);
4116   sub(len, len, unroll);
4117   for (int i = -unroll; i < 0; i++)
4118     Assembler::str(zr, Address(t1, i * wordSize));
4119   bind(entry);
4120   add(t1, t1, unroll * wordSize);
4121   cbnz(len, loop);
4122 }
4123 
4124 void MacroAssembler::verify_tlab() {
4125 #ifdef ASSERT
4126   if (UseTLAB && VerifyOops) {
4127     Label next, ok;
4128 
4129     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4130 
4131     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4132     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4133     cmp(rscratch2, rscratch1);
4134     br(Assembler::HS, next);
4135     STOP("assert(top >= start)");
4136     should_not_reach_here();
4137 
4138     bind(next);
4139     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4140     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4141     cmp(rscratch2, rscratch1);
4142     br(Assembler::HS, ok);
4143     STOP("assert(top <= end)");
4144     should_not_reach_here();
4145 
4146     bind(ok);
4147     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4148   }
4149 #endif
4150 }
4151 
// Writes to successive stack pages until the given offset is reached,
// to check for stack overflow plus shadow pages.  This clobbers tmp.
4154 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4155   assert_different_registers(tmp, size, rscratch1);
4156   mov(tmp, sp);
4157   // Bang stack for total size given plus shadow page size.
4158   // Bang one page at a time because large size can bang beyond yellow and
4159   // red zones.
4160   Label loop;
4161   mov(rscratch1, os::vm_page_size());
4162   bind(loop);
4163   lea(tmp, Address(tmp, -os::vm_page_size()));
4164   subsw(size, size, rscratch1);
4165   str(size, Address(tmp));
4166   br(Assembler::GT, loop);
4167 
4168   // Bang down shadow pages too.
4169   // At this point, (tmp-0) is the last address touched, so don't
4170   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4171   // was post-decremented.)  Skip this address by starting at i=1, and
4172   // touch a few more pages below.  N.B.  It is important to touch all
4173   // the way down to and including i=StackShadowPages.
4174   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
    // This could be any sized move, but it can serve as a debugging
    // crumb, so the bigger the better.
4177     lea(tmp, Address(tmp, -os::vm_page_size()));
4178     str(size, Address(tmp));
4179   }
4180 }
4181 
4182 
4183 // Move the address of the polling page into dest.
4184 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4185   if (SafepointMechanism::uses_thread_local_poll()) {
4186     ldr(dest, Address(rthread, Thread::polling_page_offset()));
4187   } else {
4188     unsigned long off;
4189     adrp(dest, Address(page, rtype), off);
4190     assert(off == 0, "polling page must be page aligned");
4191   }
4192 }
4193 
4194 // Move the address of the polling page into r, then read the polling
4195 // page.
4196 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4197   get_polling_page(r, page, rtype);
4198   return read_polling_page(r, rtype);
4199 }
4200 
4201 // Read the polling page.  The address of the polling page must
4202 // already be in r.
4203 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4204   InstructionMark im(this);
4205   code_section()->relocate(inst_mark(), rtype);
4206   ldrw(zr, Address(r, 0));
4207   return inst_mark();
4208 }
4209 
4210 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4211   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4212   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4213   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4214   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4215   long offset_low = dest_page - low_page;
4216   long offset_high = dest_page - high_page;
4217 
4218   assert(is_valid_AArch64_address(dest.target()), "bad address");
4219   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4220 
4221   InstructionMark im(this);
4222   code_section()->relocate(inst_mark(), dest.rspec());
4223   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4224   // the code cache so that if it is relocated we know it will still reach
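  // adrp's signed 21-bit page immediate reaches +/-1M pages, i.e. +/-4GB.
  // If both ends of the code cache lie within that range of the target we
  // can always use a plain adrp; otherwise we emit an adrp whose low 32
  // bits match the target and patch bits 32-47 with a movk below.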
4225   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4226     _adrp(reg1, dest.target());
4227   } else {
4228     unsigned long target = (unsigned long)dest.target();
4229     unsigned long adrp_target
4230       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4231 
4232     _adrp(reg1, (address)adrp_target);
4233     movk(reg1, target >> 32, 32);
4234   }
4235   byte_offset = (unsigned long)dest.target() & 0xfff;
4236 }
4237 
4238 void MacroAssembler::load_byte_map_base(Register reg) {
4239   jbyte *byte_map_base =
4240     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4241 
4242   if (is_valid_AArch64_address((address)byte_map_base)) {
4243     // Strictly speaking the byte_map_base isn't an address at all,
4244     // and it might even be negative.
4245     unsigned long offset;
4246     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4247     // We expect offset to be zero with most collectors.
4248     if (offset != 0) {
4249       add(reg, reg, offset);
4250     }
4251   } else {
4252     mov(reg, (uint64_t)byte_map_base);
4253   }
4254 }
4255 
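// Allocate a stack frame of 'framesize' bytes and save rfp/lr at its
// top. Resulting layout (a sketch; higher addresses first):
//
//   old sp ->
//     saved lr     at [sp, framesize - wordSize]
//     saved rfp    at [sp, framesize - 2 * wordSize]
//     ... spill slots and locals ...
//   new sp ->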
4256 void MacroAssembler::build_frame(int framesize) {
4257   assert(framesize > 0, "framesize must be > 0");
4258   if (framesize < ((1 << 9) + 2 * wordSize)) {
4259     sub(sp, sp, framesize);
4260     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4261     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4262   } else {
4263     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4264     if (PreserveFramePointer) mov(rfp, sp);
4265     if (framesize < ((1 << 12) + 2 * wordSize))
4266       sub(sp, sp, framesize - 2 * wordSize);
4267     else {
4268       mov(rscratch1, framesize - 2 * wordSize);
4269       sub(sp, sp, rscratch1);
4270     }
4271   }
4272 }
4273 
4274 void MacroAssembler::remove_frame(int framesize) {
4275   assert(framesize > 0, "framesize must be > 0");
4276   if (framesize < ((1 << 9) + 2 * wordSize)) {
4277     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4278     add(sp, sp, framesize);
4279   } else {
4280     if (framesize < ((1 << 12) + 2 * wordSize))
4281       add(sp, sp, framesize - 2 * wordSize);
4282     else {
4283       mov(rscratch1, framesize - 2 * wordSize);
4284       add(sp, sp, rscratch1);
4285     }
4286     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4287   }
4288 }
4289 
4290 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4291 
4292 // Search for str1 in str2 and return index or -1
4293 void MacroAssembler::string_indexof(Register str2, Register str1,
4294                                     Register cnt2, Register cnt1,
4295                                     Register tmp1, Register tmp2,
4296                                     Register tmp3, Register tmp4,
4297                                     Register tmp5, Register tmp6,
4298                                     int icnt1, Register result, int ae) {
  // NOTE: tmp5 and tmp6 can be zr depending on the specific method version
4300   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4301 
4302   Register ch1 = rscratch1;
4303   Register ch2 = rscratch2;
4304   Register cnt1tmp = tmp1;
4305   Register cnt2tmp = tmp2;
4306   Register cnt1_neg = cnt1;
4307   Register cnt2_neg = cnt2;
4308   Register result_tmp = tmp4;
4309 
4310   bool isL = ae == StrIntrinsicNode::LL;
4311 
4312   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4313   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4314   int str1_chr_shift = str1_isL ? 0:1;
4315   int str2_chr_shift = str2_isL ? 0:1;
4316   int str1_chr_size = str1_isL ? 1:2;
4317   int str2_chr_size = str2_isL ? 1:2;
4318   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4319                                       (chr_insn)&MacroAssembler::ldrh;
4320   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4321                                       (chr_insn)&MacroAssembler::ldrh;
4322   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4323   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4324 
  // Note: inline_string_indexOf() generates these checks:
4326   // if (substr.count > string.count) return -1;
4327   // if (substr.count == 0) return 0;
4328 
4329   // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the first occurrence of the pattern in the source or return -1.
4331 
  // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
4333   // With a small pattern and source we use linear scan.
4334 
4335   if (icnt1 == -1) {
4336     sub(result_tmp, cnt2, cnt1);
4337     cmp(cnt1, 8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4338     br(LT, LINEARSEARCH);
    dup(v0, T16B, cnt1); // done in a separate FPU pipeline, almost no penalty
4340     cmp(cnt1, 256);
4341     lsr(tmp1, cnt2, 2);
    ccmp(cnt1, tmp1, 0b0000, LT); // source must be > 4 * pattern length for BM
4343     br(GE, LINEARSTUB);
4344   }
4345 
// The Boyer-Moore algorithm is based on the description here:
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with two shift rules: the 'Bad Character'
// rule and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few Java-specific optimizations.
4364 //
4365 // #define ASIZE 256
4366 //
4367 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4368 //       int i, j;
4369 //       unsigned c;
4370 //       unsigned char bc[ASIZE];
4371 //
4372 //       /* Preprocessing */
4373 //       for (i = 0; i < ASIZE; ++i)
4374 //          bc[i] = m;
4375 //       for (i = 0; i < m - 1; ) {
4376 //          c = x[i];
4377 //          ++i;
4378 //          // c < 256 for Latin1 string, so, no need for branch
4379 //          #ifdef PATTERN_STRING_IS_LATIN1
4380 //          bc[c] = m - i;
4381 //          #else
4382 //          if (c < ASIZE) bc[c] = m - i;
4383 //          #endif
4384 //       }
4385 //
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[j+m-1];
//          if (x[m-1] == c) {
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//            if (i < 0) return j;
//          }
//          // c < 256 for a Latin1 string, so no need for a branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) always true. Remove branch
//          j += bc[c];
//          #endif
//          #ifdef SOURCE_AND_PATTERN_STRINGS_ARE_UTF
//          // UU case: need the (c < ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[c];
//          else
//            j += 1;
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need the (c < ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[c];
//          else
//            j += m;
//          #endif
//       }
//    }
4414 
4415   if (icnt1 == -1) {
4416     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4417         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4418     Register cnt1end = tmp2;
4419     Register str2end = cnt2;
4420     Register skipch = tmp2;
4421 
    // str1 length is >= 8, so we can read at least one full register for
    // the cases where no UTF->Latin1 conversion is needed (8 chars for LL
    // or 4 for UU) and half a register for the UL case. We re-read the
    // last character in the inner pre-loop code so that a single load
    // suffices in the outer pre-loop.
4426     const int firstStep = isL ? 7 : 3;
4427 
4428     const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
4430     sub(sp, sp, ASIZE);
4431     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4432     mov(ch1, sp);
4433     BIND(BM_INIT_LOOP);
4434       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4435       subs(tmp5, tmp5, 1);
4436       br(GT, BM_INIT_LOOP);
4437 
4438       sub(cnt1tmp, cnt1, 1);
4439       mov(tmp5, str2);
4440       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4441       sub(ch2, cnt1, 1);
4442       mov(tmp3, str1);
4443     BIND(BCLOOP);
4444       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4445       if (!str1_isL) {
4446         cmp(ch1, ASIZE);
4447         br(HS, BCSKIP);
4448       }
4449       strb(ch2, Address(sp, ch1));
4450     BIND(BCSKIP);
4451       subs(ch2, ch2, 1);
4452       br(GT, BCLOOP);
4453 
4454       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4455       if (str1_isL == str2_isL) {
4456         // load last 8 bytes (8LL/4UU symbols)
4457         ldr(tmp6, Address(tmp6, -wordSize));
4458       } else {
        ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
        // convert Latin1 to UTF. We have to wait for the load to complete,
        // but it's still faster than per-character loads and checks
4462         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4463         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4464         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4465         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4466         orr(ch2, ch1, ch2, LSL, 16);
4467         orr(tmp6, tmp6, tmp3, LSL, 48);
4468         orr(tmp6, tmp6, ch2, LSL, 16);
4469       }
4470     BIND(BMLOOPSTR2);
4471       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4472       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4473       if (str1_isL == str2_isL) {
        // re-init tmp3. It's free because it executes in parallel with the
        // load above. The alternative is to initialize it before the loop,
        // but that would hurt performance on in-order systems with 2 or
        // more ld/st pipelines
4477         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4478       }
4479       if (!isL) { // UU/UL case
4480         lsl(ch2, cnt1tmp, 1); // offset in bytes
4481       }
4482       cmp(tmp3, skipch);
4483       br(NE, BMSKIP);
4484       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4485       mov(ch1, tmp6);
4486       if (isL) {
4487         b(BMLOOPSTR1_AFTER_LOAD);
4488       } else {
4489         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4490         b(BMLOOPSTR1_CMP);
4491       }
4492     BIND(BMLOOPSTR1);
4493       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4494       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4495     BIND(BMLOOPSTR1_AFTER_LOAD);
4496       subs(cnt1tmp, cnt1tmp, 1);
4497       br(LT, BMLOOPSTR1_LASTCMP);
4498     BIND(BMLOOPSTR1_CMP);
4499       cmp(ch1, ch2);
4500       br(EQ, BMLOOPSTR1);
4501     BIND(BMSKIP);
4502       if (!isL) {
        // if we've met a UTF symbol while searching for a Latin1 pattern,
        // then we can skip cnt1 symbols
4505         if (str1_isL != str2_isL) {
4506           mov(result_tmp, cnt1);
4507         } else {
4508           mov(result_tmp, 1);
4509         }
4510         cmp(skipch, ASIZE);
4511         br(HS, BMADV);
4512       }
4513       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4514     BIND(BMADV);
4515       sub(cnt1tmp, cnt1, 1);
4516       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4517       cmp(str2, str2end);
4518       br(LE, BMLOOPSTR2);
4519       add(sp, sp, ASIZE);
4520       b(NOMATCH);
4521     BIND(BMLOOPSTR1_LASTCMP);
4522       cmp(ch1, ch2);
4523       br(NE, BMSKIP);
4524     BIND(BMMATCH);
4525       sub(result, str2, tmp5);
4526       if (!str2_isL) lsr(result, result, 1);
4527       add(sp, sp, ASIZE);
4528       b(DONE);
4529 
4530     BIND(LINEARSTUB);
    cmp(cnt1, 16); // small patterns should still be handled by the simple algorithm
4532     br(LT, LINEAR_MEDIUM);
4533     mov(result, zr);
4534     RuntimeAddress stub = NULL;
4535     if (isL) {
4536       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4537       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4538     } else if (str1_isL) {
4539       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4541     } else {
4542       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4543       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4544     }
4545     trampoline_call(stub);
4546     b(DONE);
4547   }
4548 
4549   BIND(LINEARSEARCH);
4550   {
4551     Label DO1, DO2, DO3;
4552 
4553     Register str2tmp = tmp2;
4554     Register first = tmp3;
4555 
4556     if (icnt1 == -1)
4557     {
4558         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4559 
4560         cmp(cnt1, str1_isL == str2_isL ? 4 : 2);
4561         br(LT, DOSHORT);
4562       BIND(LINEAR_MEDIUM);
4563         (this->*str1_load_1chr)(first, Address(str1));
4564         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4565         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4566         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4567         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4568 
4569       BIND(FIRST_LOOP);
4570         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4571         cmp(first, ch2);
4572         br(EQ, STR1_LOOP);
4573       BIND(STR2_NEXT);
4574         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4575         br(LE, FIRST_LOOP);
4576         b(NOMATCH);
4577 
4578       BIND(STR1_LOOP);
4579         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4580         add(cnt2tmp, cnt2_neg, str2_chr_size);
4581         br(GE, MATCH);
4582 
4583       BIND(STR1_NEXT);
4584         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4585         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4586         cmp(ch1, ch2);
4587         br(NE, STR2_NEXT);
4588         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4589         add(cnt2tmp, cnt2tmp, str2_chr_size);
4590         br(LT, STR1_NEXT);
4591         b(MATCH);
4592 
4593       BIND(DOSHORT);
4594       if (str1_isL == str2_isL) {
4595         cmp(cnt1, 2);
4596         br(LT, DO1);
4597         br(GT, DO3);
4598       }
4599     }
4600 
4601     if (icnt1 == 4) {
4602       Label CH1_LOOP;
4603 
4604         (this->*load_4chr)(ch1, str1);
4605         sub(result_tmp, cnt2, 4);
4606         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4607         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4608 
4609       BIND(CH1_LOOP);
4610         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4611         cmp(ch1, ch2);
4612         br(EQ, MATCH);
4613         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4614         br(LE, CH1_LOOP);
4615         b(NOMATCH);
4616       }
4617 
4618     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4619       Label CH1_LOOP;
4620 
4621       BIND(DO2);
4622         (this->*load_2chr)(ch1, str1);
4623         if (icnt1 == 2) {
4624           sub(result_tmp, cnt2, 2);
4625         }
4626         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4627         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4628       BIND(CH1_LOOP);
4629         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4630         cmp(ch1, ch2);
4631         br(EQ, MATCH);
4632         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4633         br(LE, CH1_LOOP);
4634         b(NOMATCH);
4635     }
4636 
4637     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4638       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4639 
4640       BIND(DO3);
4641         (this->*load_2chr)(first, str1);
4642         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4643         if (icnt1 == 3) {
4644           sub(result_tmp, cnt2, 3);
4645         }
4646         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4647         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4648       BIND(FIRST_LOOP);
4649         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4650         cmpw(first, ch2);
4651         br(EQ, STR1_LOOP);
4652       BIND(STR2_NEXT);
4653         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4654         br(LE, FIRST_LOOP);
4655         b(NOMATCH);
4656 
4657       BIND(STR1_LOOP);
4658         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4659         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4660         cmp(ch1, ch2);
4661         br(NE, STR2_NEXT);
4662         b(MATCH);
4663     }
4664 
4665     if (icnt1 == -1 || icnt1 == 1) {
4666       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4667 
4668       BIND(DO1);
4669         (this->*str1_load_1chr)(ch1, str1);
4670         cmp(cnt2, 8);
4671         br(LT, DO1_SHORT);
4672 
4673         sub(result_tmp, cnt2, 8/str2_chr_size);
4674         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4675         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4676         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4677 
4678         if (str2_isL) {
4679           orr(ch1, ch1, ch1, LSL, 8);
4680         }
4681         orr(ch1, ch1, ch1, LSL, 16);
4682         orr(ch1, ch1, ch1, LSL, 32);
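        // SWAR match detection: XOR-ing a chunk of str2 with the
        // replicated pattern char zeroes each matching element. The
        // classic zero detector
        //   (v - 0x01...01) & ~v & 0x80...80
        // is computed below as (v - tmp3) &~ (v | 0x7f...7f), which sets
        // the high bit of each matching byte (or halfword for UTF-16).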
4683       BIND(CH1_LOOP);
4684         ldr(ch2, Address(str2, cnt2_neg));
4685         eor(ch2, ch1, ch2);
4686         sub(tmp1, ch2, tmp3);
4687         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4688         bics(tmp1, tmp1, tmp2);
4689         br(NE, HAS_ZERO);
4690         adds(cnt2_neg, cnt2_neg, 8);
4691         br(LT, CH1_LOOP);
4692 
4693         cmp(cnt2_neg, 8);
4694         mov(cnt2_neg, 0);
4695         br(LT, CH1_LOOP);
4696         b(NOMATCH);
4697 
4698       BIND(HAS_ZERO);
4699         rev(tmp1, tmp1);
4700         clz(tmp1, tmp1);
4701         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4702         b(MATCH);
4703 
4704       BIND(DO1_SHORT);
4705         mov(result_tmp, cnt2);
4706         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4707         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4708       BIND(DO1_LOOP);
4709         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4710         cmpw(ch1, ch2);
4711         br(EQ, MATCH);
4712         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4713         br(LT, DO1_LOOP);
4714     }
4715   }
4716   BIND(NOMATCH);
4717     mov(result, -1);
4718     b(DONE);
4719   BIND(MATCH);
4720     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4721   BIND(DONE);
4722 }
4723 
4724 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4725 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4726 
4727 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4728                                          Register ch, Register result,
4729                                          Register tmp1, Register tmp2, Register tmp3)
4730 {
4731   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4732   Register cnt1_neg = cnt1;
4733   Register ch1 = rscratch1;
4734   Register result_tmp = rscratch2;
4735 
4736   cbz(cnt1, NOMATCH);
4737 
4738   cmp(cnt1, 4);
4739   br(LT, DO1_SHORT);
4740 
4741   orr(ch, ch, ch, LSL, 16);
4742   orr(ch, ch, ch, LSL, 32);
4743 
4744   sub(cnt1, cnt1, 4);
4745   mov(result_tmp, cnt1);
4746   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4747   sub(cnt1_neg, zr, cnt1, LSL, 1);
4748 
4749   mov(tmp3, 0x0001000100010001);
4750 
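  // Same SWAR zero-detection scheme as in string_indexof above,
  // specialized for 16-bit chars.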
4751   BIND(CH1_LOOP);
4752     ldr(ch1, Address(str1, cnt1_neg));
4753     eor(ch1, ch, ch1);
4754     sub(tmp1, ch1, tmp3);
4755     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4756     bics(tmp1, tmp1, tmp2);
4757     br(NE, HAS_ZERO);
4758     adds(cnt1_neg, cnt1_neg, 8);
4759     br(LT, CH1_LOOP);
4760 
4761     cmp(cnt1_neg, 8);
4762     mov(cnt1_neg, 0);
4763     br(LT, CH1_LOOP);
4764     b(NOMATCH);
4765 
4766   BIND(HAS_ZERO);
4767     rev(tmp1, tmp1);
4768     clz(tmp1, tmp1);
4769     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4770     b(MATCH);
4771 
4772   BIND(DO1_SHORT);
4773     mov(result_tmp, cnt1);
4774     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4775     sub(cnt1_neg, zr, cnt1, LSL, 1);
4776   BIND(DO1_LOOP);
4777     ldrh(ch1, Address(str1, cnt1_neg));
4778     cmpw(ch, ch1);
4779     br(EQ, MATCH);
4780     adds(cnt1_neg, cnt1_neg, 2);
4781     br(LT, DO1_LOOP);
4782   BIND(NOMATCH);
4783     mov(result, -1);
4784     b(DONE);
4785   BIND(MATCH);
4786     add(result, result_tmp, cnt1_neg, ASR, 1);
4787   BIND(DONE);
4788 }
4789 
4790 // Compare strings.
4791 void MacroAssembler::string_compare(Register str1, Register str2,
4792     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4793     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4794   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4795       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4796       SHORT_LOOP_START, TAIL_CHECK;
4797 
4798   const int STUB_THRESHOLD = 64 + 8;
4799   bool isLL = ae == StrIntrinsicNode::LL;
4800   bool isLU = ae == StrIntrinsicNode::LU;
4801   bool isUL = ae == StrIntrinsicNode::UL;
4802 
4803   bool str1_isL = isLL || isLU;
4804   bool str2_isL = isLL || isUL;
4805 
4806   int str1_chr_shift = str1_isL ? 0 : 1;
4807   int str2_chr_shift = str2_isL ? 0 : 1;
4808   int str1_chr_size = str1_isL ? 1 : 2;
4809   int str2_chr_size = str2_isL ? 1 : 2;
4810   int minCharsInWord = isLL ? wordSize : wordSize/2;
4811 
4812   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4813   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4814                                       (chr_insn)&MacroAssembler::ldrh;
4815   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4816                                       (chr_insn)&MacroAssembler::ldrh;
4817   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4818                             (uxt_insn)&MacroAssembler::uxthw;
4819 
4820   BLOCK_COMMENT("string_compare {");
4821 
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; the result, however, is always in characters.
4824   if (!str1_isL) asrw(cnt1, cnt1, 1);
4825   if (!str2_isL) asrw(cnt2, cnt2, 1);
4826 
4827   // Compute the minimum of the string lengths and save the difference.
4828   subsw(result, cnt1, cnt2);
4829   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4830 
4831   // A very short string
4832   cmpw(cnt2, minCharsInWord);
4833   br(Assembler::LE, SHORT_STRING);
4834 
4835   // Compare longwords
4836   // load first parts of strings and finish initialization while loading
4837   {
4838     if (str1_isL == str2_isL) { // LL or UU
4839       ldr(tmp1, Address(str1));
4840       cmp(str1, str2);
4841       br(Assembler::EQ, DONE);
4842       ldr(tmp2, Address(str2));
4843       cmp(cnt2, STUB_THRESHOLD);
4844       br(GE, STUB);
4845       subsw(cnt2, cnt2, minCharsInWord);
4846       br(EQ, TAIL_CHECK);
4847       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4848       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4849       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4850     } else if (isLU) {
4851       ldrs(vtmp, Address(str1));
4852       cmp(str1, str2);
4853       br(Assembler::EQ, DONE);
4854       ldr(tmp2, Address(str2));
4855       cmp(cnt2, STUB_THRESHOLD);
4856       br(GE, STUB);
4857       subw(cnt2, cnt2, 4);
4858       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4859       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4860       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4861       zip1(vtmp, T8B, vtmp, vtmpZ);
4862       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4863       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4864       add(cnt1, cnt1, 4);
4865       fmovd(tmp1, vtmp);
4866     } else { // UL case
4867       ldr(tmp1, Address(str1));
4868       cmp(str1, str2);
4869       br(Assembler::EQ, DONE);
4870       ldrs(vtmp, Address(str2));
4871       cmp(cnt2, STUB_THRESHOLD);
4872       br(GE, STUB);
4873       subw(cnt2, cnt2, 4);
4874       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4875       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4876       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4877       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4878       zip1(vtmp, T8B, vtmp, vtmpZ);
4879       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4880       add(cnt1, cnt1, 8);
4881       fmovd(tmp2, vtmp);
4882     }
4883     adds(cnt2, cnt2, isUL ? 4 : 8);
4884     br(GE, TAIL);
4885     eor(rscratch2, tmp1, tmp2);
4886     cbnz(rscratch2, DIFFERENCE);
4887     // main loop
4888     bind(NEXT_WORD);
4889     if (str1_isL == str2_isL) {
4890       ldr(tmp1, Address(str1, cnt2));
4891       ldr(tmp2, Address(str2, cnt2));
4892       adds(cnt2, cnt2, 8);
4893     } else if (isLU) {
4894       ldrs(vtmp, Address(str1, cnt1));
4895       ldr(tmp2, Address(str2, cnt2));
4896       add(cnt1, cnt1, 4);
4897       zip1(vtmp, T8B, vtmp, vtmpZ);
4898       fmovd(tmp1, vtmp);
4899       adds(cnt2, cnt2, 8);
4900     } else { // UL
4901       ldrs(vtmp, Address(str2, cnt2));
4902       ldr(tmp1, Address(str1, cnt1));
4903       zip1(vtmp, T8B, vtmp, vtmpZ);
4904       add(cnt1, cnt1, 8);
4905       fmovd(tmp2, vtmp);
4906       adds(cnt2, cnt2, 4);
4907     }
4908     br(GE, TAIL);
4909 
4910     eor(rscratch2, tmp1, tmp2);
4911     cbz(rscratch2, NEXT_WORD);
4912     b(DIFFERENCE);
4913     bind(TAIL);
4914     eor(rscratch2, tmp1, tmp2);
4915     cbnz(rscratch2, DIFFERENCE);
4916     // Last longword.  In the case where length == 4 we compare the
4917     // same longword twice, but that's still faster than another
4918     // conditional branch.
4919     if (str1_isL == str2_isL) {
4920       ldr(tmp1, Address(str1));
4921       ldr(tmp2, Address(str2));
4922     } else if (isLU) {
4923       ldrs(vtmp, Address(str1));
4924       ldr(tmp2, Address(str2));
4925       zip1(vtmp, T8B, vtmp, vtmpZ);
4926       fmovd(tmp1, vtmp);
4927     } else { // UL
4928       ldrs(vtmp, Address(str2));
4929       ldr(tmp1, Address(str1));
4930       zip1(vtmp, T8B, vtmp, vtmpZ);
4931       fmovd(tmp2, vtmp);
4932     }
4933     bind(TAIL_CHECK);
4934     eor(rscratch2, tmp1, tmp2);
4935     cbz(rscratch2, DONE);
4936 
4937     // Find the first different characters in the longwords and
4938     // compute their difference.
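    // rev byte-reverses the XOR of the two words, so clz of the result,
    // rounded down to an 8- or 16-bit boundary, is the right shift that
    // brings the first (lowest-addressed, since the loads are
    // little-endian) differing character to bit 0 of each word.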
4939     bind(DIFFERENCE);
4940     rev(rscratch2, rscratch2);
4941     clz(rscratch2, rscratch2);
4942     andr(rscratch2, rscratch2, isLL ? -8 : -16);
4943     lsrv(tmp1, tmp1, rscratch2);
4944     (this->*ext_chr)(tmp1, tmp1);
4945     lsrv(tmp2, tmp2, rscratch2);
4946     (this->*ext_chr)(tmp2, tmp2);
4947     subw(result, tmp1, tmp2);
4948     b(DONE);
4949   }
4950 
4951   bind(STUB);
4952     RuntimeAddress stub = NULL;
4953     switch(ae) {
4954       case StrIntrinsicNode::LL:
4955         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
4956         break;
4957       case StrIntrinsicNode::UU:
4958         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
4959         break;
4960       case StrIntrinsicNode::LU:
4961         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
4962         break;
4963       case StrIntrinsicNode::UL:
4964         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
4965         break;
4966       default:
4967         ShouldNotReachHere();
4968      }
4969     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
4970     trampoline_call(stub);
4971     b(DONE);
4972 
4973   bind(SHORT_STRING);
4974   // Is the minimum length zero?
4975   cbz(cnt2, DONE);
  // arrange the code to do most branches while loading, and to load the
  // next characters while comparing the previous ones
4978   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
4979   subs(cnt2, cnt2, 1);
4980   br(EQ, SHORT_LAST_INIT);
4981   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
4982   b(SHORT_LOOP_START);
4983   bind(SHORT_LOOP);
4984   subs(cnt2, cnt2, 1);
4985   br(EQ, SHORT_LAST);
4986   bind(SHORT_LOOP_START);
4987   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
4988   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
4989   cmp(tmp1, cnt1);
4990   br(NE, SHORT_LOOP_TAIL);
4991   subs(cnt2, cnt2, 1);
4992   br(EQ, SHORT_LAST2);
4993   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
4994   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
4995   cmp(tmp2, rscratch1);
4996   br(EQ, SHORT_LOOP);
4997   sub(result, tmp2, rscratch1);
4998   b(DONE);
4999   bind(SHORT_LOOP_TAIL);
5000   sub(result, tmp1, cnt1);
5001   b(DONE);
5002   bind(SHORT_LAST2);
5003   cmp(tmp2, rscratch1);
5004   br(EQ, DONE);
5005   sub(result, tmp2, rscratch1);
5006 
5007   b(DONE);
5008   bind(SHORT_LAST_INIT);
5009   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5010   bind(SHORT_LAST);
5011   cmp(tmp1, cnt1);
5012   br(EQ, DONE);
5013   sub(result, tmp1, cnt1);
5014 
5015   bind(DONE);
5016 
5017   BLOCK_COMMENT("} string_compare");
5018 }
5019 
// This method checks whether the provided byte array contains a byte with
// the highest bit set.
void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
    // The simple and most common case, a small aligned array that is not at
    // the end of a memory page, is handled here. All other cases go to the
    // stub.
5024     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
5025     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5026     assert_different_registers(ary1, len, result);
5027 
5028     cmpw(len, 0);
5029     br(LE, SET_RESULT);
5030     cmpw(len, 4 * wordSize);
5031     br(GE, STUB_LONG); // size > 32 then go to stub
5032 
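    // Page-crossing check: shifting the address left by
    // (64 - log2(page_size)) leaves only the in-page offset in the top
    // bits; adding the identically scaled read size then sets the carry
    // flag iff a 32-byte read from ary1 would cross a page boundary.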
5033     int shift = 64 - exact_log2(os::vm_page_size());
5034     lsl(rscratch1, ary1, shift);
5035     mov(rscratch2, (size_t)(4 * wordSize) << shift);
5036     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
5037     br(CS, STUB); // at the end of page then go to stub
5038     subs(len, len, wordSize);
5039     br(LT, END);
5040 
5041   BIND(LOOP);
5042     ldr(rscratch1, Address(post(ary1, wordSize)));
5043     tst(rscratch1, UPPER_BIT_MASK);
5044     br(NE, SET_RESULT);
5045     subs(len, len, wordSize);
5046     br(GE, LOOP);
5047     cmpw(len, -wordSize);
5048     br(EQ, SET_RESULT);
5049 
5050   BIND(END);
5051     ldr(result, Address(ary1));
    sub(len, zr, len, LSL, 3); // LSL 3 converts bytes to bits
5053     lslv(result, result, len);
5054     tst(result, UPPER_BIT_MASK);
5055     b(SET_RESULT);
5056 
5057   BIND(STUB);
5058     RuntimeAddress has_neg =  RuntimeAddress(StubRoutines::aarch64::has_negatives());
5059     assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5060     trampoline_call(has_neg);
5061     b(DONE);
5062 
5063   BIND(STUB_LONG);
5064     RuntimeAddress has_neg_long =  RuntimeAddress(
5065             StubRoutines::aarch64::has_negatives_long());
5066     assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
5067     trampoline_call(has_neg_long);
5068     b(DONE);
5069 
5070   BIND(SET_RESULT);
5071     cset(result, NE); // set true or false
5072 
5073   BIND(DONE);
5074 }
5075 
5076 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5077                                    Register tmp4, Register tmp5, Register result,
5078                                    Register cnt1, int elem_size) {
5079   Label DONE, SAME;
5080   Register tmp1 = rscratch1;
5081   Register tmp2 = rscratch2;
5082   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5083   int elem_per_word = wordSize/elem_size;
5084   int log_elem_size = exact_log2(elem_size);
5085   int length_offset = arrayOopDesc::length_offset_in_bytes();
5086   int base_offset
5087     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5088   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5089 
5090   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5091   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5092 
5093 #ifndef PRODUCT
5094   {
5095     const char kind = (elem_size == 2) ? 'U' : 'L';
5096     char comment[64];
5097     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5098     BLOCK_COMMENT(comment);
5099   }
5100 #endif
5101 
5102   // if (a1 == a2)
5103   //     return true;
5104   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5105   br(EQ, SAME);
5106 
5107   if (UseSimpleArrayEquals) {
5108     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5109     // if (a1 == null || a2 == null)
5110     //     return false;
    // a1 & a2 == 0 means (some pointer is null) or
    // (very rare, practically impossible pointer values),
    // so we can save one branch in most cases
5114     tst(a1, a2);
5115     mov(result, false);
5116     br(EQ, A_MIGHT_BE_NULL);
5117     // if (a1.length != a2.length)
5118     //      return false;
5119     bind(A_IS_NOT_NULL);
5120     ldrw(cnt1, Address(a1, length_offset));
5121     ldrw(cnt2, Address(a2, length_offset));
5122     eorw(tmp5, cnt1, cnt2);
5123     cbnzw(tmp5, DONE);
5124     lea(a1, Address(a1, base_offset));
5125     lea(a2, Address(a2, base_offset));
    // Check for short arrays, i.e. smaller than wordSize.
5127     subs(cnt1, cnt1, elem_per_word);
5128     br(Assembler::LT, SHORT);
5129     // Main 8 byte comparison loop.
5130     bind(NEXT_WORD); {
5131       ldr(tmp1, Address(post(a1, wordSize)));
5132       ldr(tmp2, Address(post(a2, wordSize)));
5133       subs(cnt1, cnt1, elem_per_word);
5134       eor(tmp5, tmp1, tmp2);
5135       cbnz(tmp5, DONE);
5136     } br(GT, NEXT_WORD);
5137     // Last longword.  In the case where length == 4 we compare the
5138     // same longword twice, but that's still faster than another
5139     // conditional branch.
5140     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5141     // length == 4.
5142     if (log_elem_size > 0)
5143       lsl(cnt1, cnt1, log_elem_size);
5144     ldr(tmp3, Address(a1, cnt1));
5145     ldr(tmp4, Address(a2, cnt1));
5146     eor(tmp5, tmp3, tmp4);
5147     cbnz(tmp5, DONE);
5148     b(SAME);
5149     bind(A_MIGHT_BE_NULL);
5150     // in case both a1 and a2 are not-null, proceed with loads
5151     cbz(a1, DONE);
5152     cbz(a2, DONE);
5153     b(A_IS_NOT_NULL);
5154     bind(SHORT);
5155 
5156     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5157     {
5158       ldrw(tmp1, Address(post(a1, 4)));
5159       ldrw(tmp2, Address(post(a2, 4)));
5160       eorw(tmp5, tmp1, tmp2);
5161       cbnzw(tmp5, DONE);
5162     }
5163     bind(TAIL03);
5164     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5165     {
5166       ldrh(tmp3, Address(post(a1, 2)));
5167       ldrh(tmp4, Address(post(a2, 2)));
5168       eorw(tmp5, tmp3, tmp4);
5169       cbnzw(tmp5, DONE);
5170     }
5171     bind(TAIL01);
5172     if (elem_size == 1) { // Only needed when comparing byte arrays.
5173       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5174       {
5175         ldrb(tmp1, a1);
5176         ldrb(tmp2, a2);
5177         eorw(tmp5, tmp1, tmp2);
5178         cbnzw(tmp5, DONE);
5179       }
5180     }
5181   } else {
5182     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
5183         CSET_EQ, LAST_CHECK;
5184     mov(result, false);
5185     cbz(a1, DONE);
5186     ldrw(cnt1, Address(a1, length_offset));
5187     cbz(a2, DONE);
5188     ldrw(cnt2, Address(a2, length_offset));
    // on most CPUs a2 is, surprisingly, still busy ("locked") from the ldrw
    // above, so it's faster to perform another branch before comparing a1 and a2
5191     cmp(cnt1, elem_per_word);
5192     br(LE, SHORT); // short or same
5193     ldr(tmp3, Address(pre(a1, base_offset)));
5194     cmp(cnt1, stubBytesThreshold);
5195     br(GE, STUB);
5196     ldr(tmp4, Address(pre(a2, base_offset)));
5197     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5198     cmp(cnt2, cnt1);
5199     br(NE, DONE);
5200 
5201     // Main 16 byte comparison loop with 2 exits
5202     bind(NEXT_DWORD); {
5203       ldr(tmp1, Address(pre(a1, wordSize)));
5204       ldr(tmp2, Address(pre(a2, wordSize)));
5205       subs(cnt1, cnt1, 2 * elem_per_word);
5206       br(LE, TAIL);
5207       eor(tmp4, tmp3, tmp4);
5208       cbnz(tmp4, DONE);
5209       ldr(tmp3, Address(pre(a1, wordSize)));
5210       ldr(tmp4, Address(pre(a2, wordSize)));
5211       cmp(cnt1, elem_per_word);
5212       br(LE, TAIL2);
5213       cmp(tmp1, tmp2);
5214     } br(EQ, NEXT_DWORD);
5215     b(DONE);
5216 
5217     bind(TAIL);
5218     eor(tmp4, tmp3, tmp4);
5219     eor(tmp2, tmp1, tmp2);
5220     lslv(tmp2, tmp2, tmp5);
5221     orr(tmp5, tmp4, tmp2);
5222     cmp(tmp5, zr);
5223     b(CSET_EQ);
5224 
5225     bind(TAIL2);
5226     eor(tmp2, tmp1, tmp2);
5227     cbnz(tmp2, DONE);
5228     b(LAST_CHECK);
5229 
5230     bind(STUB);
5231     ldr(tmp4, Address(pre(a2, base_offset)));
5232     cmp(cnt2, cnt1);
5233     br(NE, DONE);
5234     if (elem_size == 2) { // convert to byte counter
5235       lsl(cnt1, cnt1, 1);
5236     }
5237     eor(tmp5, tmp3, tmp4);
5238     cbnz(tmp5, DONE);
5239     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5240     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5241     trampoline_call(stub);
5242     b(DONE);
5243 
5244     bind(EARLY_OUT);
    // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
    // so, if a2 == null => return false (0); else return true, so we can return a2
5247     mov(result, a2);
5248     b(DONE);
5249     bind(SHORT);
5250     cmp(cnt2, cnt1);
5251     br(NE, DONE);
5252     cbz(cnt1, SAME);
5253     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5254     ldr(tmp3, Address(a1, base_offset));
5255     ldr(tmp4, Address(a2, base_offset));
5256     bind(LAST_CHECK);
5257     eor(tmp4, tmp3, tmp4);
5258     lslv(tmp5, tmp4, tmp5);
5259     cmp(tmp5, zr);
5260     bind(CSET_EQ);
5261     cset(result, EQ);
5262     b(DONE);
5263   }
5264 
5265   bind(SAME);
5266   mov(result, true);
5267   // That's it.
5268   bind(DONE);
5269 
5270   BLOCK_COMMENT("} array_equals");
5271 }
5272 
5273 // Compare Strings
5274 
5275 // For Strings we're passed the address of the first characters in a1
5276 // and a2 and the length in cnt1.
5277 // elem_size is the element size in bytes: either 1 or 2.
5278 // There are two implementations.  For arrays >= 8 bytes, all
5279 // comparisons (including the final one, which may overlap) are
5280 // performed 8 bytes at a time.  For strings < 8 bytes, we compare a
5281 // halfword, then a short, and then a byte.
5282 
5283 void MacroAssembler::string_equals(Register a1, Register a2,
5284                                    Register result, Register cnt1, int elem_size)
5285 {
5286   Label SAME, DONE, SHORT, NEXT_WORD;
5287   Register tmp1 = rscratch1;
5288   Register tmp2 = rscratch2;
5289   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5290 
  assert(elem_size == 1 || elem_size == 2, "must be 1 or 2 bytes");
5292   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5293 
5294 #ifndef PRODUCT
5295   {
5296     const char kind = (elem_size == 2) ? 'U' : 'L';
5297     char comment[64];
5298     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5299     BLOCK_COMMENT(comment);
5300   }
5301 #endif
5302 
5303   mov(result, false);
5304 
5305   // Check for short strings, i.e. smaller than wordSize.
5306   subs(cnt1, cnt1, wordSize);
5307   br(Assembler::LT, SHORT);
5308   // Main 8 byte comparison loop.
5309   bind(NEXT_WORD); {
5310     ldr(tmp1, Address(post(a1, wordSize)));
5311     ldr(tmp2, Address(post(a2, wordSize)));
5312     subs(cnt1, cnt1, wordSize);
5313     eor(tmp1, tmp1, tmp2);
5314     cbnz(tmp1, DONE);
5315   } br(GT, NEXT_WORD);
5316   // Last longword.  In the case where length == 4 we compare the
5317   // same longword twice, but that's still faster than another
5318   // conditional branch.
5319   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5320   // length == 4.
5321   ldr(tmp1, Address(a1, cnt1));
5322   ldr(tmp2, Address(a2, cnt1));
5323   eor(tmp2, tmp1, tmp2);
5324   cbnz(tmp2, DONE);
5325   b(SAME);
5326 
5327   bind(SHORT);
5328   Label TAIL03, TAIL01;
5329 
5330   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5331   {
5332     ldrw(tmp1, Address(post(a1, 4)));
5333     ldrw(tmp2, Address(post(a2, 4)));
5334     eorw(tmp1, tmp1, tmp2);
5335     cbnzw(tmp1, DONE);
5336   }
5337   bind(TAIL03);
5338   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5339   {
5340     ldrh(tmp1, Address(post(a1, 2)));
5341     ldrh(tmp2, Address(post(a2, 2)));
5342     eorw(tmp1, tmp1, tmp2);
5343     cbnzw(tmp1, DONE);
5344   }
5345   bind(TAIL01);
5346   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5347     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5348     {
5349       ldrb(tmp1, a1);
5350       ldrb(tmp2, a2);
5351       eorw(tmp1, tmp1, tmp2);
5352       cbnzw(tmp1, DONE);
5353     }
5354   }
5355   // Arrays are equal.
5356   bind(SAME);
5357   mov(result, true);
5358 
5359   // That's it.
5360   bind(DONE);
5361   BLOCK_COMMENT("} string_equals");
5362 }
5363 
5364 
5365 // The size of the blocks erased by the zero_blocks stub.  We must
5366 // handle anything smaller than this ourselves in zero_words().
5367 const int MacroAssembler::zero_words_block_size = 8;
5368 
5369 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5370 // possible, handling small word counts locally and delegating
5371 // anything larger to the zero_blocks stub.  It is expanded many times
5372 // in compiled code, so it is important to keep it short.
5373 
5374 // ptr:   Address of a buffer to be zeroed.
5375 // cnt:   Count in HeapWords.
5376 //
5377 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5378 void MacroAssembler::zero_words(Register ptr, Register cnt)
5379 {
5380   assert(is_power_of_2(zero_words_block_size), "adjust this");
5381   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5382 
5383   BLOCK_COMMENT("zero_words {");
5384   cmp(cnt, zero_words_block_size);
5385   Label around, done, done16;
5386   br(LO, around);
5387   {
5388     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5389     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5390     if (StubRoutines::aarch64::complete()) {
5391       trampoline_call(zero_blocks);
5392     } else {
5393       bl(zero_blocks);
5394     }
5395   }
5396   bind(around);
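  // cnt is now 0..7. Zero the remainder by testing each bit of cnt: a set
  // bit b (b >= 1) is handled with 2^(b-1) stp instructions, and a set
  // bit 0 with a final single str.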
5397   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5398     Label l;
5399     tbz(cnt, exact_log2(i), l);
5400     for (int j = 0; j < i; j += 2) {
5401       stp(zr, zr, post(ptr, 16));
5402     }
5403     bind(l);
5404   }
5405   {
5406     Label l;
5407     tbz(cnt, 0, l);
5408     str(zr, Address(ptr));
5409     bind(l);
5410   }
5411   BLOCK_COMMENT("} zero_words");
5412 }
5413 
5414 // base:         Address of a buffer to be zeroed, 8 bytes aligned.
5415 // cnt:          Immediate count in HeapWords.
5416 #define SmallArraySize (18 * BytesPerLong)
5417 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
5418 {
5419   BLOCK_COMMENT("zero_words {");
5420   int i = cnt & 1;  // store any odd word to start
5421   if (i) str(zr, Address(base));
5422 
5423   if (cnt <= SmallArraySize / BytesPerLong) {
5424     for (; i < (int)cnt; i += 2)
5425       stp(zr, zr, Address(base, i * wordSize));
5426   } else {
5427     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
5428     int remainder = cnt % (2 * unroll);
5429     for (; i < remainder; i += 2)
5430       stp(zr, zr, Address(base, i * wordSize));
5431 
5432     Label loop;
5433     Register cnt_reg = rscratch1;
5434     Register loop_base = rscratch2;
5435     cnt = cnt - remainder;
5436     mov(cnt_reg, cnt);
5437     // adjust base and prebias by -2 * wordSize so we can pre-increment
5438     add(loop_base, base, (remainder - 2) * wordSize);
5439     bind(loop);
5440     sub(cnt_reg, cnt_reg, 2 * unroll);
5441     for (i = 1; i < unroll; i++)
5442       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
5443     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
5444     cbnz(cnt_reg, loop);
5445   }
5446   BLOCK_COMMENT("} zero_words");
5447 }
5448 
5449 // Zero blocks of memory by using DC ZVA.
5450 //
// Aligns the base address first sufficiently for DC ZVA, then uses
5452 // DC ZVA repeatedly for every full block.  cnt is the size to be
5453 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5454 // in cnt.
5455 //
5456 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5457 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
5458 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5459   Register tmp = rscratch1;
5460   Register tmp2 = rscratch2;
5461   int zva_length = VM_Version::zva_length();
5462   Label initial_table_end, loop_zva;
5463   Label fini;
5464 
  // Base must be 16-byte aligned. If not, just return and let the caller handle it.
5466   tst(base, 0x0f);
5467   br(Assembler::NE, fini);
5468   // Align base with ZVA length.
5469   neg(tmp, base);
5470   andr(tmp, tmp, zva_length - 1);
5471 
5472   // tmp: the number of bytes to be filled to align the base with ZVA length.
5473   add(base, base, tmp);
5474   sub(cnt, cnt, tmp, Assembler::ASR, 3);
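  // Computed branch into the table of stp instructions below: each stp
  // zeroes 16 bytes and occupies 4 bytes of code, so backing up
  // tmp >> 2 bytes from initial_table_end executes exactly the tmp / 16
  // stores needed to reach ZVA alignment.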
5475   adr(tmp2, initial_table_end);
5476   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5477   br(tmp2);
5478 
5479   for (int i = -zva_length + 16; i < 0; i += 16)
5480     stp(zr, zr, Address(base, i));
5481   bind(initial_table_end);
5482 
5483   sub(cnt, cnt, zva_length >> 3);
5484   bind(loop_zva);
5485   dc(Assembler::ZVA, base);
5486   subs(cnt, cnt, zva_length >> 3);
5487   add(base, base, zva_length);
5488   br(Assembler::GE, loop_zva);
5489   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5490   bind(fini);
5491 }
5492 
5493 // base:   Address of a buffer to be filled, 8 bytes aligned.
// cnt:    Count in 8-byte units.
// value:  Value to fill the buffer with.
5496 // base will point to the end of the buffer after filling.
5497 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5498 {
5499 //  Algorithm:
5500 //
5501 //    scratch1 = cnt & 7;
5502 //    cnt -= scratch1;
5503 //    p += scratch1;
5504 //    switch (scratch1) {
5505 //      do {
5506 //        cnt -= 8;
5507 //          p[-8] = v;
5508 //        case 7:
5509 //          p[-7] = v;
5510 //        case 6:
5511 //          p[-6] = v;
5512 //          // ...
5513 //        case 1:
5514 //          p[-1] = v;
5515 //        case 0:
5516 //          p += 8;
5517 //      } while (cnt);
5518 //    }
5519 
5520   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5521 
5522   Label fini, skip, entry, loop;
5523   const int unroll = 8; // Number of stp instructions we'll unroll
5524 
5525   cbz(cnt, fini);
5526   tbz(base, 3, skip);
5527   str(value, Address(post(base, 8)));
5528   sub(cnt, cnt, 1);
5529   bind(skip);
5530 
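  // Computed entry into the unrolled loop below (the "switch" of the
  // algorithm above): rscratch1 is the remainder in words (0..14, always
  // even), and each stp covers 2 words in 4 bytes of code, hence the
  // branch target entry - rscratch1 * 2.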
5531   andr(rscratch1, cnt, (unroll-1) * 2);
5532   sub(cnt, cnt, rscratch1);
5533   add(base, base, rscratch1, Assembler::LSL, 3);
5534   adr(rscratch2, entry);
5535   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5536   br(rscratch2);
5537 
5538   bind(loop);
5539   add(base, base, unroll * 16);
5540   for (int i = -unroll; i < 0; i++)
5541     stp(value, value, Address(base, i * 16));
5542   bind(entry);
5543   subs(cnt, cnt, unroll * 2);
5544   br(Assembler::GE, loop);
5545 
5546   tbz(cnt, 0, fini);
5547   str(value, Address(post(base, 8)));
5548   bind(fini);
5549 }
5550 
5551 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5552 // java/lang/StringUTF16.compress.
5553 void MacroAssembler::encode_iso_array(Register src, Register dst,
5554                       Register len, Register result,
5555                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5556                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5557 {
5558     Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
5559         NEXT_32_START, NEXT_32_PRFM_START;
5560     Register tmp1 = rscratch1, tmp2 = rscratch2;
5561 
5562       mov(result, len); // Save initial len
5563 
5564       cmp(len, 8); // handle shortest strings first
5565       br(LT, LOOP_1);
5566       cmp(len, 32);
5567       br(LT, NEXT_8);
5568       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
5569       // to convert chars to bytes
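      // uzp1 concatenates the even-indexed (low) bytes of each 16-bit
      // char, i.e. the Latin-1 values; uzp2 gathers the odd-indexed
      // (high) bytes, which must all be zero if the chars fit in
      // ISO-8859-1.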
5570       if (SoftwarePrefetchHintDistance >= 0) {
5571         ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5572         cmp(len, SoftwarePrefetchHintDistance/2 + 16);
5573         br(LE, NEXT_32_START);
5574         b(NEXT_32_PRFM_START);
5575         BIND(NEXT_32_PRFM);
5576           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5577         BIND(NEXT_32_PRFM_START);
5578           prfm(Address(src, SoftwarePrefetchHintDistance));
5579           orr(v4, T16B, Vtmp1, Vtmp2);
5580           orr(v5, T16B, Vtmp3, Vtmp4);
5581           uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
5582           uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
5583           uzp2(v5, T16B, v4, v5); // high bytes
5584           umov(tmp2, v5, D, 1);
5585           fmovd(tmp1, v5);
5586           orr(tmp1, tmp1, tmp2);
5587           cbnz(tmp1, LOOP_8);
5588           stpq(Vtmp1, Vtmp3, dst);
5589           sub(len, len, 32);
5590           add(dst, dst, 32);
5591           add(src, src, 64);
5592           cmp(len, SoftwarePrefetchHintDistance/2 + 16);
5593           br(GE, NEXT_32_PRFM);
5594           cmp(len, 32);
5595           br(LT, LOOP_8);
5596         BIND(NEXT_32);
5597           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5598         BIND(NEXT_32_START);
5599       } else {
5600         BIND(NEXT_32);
5601           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5602       }
5603       prfm(Address(src, SoftwarePrefetchHintDistance));
5604       uzp1(v4, T16B, Vtmp1, Vtmp2);
5605       uzp1(v5, T16B, Vtmp3, Vtmp4);
5606       orr(Vtmp1, T16B, Vtmp1, Vtmp2);
5607       orr(Vtmp3, T16B, Vtmp3, Vtmp4);
5608       uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
5609       umov(tmp2, Vtmp1, D, 1);
5610       fmovd(tmp1, Vtmp1);
5611       orr(tmp1, tmp1, tmp2);
5612       cbnz(tmp1, LOOP_8);
5613       stpq(v4, v5, dst);
5614       sub(len, len, 32);
5615       add(dst, dst, 32);
5616       add(src, src, 64);
5617       cmp(len, 32);
5618       br(GE, NEXT_32);
5619       cbz(len, DONE);
5620 
5621     BIND(LOOP_8);
5622       cmp(len, 8);
5623       br(LT, LOOP_1);
5624     BIND(NEXT_8);
5625       ld1(Vtmp1, T8H, src);
5626       uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
5627       uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
5628       fmovd(tmp1, Vtmp3);
5629       cbnz(tmp1, NEXT_1);
5630       strd(Vtmp2, dst);
5631 
5632       sub(len, len, 8);
5633       add(dst, dst, 8);
5634       add(src, src, 16);
5635       cmp(len, 8);
5636       br(GE, NEXT_8);
5637 
5638     BIND(LOOP_1);
5639 
5640     cbz(len, DONE);
5641     BIND(NEXT_1);
5642       ldrh(tmp1, Address(post(src, 2)));
5643       tst(tmp1, 0xff00);
5644       br(NE, SET_RESULT);
5645       strb(tmp1, Address(post(dst, 1)));
5646       subs(len, len, 1);
5647       br(GT, NEXT_1);
5648 
5649     BIND(SET_RESULT);
      sub(result, result, len); // Return the index where we stopped;
                                // this equals the full length when all
                                // characters were processed
5653     BIND(DONE);
5654 }
5655 
5656 
5657 // Inflate byte[] array to char[].
5658 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5659                                         FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
5660                                         Register tmp4) {
5661   Label big, done, after_init, to_stub;
5662 
5663   assert_different_registers(src, dst, len, tmp4, rscratch1);
5664 
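  // vtmp1 stays zero throughout; zip1 with it interleaves each source
  // byte with a zero byte, widening Latin-1 bytes to UTF-16 chars.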
5665   fmovd(vtmp1, zr);
5666   lsrw(tmp4, len, 3);
5667   bind(after_init);
5668   cbnzw(tmp4, big);
5669   // Short string: less than 8 bytes.
5670   {
5671     Label loop, tiny;
5672 
5673     cmpw(len, 4);
5674     br(LT, tiny);
5675     // Use SIMD to do 4 bytes.
5676     ldrs(vtmp2, post(src, 4));
5677     zip1(vtmp3, T8B, vtmp2, vtmp1);
5678     subw(len, len, 4);
5679     strd(vtmp3, post(dst, 8));
5680 
5681     cbzw(len, done);
5682 
    // Do the remaining bytes one at a time.
5684     bind(loop);
5685     ldrb(tmp4, post(src, 1));
5686     strh(tmp4, post(dst, 2));
5687     subw(len, len, 1);
5688 
5689     bind(tiny);
5690     cbnz(len, loop);
5691 
5692     b(done);
5693   }
5694 
5695   if (SoftwarePrefetchHintDistance >= 0) {
5696     bind(to_stub);
5697       RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5698       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5699       trampoline_call(stub);
5700       b(after_init);
5701   }
5702 
5703   // Unpack the bytes 8 at a time.
5704   bind(big);
5705   {
5706     Label loop, around, loop_last, loop_start;
5707 
5708     if (SoftwarePrefetchHintDistance >= 0) {
5709       const int large_loop_threshold = (64 + 16)/8;
5710       ldrd(vtmp2, post(src, 8));
5711       andw(len, len, 7);
5712       cmp(tmp4, large_loop_threshold);
5713       br(GE, to_stub);
5714       b(loop_start);
5715 
5716       bind(loop);
5717       ldrd(vtmp2, post(src, 8));
5718       bind(loop_start);
5719       subs(tmp4, tmp4, 1);
5720       br(EQ, loop_last);
5721       zip1(vtmp2, T16B, vtmp2, vtmp1);
5722       ldrd(vtmp3, post(src, 8));
5723       st1(vtmp2, T8H, post(dst, 16));
5724       subs(tmp4, tmp4, 1);
5725       zip1(vtmp3, T16B, vtmp3, vtmp1);
5726       st1(vtmp3, T8H, post(dst, 16));
5727       br(NE, loop);
5728       b(around);
5729       bind(loop_last);
5730       zip1(vtmp2, T16B, vtmp2, vtmp1);
5731       st1(vtmp2, T8H, post(dst, 16));
5732       bind(around);
5733       cbz(len, done);
5734     } else {
5735       andw(len, len, 7);
5736       bind(loop);
5737       ldrd(vtmp2, post(src, 8));
5738       sub(tmp4, tmp4, 1);
5739       zip1(vtmp3, T16B, vtmp2, vtmp1);
5740       st1(vtmp3, T8H, post(dst, 16));
5741       cbnz(tmp4, loop);
5742     }
5743   }
5744 
5745   // Do the tail of up to 8 bytes.
5746   add(src, src, len);
5747   ldrd(vtmp3, Address(src, -8));
5748   add(dst, dst, len, ext::uxtw, 1);
5749   zip1(vtmp3, T16B, vtmp3, vtmp1);
5750   strq(vtmp3, Address(dst, -16));
5751 
5752   bind(done);
5753 }
5754 
5755 // Compress char[] array to byte[].
5756 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5757                                          FloatRegister tmp1Reg, FloatRegister tmp2Reg,
5758                                          FloatRegister tmp3Reg, FloatRegister tmp4Reg,
5759                                          Register result) {
5760   encode_iso_array(src, dst, len, result,
5761                    tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
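  // encode_iso_array decrements len to 0 only if every char was
  // compressed; StringUTF16.compress must return 0 on failure, so keep
  // the result only when the whole input was processed.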
5762   cmp(len, zr);
5763   csel(result, result, zr, EQ);
5764 }
5765 
// get_thread() can be called anywhere inside generated code, so we
// need to save whatever non-callee-save context might get clobbered
// by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
// the call setup code.
5770 //
5771 // aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5772 //
5773 void MacroAssembler::get_thread(Register dst) {
5774   RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
5775   push(saved_regs, sp);
5776 
5777   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5778   blr(lr);
5779   if (dst != c_rarg0) {
5780     mov(dst, c_rarg0);
5781   }
5782 
5783   pop(saved_regs, sp);
5784 }