/*
 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
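// For example, BIND(retry) binds the label "retry" and, outside PRODUCT
// builds, emits the block comment "retry:" into the disassembly.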

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target - branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      uint64_t dest = (uint64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
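      // Hypothetical example: branch == 0x10000ff0 and target == 0x10400123
      // give pc_page == 0x10000 and adr_page == 0x10400, so the adrp
      // immediate patched below is 0x400 pages, and offset_lo == 0x123 for a
      // trailing ldr/str/add.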
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
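    // adr/adrp encode their immediate split across immlo (bits 30:29) and
    // immhi (bits 23:5); patch both fields.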
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
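  // For example, the narrow OOP 0x12345678 is encoded as
  //   movz Rx, #0x1234, lsl #16
  //   movk Rx, #0x5678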
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
               Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  dsb(Assembler::SY);
}

void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp and sp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {
  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  if (last_java_pc != NULL) {
    adr(scratch, last_java_pc);
  } else {
    // FIXME: This is almost never correct.  We should delete all
    // cases of set_last_Java_frame with last_java_pc=NULL and use the
    // correct return address instead.
    adr(scratch, pc());
  }

  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4GB (asserted above), which is
    // within the +/-4GB reach of ADRP.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4GB (asserted above), which is
    // within the +/-4GB reach of ADRP.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}
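
// Note: b/bl reach only +/-128MB, so far_branches() selects the
// adrp/add + br/blr sequence above whenever the code cache may span further.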

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result, "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  unsigned int start_offset = offset();
  if (far_branches() && !Compile::current()->in_scratch_emit_size()) {
    address stub = emit_trampoline_stub(start_offset, entry.target());
    if (stub == NULL) {
      return NULL; // CodeCache is full
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)
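//
// The stub emitted below is two instructions plus an 8-byte literal:
//   ldr  rscratch1, <dest>   ; pc-relative load of the 64-bit target
//   br   rscratch1
//   <dest>: the 8-byte destination address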

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  address stub = start_a_stub(Compile::MAX_stubs_size/2);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

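// For example, align(16) pads with nops until offset() is a multiple of 16.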
void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


void MacroAssembler::notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler::notify(type);
    // reset_last_Java_frame(true);
  } else {
    Assembler::notify(type);
  }
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();
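  // For example, a constant vtable_index of 2 resolves to
  // recv_klass + vtable_start_offset + 2 * wordSize + method_offset_in_bytes.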

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    cmp(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer-sized words at [addr] for an occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

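// repne_scan is used by check_klass_subtype_slow_path below to scan the
// secondary-supers array.
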
// scans count 4 byte words at [addr] for an occurrence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                 Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  mov(r0, reg);
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    lea(r0, addr);
    ldr(r0, Address(r0, 4 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop_addr");
}

Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  if (arg_slot.is_constant()) {
1357     return Address(esp, arg_slot.as_constant() * stackElementSize
1358                    + offset);
1359   } else {
1360     add(rscratch1, esp, arg_slot.as_register(),
1361         ext::uxtx, exact_log2(stackElementSize));
1362     return Address(rscratch1, offset);
1363   }
1364 }
1365 
1366 void MacroAssembler::call_VM_leaf_base(address entry_point,
1367                                        int number_of_arguments,
1368                                        Label *retaddr) {
1369   call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
1370 }
1371 
1372 void MacroAssembler::call_VM_leaf_base1(address entry_point,
1373                                         int number_of_gp_arguments,
1374                                         int number_of_fp_arguments,
1375                                         ret_type type,
1376                                         Label *retaddr) {
1377   Label E, L;
1378 
1379   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1380 
  // We add 1 to number_of_gp_arguments because the thread in arg0 is
  // not counted
1383   mov(rscratch1, entry_point);
1384   blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1385   if (retaddr)
1386     bind(*retaddr);
1387 
1388   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1389   maybe_isb();
1390 }
1391 
1392 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1393   call_VM_leaf_base(entry_point, number_of_arguments);
1394 }
1395 
1396 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1397   pass_arg0(this, arg_0);
1398   call_VM_leaf_base(entry_point, 1);
1399 }
1400 
1401 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1402   pass_arg0(this, arg_0);
1403   pass_arg1(this, arg_1);
1404   call_VM_leaf_base(entry_point, 2);
1405 }
1406 
1407 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1408                                   Register arg_1, Register arg_2) {
1409   pass_arg0(this, arg_0);
1410   pass_arg1(this, arg_1);
1411   pass_arg2(this, arg_2);
1412   call_VM_leaf_base(entry_point, 3);
1413 }
1414 
1415 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1416   pass_arg0(this, arg_0);
1417   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1418 }
1419 
1420 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1421 
1422   assert(arg_0 != c_rarg1, "smashed arg");
1423   pass_arg1(this, arg_1);
1424   pass_arg0(this, arg_0);
1425   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1426 }
1427 
1428 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1429   assert(arg_0 != c_rarg2, "smashed arg");
1430   assert(arg_1 != c_rarg2, "smashed arg");
1431   pass_arg2(this, arg_2);
1432   assert(arg_0 != c_rarg1, "smashed arg");
1433   pass_arg1(this, arg_1);
1434   pass_arg0(this, arg_0);
1435   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1436 }
1437 
1438 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1439   assert(arg_0 != c_rarg3, "smashed arg");
1440   assert(arg_1 != c_rarg3, "smashed arg");
1441   assert(arg_2 != c_rarg3, "smashed arg");
1442   pass_arg3(this, arg_3);
1443   assert(arg_0 != c_rarg2, "smashed arg");
1444   assert(arg_1 != c_rarg2, "smashed arg");
1445   pass_arg2(this, arg_2);
1446   assert(arg_0 != c_rarg1, "smashed arg");
1447   pass_arg1(this, arg_1);
1448   pass_arg0(this, arg_0);
1449   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1450 }
1451 
1452 void MacroAssembler::null_check(Register reg, int offset) {
1453   if (needs_explicit_null_check(offset)) {
    // provoke an OS NULL exception if reg is NULL by
    // accessing M[reg] w/o changing any registers
    // NOTE: this is plenty to provoke a segv
1457     ldr(zr, Address(reg));
1458   } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke an OS NULL exception if reg is NULL
1461   }
1462 }
1463 
1464 // MacroAssembler protected routines needed to implement
1465 // public methods
1466 
1467 void MacroAssembler::mov(Register r, Address dest) {
1468   code_section()->relocate(pc(), dest.rspec());
1469   u_int64_t imm64 = (u_int64_t)dest.target();
1470   movptr(r, imm64);
1471 }
1472 
1473 // Move a constant pointer into r.  In AArch64 mode the virtual
1474 // address space is 48 bits in size, so we only need three
1475 // instructions to create a patchable instruction sequence that can
1476 // reach anywhere.
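// For example, movptr(r0, 0x0000123456789abc) expands to
//   movz r0, #0x9abc
//   movk r0, #0x5678, lsl #16
//   movk r0, #0x1234, lsl #32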
1477 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1478 #ifndef PRODUCT
1479   {
1480     char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1482     block_comment(buffer);
1483   }
1484 #endif
1485   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1486   movz(r, imm64 & 0xffff);
1487   imm64 >>= 16;
1488   movk(r, imm64 & 0xffff, 16);
1489   imm64 >>= 16;
1490   movk(r, imm64 & 0xffff, 32);
1491 }
1492 
// Macro to move a replicated immediate into a vector register.
1494 //  Vd will get the following values for different arrangements in T
1495 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1496 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1497 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1498 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1499 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1500 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1501 //   T1D/T2D: invalid
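// For example, with T4S and imm32 == hex ffffff00 the code below picks
// the complemented form: ~imm32 == hex 000000ff has a single non-zero
// byte, so one mvni(Vd, T4S, 0xff, 0) suffices, whereas building the
// value directly would take a movi followed by two orri instructions.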
1502 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1503   assert(T != T1D && T != T2D, "invalid arrangement");
1504   if (T == T8B || T == T16B) {
1505     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1506     movi(Vd, T, imm32 & 0xff, 0);
1507     return;
1508   }
1509   u_int32_t nimm32 = ~imm32;
1510   if (T == T4H || T == T8H) {
1511     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1512     imm32 &= 0xffff;
1513     nimm32 &= 0xffff;
1514   }
1515   u_int32_t x = imm32;
1516   int movi_cnt = 0;
1517   int movn_cnt = 0;
1518   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1519   x = nimm32;
1520   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1521   if (movn_cnt < movi_cnt) imm32 = nimm32;
1522   unsigned lsl = 0;
1523   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1524   if (movn_cnt < movi_cnt)
1525     mvni(Vd, T, imm32 & 0xff, lsl);
1526   else
1527     movi(Vd, T, imm32 & 0xff, lsl);
1528   imm32 >>= 8; lsl += 8;
1529   while (imm32) {
1530     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1531     if (movn_cnt < movi_cnt)
1532       bici(Vd, T, imm32 & 0xff, lsl);
1533     else
1534       orri(Vd, T, imm32 & 0xff, lsl);
1535     lsl += 8; imm32 >>= 8;
1536   }
1537 }
1538 
1539 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1540 {
1541 #ifndef PRODUCT
1542   {
1543     char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1545     block_comment(buffer);
1546   }
1547 #endif
1548   if (operand_valid_for_logical_immediate(false, imm64)) {
1549     orr(dst, zr, imm64);
1550   } else {
1551     // we can use a combination of MOVZ or MOVN with
1552     // MOVK to build up the constant
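    // e.g. imm64 == 0x0000dead0000beef has two zero half-words, so it
    // takes just movz(dst, 0xbeef, 0) followed by movk(dst, 0xdead, 32)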
1553     u_int64_t imm_h[4];
1554     int zero_count = 0;
1555     int neg_count = 0;
1556     int i;
1557     for (i = 0; i < 4; i++) {
1558       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1559       if (imm_h[i] == 0) {
1560         zero_count++;
1561       } else if (imm_h[i] == 0xffffL) {
1562         neg_count++;
1563       }
1564     }
1565     if (zero_count == 4) {
1566       // one MOVZ will do
1567       movz(dst, 0);
1568     } else if (neg_count == 4) {
1569       // one MOVN will do
1570       movn(dst, 0);
1571     } else if (zero_count == 3) {
1572       for (i = 0; i < 4; i++) {
1573         if (imm_h[i] != 0L) {
1574           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1575           break;
1576         }
1577       }
1578     } else if (neg_count == 3) {
1579       // one MOVN will do
1580       for (int i = 0; i < 4; i++) {
1581         if (imm_h[i] != 0xffffL) {
1582           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1583           break;
1584         }
1585       }
1586     } else if (zero_count == 2) {
1587       // one MOVZ and one MOVK will do
1588       for (i = 0; i < 3; i++) {
1589         if (imm_h[i] != 0L) {
1590           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1591           i++;
1592           break;
1593         }
1594       }
1595       for (;i < 4; i++) {
1596         if (imm_h[i] != 0L) {
1597           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1598         }
1599       }
1600     } else if (neg_count == 2) {
1601       // one MOVN and one MOVK will do
1602       for (i = 0; i < 4; i++) {
1603         if (imm_h[i] != 0xffffL) {
1604           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1605           i++;
1606           break;
1607         }
1608       }
1609       for (;i < 4; i++) {
1610         if (imm_h[i] != 0xffffL) {
1611           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1612         }
1613       }
1614     } else if (zero_count == 1) {
1615       // one MOVZ and two MOVKs will do
1616       for (i = 0; i < 4; i++) {
1617         if (imm_h[i] != 0L) {
1618           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1619           i++;
1620           break;
1621         }
1622       }
1623       for (;i < 4; i++) {
1624         if (imm_h[i] != 0x0L) {
1625           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1626         }
1627       }
1628     } else if (neg_count == 1) {
1629       // one MOVN and two MOVKs will do
1630       for (i = 0; i < 4; i++) {
1631         if (imm_h[i] != 0xffffL) {
1632           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1633           i++;
1634           break;
1635         }
1636       }
1637       for (;i < 4; i++) {
1638         if (imm_h[i] != 0xffffL) {
1639           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1640         }
1641       }
1642     } else {
1643       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1644       movz(dst, (u_int32_t)imm_h[0], 0);
1645       for (i = 1; i < 4; i++) {
1646         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1647       }
1648     }
1649   }
1650 }
1651 
1652 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1653 {
1654 #ifndef PRODUCT
1655     {
1656       char buffer[64];
      snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1658       block_comment(buffer);
1659     }
1660 #endif
1661   if (operand_valid_for_logical_immediate(true, imm32)) {
1662     orrw(dst, zr, imm32);
1663   } else {
    // we can use a MOVZ or MOVN, possibly followed by one MOVK, to
    // build up the constant
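    // e.g. imm32 == 0xffff1234 needs only movnw(dst, 0xedcb, 0),
    // because the high half-word is all ones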
1666     u_int32_t imm_h[2];
1667     imm_h[0] = imm32 & 0xffff;
1668     imm_h[1] = ((imm32 >> 16) & 0xffff);
1669     if (imm_h[0] == 0) {
1670       movzw(dst, imm_h[1], 16);
1671     } else if (imm_h[0] == 0xffff) {
1672       movnw(dst, imm_h[1] ^ 0xffff, 16);
1673     } else if (imm_h[1] == 0) {
1674       movzw(dst, imm_h[0], 0);
1675     } else if (imm_h[1] == 0xffff) {
1676       movnw(dst, imm_h[0] ^ 0xffff, 0);
1677     } else {
1678       // use a MOVZ and MOVK (makes it easier to debug)
1679       movzw(dst, imm_h[0], 0);
1680       movkw(dst, imm_h[1], 16);
1681     }
1682   }
1683 }
1684 
1685 // Form an address from base + offset in Rd.  Rd may or may
1686 // not actually be used: you must use the Address that is returned.
1687 // It is up to you to ensure that the shift provided matches the size
1688 // of your data.
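// For example, with shift == 0 and byte_offset == 0x123456 (too big
// for a single immediate) we emit add(Rd, base, 0x123000) and return
// Address(Rd, 0x456).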
1689 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1690   if (Address::offset_ok_for_immed(byte_offset, shift))
1691     // It fits; no need for any heroics
1692     return Address(base, byte_offset);
1693 
1694   // Don't do anything clever with negative or misaligned offsets
1695   unsigned mask = (1 << shift) - 1;
1696   if (byte_offset < 0 || byte_offset & mask) {
1697     mov(Rd, byte_offset);
1698     add(Rd, base, Rd);
1699     return Address(Rd);
1700   }
1701 
1702   // See if we can do this with two 12-bit offsets
1703   {
1704     unsigned long word_offset = byte_offset >> shift;
1705     unsigned long masked_offset = word_offset & 0xfff000;
1706     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1707         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1708       add(Rd, base, masked_offset << shift);
1709       word_offset -= masked_offset;
1710       return Address(Rd, word_offset << shift);
1711     }
1712   }
1713 
1714   // Do it the hard way
1715   mov(Rd, byte_offset);
1716   add(Rd, base, Rd);
1717   return Address(Rd);
1718 }
1719 
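// Atomically increment the 32-bit counter at counter_addr, discarding
// the old value.  With LSE this is a single ldadd; otherwise we use an
// ldxrw/stxrw retry loop.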
1720 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1721   if (UseLSE) {
1722     mov(tmp, 1);
1723     ldadd(Assembler::word, tmp, zr, counter_addr);
1724     return;
1725   }
1726   Label retry_load;
1727   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1728     prfm(Address(counter_addr), PSTL1STRM);
1729   bind(retry_load);
1730   // flush and load exclusive from the memory location
1731   ldxrw(tmp, counter_addr);
1732   addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp will be zero
1734   stxrw(tmp2, tmp, counter_addr);
1735   cbnzw(tmp2, retry_load);
1736 }
1737 
1738 
1739 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1740                                     bool want_remainder, Register scratch)
1741 {
1742   // Full implementation of Java idiv and irem.  The function
1743   // returns the (pc) offset of the div instruction - may be needed
1744   // for implicit exceptions.
1745   //
1746   // constraint : ra/rb =/= scratch
1747   //         normal case
1748   //
1749   // input : ra: dividend
1750   //         rb: divisor
1751   //
1752   // result: either
1753   //         quotient  (= ra idiv rb)
1754   //         remainder (= ra irem rb)
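  //
  // When want_remainder is set, msubw computes result = ra - scratch * rb,
  // which matches Java irem semantics (the sign follows the dividend).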
1755 
1756   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1757 
1758   int idivl_offset = offset();
1759   if (! want_remainder) {
1760     sdivw(result, ra, rb);
1761   } else {
1762     sdivw(scratch, ra, rb);
1763     Assembler::msubw(result, scratch, rb, ra);
1764   }
1765 
1766   return idivl_offset;
1767 }
1768 
1769 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1770                                     bool want_remainder, Register scratch)
1771 {
1772   // Full implementation of Java ldiv and lrem.  The function
1773   // returns the (pc) offset of the div instruction - may be needed
1774   // for implicit exceptions.
1775   //
1776   // constraint : ra/rb =/= scratch
1777   //         normal case
1778   //
1779   // input : ra: dividend
1780   //         rb: divisor
1781   //
1782   // result: either
1783   //         quotient  (= ra idiv rb)
1784   //         remainder (= ra irem rb)
1785 
1786   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1787 
1788   int idivq_offset = offset();
1789   if (! want_remainder) {
1790     sdiv(result, ra, rb);
1791   } else {
1792     sdiv(scratch, ra, rb);
1793     Assembler::msub(result, scratch, rb, ra);
1794   }
1795 
1796   return idivq_offset;
1797 }
1798 
1799 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1800   address prev = pc() - NativeMembar::instruction_size;
1801   address last = code()->last_insn();
1802   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1803     NativeMembar *bar = NativeMembar_at(prev);
1804     // We are merging two memory barrier instructions.  On AArch64 we
1805     // can do this simply by ORing them together.
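    // For example, membar(StoreStore) immediately followed by
    // membar(LoadStore) collapses into a single barrier whose kind is
    // StoreStore|LoadStore.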
1806     bar->set_kind(bar->get_kind() | order_constraint);
1807     BLOCK_COMMENT("merged membar");
1808   } else {
1809     code()->set_last_insn(pc());
1810     dmb(Assembler::barrier(order_constraint));
1811   }
1812 }
1813 
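// Try to merge the load/store about to be emitted with the immediately
// preceding one, turning the pair into a single ldp/stp.  Returns true
// (and emits the merged instruction) on success; otherwise records the
// current instruction as a future merge candidate, when its addressing
// mode allows, and returns false.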
1814 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1815   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1816     merge_ldst(rt, adr, size_in_bytes, is_store);
1817     code()->clear_last_insn();
1818     return true;
1819   } else {
    assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8-byte or 4-byte loads/stores are supported.");
1821     const unsigned mask = size_in_bytes - 1;
1822     if (adr.getMode() == Address::base_plus_offset &&
1823         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1824       code()->set_last_insn(pc());
1825     }
1826     return false;
1827   }
1828 }
1829 
1830 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1831   // We always try to merge two adjacent loads into one ldp.
1832   if (!try_merge_ldst(Rx, adr, 8, false)) {
1833     Assembler::ldr(Rx, adr);
1834   }
1835 }
1836 
1837 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1838   // We always try to merge two adjacent loads into one ldp.
1839   if (!try_merge_ldst(Rw, adr, 4, false)) {
1840     Assembler::ldrw(Rw, adr);
1841   }
1842 }
1843 
1844 void MacroAssembler::str(Register Rx, const Address &adr) {
1845   // We always try to merge two adjacent stores into one stp.
1846   if (!try_merge_ldst(Rx, adr, 8, true)) {
1847     Assembler::str(Rx, adr);
1848   }
1849 }
1850 
1851 void MacroAssembler::strw(Register Rw, const Address &adr) {
1852   // We always try to merge two adjacent stores into one stp.
1853   if (!try_merge_ldst(Rw, adr, 4, true)) {
1854     Assembler::strw(Rw, adr);
1855   }
1856 }
1857 
1858 // MacroAssembler routines found actually to be needed
1859 
1860 void MacroAssembler::push(Register src)
1861 {
1862   str(src, Address(pre(esp, -1 * wordSize)));
1863 }
1864 
1865 void MacroAssembler::pop(Register dst)
1866 {
1867   ldr(dst, Address(post(esp, 1 * wordSize)));
1868 }
1869 
1870 // Note: load_unsigned_short used to be called load_unsigned_word.
1871 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1872   int off = offset();
1873   ldrh(dst, src);
1874   return off;
1875 }
1876 
1877 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1878   int off = offset();
1879   ldrb(dst, src);
1880   return off;
1881 }
1882 
1883 int MacroAssembler::load_signed_short(Register dst, Address src) {
1884   int off = offset();
1885   ldrsh(dst, src);
1886   return off;
1887 }
1888 
1889 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1890   int off = offset();
1891   ldrsb(dst, src);
1892   return off;
1893 }
1894 
1895 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1896   int off = offset();
1897   ldrshw(dst, src);
1898   return off;
1899 }
1900 
1901 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1902   int off = offset();
1903   ldrsbw(dst, src);
1904   return off;
1905 }
1906 
1907 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1908   switch (size_in_bytes) {
1909   case  8:  ldr(dst, src); break;
1910   case  4:  ldrw(dst, src); break;
1911   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1912   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1913   default:  ShouldNotReachHere();
1914   }
1915 }
1916 
1917 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1918   switch (size_in_bytes) {
1919   case  8:  str(src, dst); break;
1920   case  4:  strw(src, dst); break;
1921   case  2:  strh(src, dst); break;
1922   case  1:  strb(src, dst); break;
1923   default:  ShouldNotReachHere();
1924   }
1925 }
1926 
1927 void MacroAssembler::decrementw(Register reg, int value)
1928 {
1929   if (value < 0)  { incrementw(reg, -value);      return; }
1930   if (value == 0) {                               return; }
1931   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1932   /* else */ {
1933     guarantee(reg != rscratch2, "invalid dst for register decrement");
1934     movw(rscratch2, (unsigned)value);
1935     subw(reg, reg, rscratch2);
1936   }
1937 }
1938 
1939 void MacroAssembler::decrement(Register reg, int value)
1940 {
1941   if (value < 0)  { increment(reg, -value);      return; }
1942   if (value == 0) {                              return; }
1943   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1944   /* else */ {
1945     assert(reg != rscratch2, "invalid dst for register decrement");
1946     mov(rscratch2, (unsigned long)value);
1947     sub(reg, reg, rscratch2);
1948   }
1949 }
1950 
1951 void MacroAssembler::decrementw(Address dst, int value)
1952 {
1953   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1954   if (dst.getMode() == Address::literal) {
1955     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1956     lea(rscratch2, dst);
1957     dst = Address(rscratch2);
1958   }
1959   ldrw(rscratch1, dst);
1960   decrementw(rscratch1, value);
1961   strw(rscratch1, dst);
1962 }
1963 
1964 void MacroAssembler::decrement(Address dst, int value)
1965 {
1966   assert(!dst.uses(rscratch1), "invalid address for decrement");
1967   if (dst.getMode() == Address::literal) {
1968     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1969     lea(rscratch2, dst);
1970     dst = Address(rscratch2);
1971   }
1972   ldr(rscratch1, dst);
1973   decrement(rscratch1, value);
1974   str(rscratch1, dst);
1975 }
1976 
1977 void MacroAssembler::incrementw(Register reg, int value)
1978 {
1979   if (value < 0)  { decrementw(reg, -value);      return; }
1980   if (value == 0) {                               return; }
1981   if (value < (1 << 12)) { addw(reg, reg, value); return; }
1982   /* else */ {
1983     assert(reg != rscratch2, "invalid dst for register increment");
1984     movw(rscratch2, (unsigned)value);
1985     addw(reg, reg, rscratch2);
1986   }
1987 }
1988 
1989 void MacroAssembler::increment(Register reg, int value)
1990 {
1991   if (value < 0)  { decrement(reg, -value);      return; }
1992   if (value == 0) {                              return; }
1993   if (value < (1 << 12)) { add(reg, reg, value); return; }
1994   /* else */ {
1995     assert(reg != rscratch2, "invalid dst for register increment");
1996     movw(rscratch2, (unsigned)value);
1997     add(reg, reg, rscratch2);
1998   }
1999 }
2000 
2001 void MacroAssembler::incrementw(Address dst, int value)
2002 {
2003   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2004   if (dst.getMode() == Address::literal) {
2005     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2006     lea(rscratch2, dst);
2007     dst = Address(rscratch2);
2008   }
2009   ldrw(rscratch1, dst);
2010   incrementw(rscratch1, value);
2011   strw(rscratch1, dst);
2012 }
2013 
2014 void MacroAssembler::increment(Address dst, int value)
2015 {
2016   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2017   if (dst.getMode() == Address::literal) {
2018     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2019     lea(rscratch2, dst);
2020     dst = Address(rscratch2);
2021   }
2022   ldr(rscratch1, dst);
2023   increment(rscratch1, value);
2024   str(rscratch1, dst);
2025 }
2026 
2027 
2028 void MacroAssembler::pusha() {
2029   push(0x7fffffff, sp);
2030 }
2031 
2032 void MacroAssembler::popa() {
2033   pop(0x7fffffff, sp);
2034 }
2035 
2036 // Push lots of registers in the bit set supplied.  Don't push sp.
2037 // Return the number of words pushed
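// For example, push(0b0110, sp) stores r1 and r2 with a single stp,
// pre-decrementing the stack by two words, and returns 2.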
2038 int MacroAssembler::push(unsigned int bitset, Register stack) {
2039   int words_pushed = 0;
2040 
2041   // Scan bitset to accumulate register pairs
2042   unsigned char regs[32];
2043   int count = 0;
2044   for (int reg = 0; reg <= 30; reg++) {
2045     if (1 & bitset)
2046       regs[count++] = reg;
2047     bitset >>= 1;
2048   }
2049   regs[count++] = zr->encoding_nocheck();
  count &= ~1;  // Only push an even number of regs
2051 
2052   if (count) {
2053     stp(as_Register(regs[0]), as_Register(regs[1]),
2054        Address(pre(stack, -count * wordSize)));
2055     words_pushed += 2;
2056   }
2057   for (int i = 2; i < count; i += 2) {
2058     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2059        Address(stack, i * wordSize));
2060     words_pushed += 2;
2061   }
2062 
2063   assert(words_pushed == count, "oops, pushed != count");
2064 
2065   return count;
2066 }
2067 
2068 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2069   int words_pushed = 0;
2070 
2071   // Scan bitset to accumulate register pairs
2072   unsigned char regs[32];
2073   int count = 0;
2074   for (int reg = 0; reg <= 30; reg++) {
2075     if (1 & bitset)
2076       regs[count++] = reg;
2077     bitset >>= 1;
2078   }
2079   regs[count++] = zr->encoding_nocheck();
2080   count &= ~1;
2081 
2082   for (int i = 2; i < count; i += 2) {
2083     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2084        Address(stack, i * wordSize));
2085     words_pushed += 2;
2086   }
2087   if (count) {
2088     ldp(as_Register(regs[0]), as_Register(regs[1]),
2089        Address(post(stack, count * wordSize)));
2090     words_pushed += 2;
2091   }
2092 
2093   assert(words_pushed == count, "oops, pushed != count");
2094 
2095   return count;
2096 }
2097 #ifdef ASSERT
2098 void MacroAssembler::verify_heapbase(const char* msg) {
2099 #if 0
2100   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2101   assert (Universe::heap() != NULL, "java heap should be initialized");
2102   if (CheckCompressedOops) {
2103     Label ok;
2104     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2105     cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2106     br(Assembler::EQ, ok);
2107     stop(msg);
2108     bind(ok);
2109     pop(1 << rscratch1->encoding(), sp);
2110   }
2111 #endif
2112 }
2113 #endif
2114 
2115 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2116   Label done, not_weak;
2117   cbz(value, done);           // Use NULL as-is.
2118 
2119   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
  tbz(value, 0, not_weak); // Test for jweak tag.
2121 
2122   // Resolve jweak.
2123   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2124                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2125   verify_oop(value);
2126   b(done);
2127 
2128   bind(not_weak);
2129   // Resolve (untagged) jobject.
2130   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2131   verify_oop(value);
2132   bind(done);
2133 }
2134 
2135 void MacroAssembler::stop(const char* msg) {
2136   address ip = pc();
2137   pusha();
2138   mov(c_rarg0, (address)msg);
2139   mov(c_rarg1, (address)ip);
2140   mov(c_rarg2, sp);
2141   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2142   // call(c_rarg3);
2143   blrt(c_rarg3, 3, 0, 1);
2144   hlt(0);
2145 }
2146 
2147 void MacroAssembler::unimplemented(const char* what) {
2148   const char* buf = NULL;
2149   {
2150     ResourceMark rm;
2151     stringStream ss;
2152     ss.print("unimplemented: %s", what);
2153     buf = code_string(ss.as_string());
2154   }
2155   stop(buf);
2156 }
2157 
2158 // If a constant does not fit in an immediate field, generate some
2159 // number of MOV instructions and then perform the operation.
2160 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2161                                            add_sub_imm_insn insn1,
2162                                            add_sub_reg_insn insn2) {
2163   assert(Rd != zr, "Rd = zr and not setting flags?");
2164   if (operand_valid_for_add_sub_immediate((int)imm)) {
2165     (this->*insn1)(Rd, Rn, imm);
2166   } else {
2167     if (uabs(imm) < (1 << 24)) {
2168        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2169        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2170     } else {
2171        assert_different_registers(Rd, Rn);
2172        mov(Rd, (uint64_t)imm);
2173        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2174     }
2175   }
2176 }
2177 
// Separate version which sets the flags.  Optimisations are more
// restricted because we must set the flags correctly.
2180 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2181                                            add_sub_imm_insn insn1,
2182                                            add_sub_reg_insn insn2) {
2183   if (operand_valid_for_add_sub_immediate((int)imm)) {
2184     (this->*insn1)(Rd, Rn, imm);
2185   } else {
2186     assert_different_registers(Rd, Rn);
2187     assert(Rd != zr, "overflow in immediate operand");
2188     mov(Rd, (uint64_t)imm);
2189     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2190   }
2191 }
2192 
2193 
2194 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2195   if (increment.is_register()) {
2196     add(Rd, Rn, increment.as_register());
2197   } else {
2198     add(Rd, Rn, increment.as_constant());
2199   }
2200 }
2201 
2202 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2203   if (increment.is_register()) {
2204     addw(Rd, Rn, increment.as_register());
2205   } else {
2206     addw(Rd, Rn, increment.as_constant());
2207   }
2208 }
2209 
2210 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2211   if (decrement.is_register()) {
2212     sub(Rd, Rn, decrement.as_register());
2213   } else {
2214     sub(Rd, Rn, decrement.as_constant());
2215   }
2216 }
2217 
2218 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2219   if (decrement.is_register()) {
2220     subw(Rd, Rn, decrement.as_register());
2221   } else {
2222     subw(Rd, Rn, decrement.as_constant());
2223   }
2224 }
2225 
2226 void MacroAssembler::reinit_heapbase()
2227 {
2228   if (UseCompressedOops) {
2229     if (Universe::is_fully_initialized()) {
2230       mov(rheapbase, Universe::narrow_ptrs_base());
2231     } else {
2232       lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2233       ldr(rheapbase, Address(rheapbase));
2234     }
2235   }
2236 }
2237 
// This simulates the behaviour of the x86 cmpxchg instruction using a
// load linked/store conditional pair.  We use the acquire/release
// versions of these instructions so that we flush pending writes as
// per Java semantics.

// N.B. the x86 version assumes the old value to be compared against
// is in rax and updates rax with the value located in memory if the
// cmpxchg fails.  We supply a register for the old value explicitly.

// The AArch64 load linked/store conditional instructions do not
// accept an offset, so, unlike x86, we must provide a plain register
// to identify the memory word to be compared/exchanged rather than a
// register+offset Address.
2251 
2252 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2253                                 Label &succeed, Label *fail) {
2254   // oldv holds comparison value
2255   // newv holds value to write in exchange
2256   // addr identifies memory word to compare against/update
2257   if (UseLSE) {
2258     mov(tmp, oldv);
2259     casal(Assembler::xword, oldv, newv, addr);
2260     cmp(tmp, oldv);
2261     br(Assembler::EQ, succeed);
2262     membar(AnyAny);
2263   } else {
2264     Label retry_load, nope;
2265     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2266       prfm(Address(addr), PSTL1STRM);
2267     bind(retry_load);
2268     // flush and load exclusive from the memory location
2269     // and fail if it is not what we expect
2270     ldaxr(tmp, addr);
2271     cmp(tmp, oldv);
2272     br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
2274     stlxr(tmp, newv, addr);
2275     cbzw(tmp, succeed);
2276     // retry so we only ever return after a load fails to compare
2277     // ensures we don't return a stale value after a failed write.
2278     b(retry_load);
2279     // if the memory word differs we return it in oldv and signal a fail
2280     bind(nope);
2281     membar(AnyAny);
2282     mov(oldv, tmp);
2283   }
2284   if (fail)
2285     b(*fail);
2286 }
2287 
2288 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2289                                         Label &succeed, Label *fail) {
2290   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2291   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2292 }
2293 
2294 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2295                                 Label &succeed, Label *fail) {
2296   // oldv holds comparison value
2297   // newv holds value to write in exchange
2298   // addr identifies memory word to compare against/update
2299   // tmp returns 0/1 for success/failure
2300   if (UseLSE) {
2301     mov(tmp, oldv);
2302     casal(Assembler::word, oldv, newv, addr);
2303     cmp(tmp, oldv);
2304     br(Assembler::EQ, succeed);
2305     membar(AnyAny);
2306   } else {
2307     Label retry_load, nope;
2308     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2309       prfm(Address(addr), PSTL1STRM);
2310     bind(retry_load);
2311     // flush and load exclusive from the memory location
2312     // and fail if it is not what we expect
2313     ldaxrw(tmp, addr);
2314     cmp(tmp, oldv);
2315     br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
2317     stlxrw(tmp, newv, addr);
2318     cbzw(tmp, succeed);
2319     // retry so we only ever return after a load fails to compare
2320     // ensures we don't return a stale value after a failed write.
2321     b(retry_load);
2322     // if the memory word differs we return it in oldv and signal a fail
2323     bind(nope);
2324     membar(AnyAny);
2325     mov(oldv, tmp);
2326   }
2327   if (fail)
2328     b(*fail);
2329 }
2330 
// A generic CAS; success or failure is in the EQ flag.  A weak CAS
// doesn't retry and may fail spuriously.  If the oldval is wanted,
// pass a register for the result; otherwise pass noreg.
2334 
2335 // Clobbers rscratch1
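// On return the EQ flag is set iff the exchange succeeded, so a
// caller typically follows with br(Assembler::EQ, ...).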
2336 void MacroAssembler::cmpxchg(Register addr, Register expected,
2337                              Register new_val,
2338                              enum operand_size size,
2339                              bool acquire, bool release,
2340                              bool weak,
2341                              Register result) {
2342   if (result == noreg)  result = rscratch1;
2343   if (UseLSE) {
2344     mov(result, expected);
2345     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2346     cmp(result, expected);
2347   } else {
2348     BLOCK_COMMENT("cmpxchg {");
2349     Label retry_load, done;
2350     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2351       prfm(Address(addr), PSTL1STRM);
2352     bind(retry_load);
2353     load_exclusive(result, addr, size, acquire);
2354     if (size == xword)
2355       cmp(result, expected);
2356     else
2357       cmpw(result, expected);
2358     br(Assembler::NE, done);
2359     store_exclusive(rscratch1, new_val, addr, size, release);
2360     if (weak) {
2361       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2362     } else {
2363       cbnzw(rscratch1, retry_load);
2364     }
2365     bind(done);
2366     BLOCK_COMMENT("} cmpxchg");
2367   }
2368 }
2369 
2370 static bool different(Register a, RegisterOrConstant b, Register c) {
2371   if (b.is_constant())
2372     return a != c;
2373   else
2374     return a != b.as_register() && a != c && b.as_register() != c;
2375 }
2376 
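// Define the atomic_add family.  prev, if it is a valid register,
// receives the value that was in memory before the update: with LSE
// this is a single ldadd(al); otherwise an ldxr/stxr retry loop
// reconstructs the old value with the inverse operation if needed.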
2377 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2378 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2379   if (UseLSE) {                                                         \
2380     prev = prev->is_valid() ? prev : zr;                                \
2381     if (incr.is_register()) {                                           \
2382       AOP(sz, incr.as_register(), prev, addr);                          \
2383     } else {                                                            \
2384       mov(rscratch2, incr.as_constant());                               \
2385       AOP(sz, rscratch2, prev, addr);                                   \
2386     }                                                                   \
2387     return;                                                             \
2388   }                                                                     \
2389   Register result = rscratch2;                                          \
2390   if (prev->is_valid())                                                 \
2391     result = different(prev, incr, addr) ? prev : rscratch2;            \
2392                                                                         \
2393   Label retry_load;                                                     \
2394   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2395     prfm(Address(addr), PSTL1STRM);                                     \
2396   bind(retry_load);                                                     \
2397   LDXR(result, addr);                                                   \
2398   OP(rscratch1, result, incr);                                          \
2399   STXR(rscratch2, rscratch1, addr);                                     \
2400   cbnzw(rscratch2, retry_load);                                         \
2401   if (prev->is_valid() && prev != result) {                             \
2402     IOP(prev, rscratch1, incr);                                         \
2403   }                                                                     \
2404 }
2405 
2406 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2407 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2408 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2409 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2410 
2411 #undef ATOMIC_OP
2412 
2413 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2414 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2415   if (UseLSE) {                                                         \
2416     prev = prev->is_valid() ? prev : zr;                                \
2417     AOP(sz, newv, prev, addr);                                          \
2418     return;                                                             \
2419   }                                                                     \
2420   Register result = rscratch2;                                          \
2421   if (prev->is_valid())                                                 \
2422     result = different(prev, newv, addr) ? prev : rscratch2;            \
2423                                                                         \
2424   Label retry_load;                                                     \
2425   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2426     prfm(Address(addr), PSTL1STRM);                                     \
2427   bind(retry_load);                                                     \
2428   LDXR(result, addr);                                                   \
2429   STXR(rscratch1, newv, addr);                                          \
2430   cbnzw(rscratch1, retry_load);                                         \
2431   if (prev->is_valid() && prev != result)                               \
2432     mov(prev, result);                                                  \
2433 }
2434 
2435 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2436 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2437 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2438 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2439 
2440 #undef ATOMIC_XCHG
2441 
2442 #ifndef PRODUCT
2443 extern "C" void findpc(intptr_t x);
2444 #endif
2445 
2446 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2447 {
  // In order to get locks to work, we need to fake an in_VM state
  if (ShowMessageBoxOnError) {
2450     JavaThread* thread = JavaThread::current();
2451     JavaThreadState saved_state = thread->thread_state();
2452     thread->set_thread_state(_thread_in_vm);
2453 #ifndef PRODUCT
2454     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2455       ttyLocker ttyl;
2456       BytecodeCounter::print();
2457     }
2458 #endif
2459     if (os::message_box(msg, "Execution stopped, print registers?")) {
2460       ttyLocker ttyl;
2461       tty->print_cr(" pc = 0x%016lx", pc);
2462 #ifndef PRODUCT
2463       tty->cr();
2464       findpc(pc);
2465       tty->cr();
2466 #endif
2467       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2468       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2469       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2470       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2471       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2472       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2473       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2474       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2475       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2476       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2477       tty->print_cr("r10 = 0x%016lx", regs[10]);
2478       tty->print_cr("r11 = 0x%016lx", regs[11]);
2479       tty->print_cr("r12 = 0x%016lx", regs[12]);
2480       tty->print_cr("r13 = 0x%016lx", regs[13]);
2481       tty->print_cr("r14 = 0x%016lx", regs[14]);
2482       tty->print_cr("r15 = 0x%016lx", regs[15]);
2483       tty->print_cr("r16 = 0x%016lx", regs[16]);
2484       tty->print_cr("r17 = 0x%016lx", regs[17]);
2485       tty->print_cr("r18 = 0x%016lx", regs[18]);
2486       tty->print_cr("r19 = 0x%016lx", regs[19]);
2487       tty->print_cr("r20 = 0x%016lx", regs[20]);
2488       tty->print_cr("r21 = 0x%016lx", regs[21]);
2489       tty->print_cr("r22 = 0x%016lx", regs[22]);
2490       tty->print_cr("r23 = 0x%016lx", regs[23]);
2491       tty->print_cr("r24 = 0x%016lx", regs[24]);
2492       tty->print_cr("r25 = 0x%016lx", regs[25]);
2493       tty->print_cr("r26 = 0x%016lx", regs[26]);
2494       tty->print_cr("r27 = 0x%016lx", regs[27]);
2495       tty->print_cr("r28 = 0x%016lx", regs[28]);
2496       tty->print_cr("r30 = 0x%016lx", regs[30]);
2497       tty->print_cr("r31 = 0x%016lx", regs[31]);
2498       BREAKPOINT;
2499     }
2500     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2501   } else {
2502     ttyLocker ttyl;
2503     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2504                     msg);
2505     assert(false, "DEBUG MESSAGE: %s", msg);
2506   }
2507 }
2508 
2509 #ifdef BUILTIN_SIM
// routine to generate an x86 prolog for a stub function which
// bootstraps into the generated ARM code that directly follows the
// stub
//
// the argument encodes the number of general and fp registers
// passed by the caller and the calling convention (currently just
// the number of general registers and assumes C argument passing)
2517 
2518 extern "C" {
2519 int aarch64_stub_prolog_size();
2520 void aarch64_stub_prolog();
2521 void aarch64_prolog();
2522 }
2523 
2524 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
2525                                    address *prolog_ptr)
2526 {
2527   int calltype = (((ret_type & 0x3) << 8) |
2528                   ((fp_arg_count & 0xf) << 4) |
2529                   (gp_arg_count & 0xf));
2530 
2531   // the addresses for the x86 to ARM entry code we need to use
2532   address start = pc();
2533   // printf("start = %lx\n", start);
2534   int byteCount =  aarch64_stub_prolog_size();
2535   // printf("byteCount = %x\n", byteCount);
2536   int instructionCount = (byteCount + 3)/ 4;
2537   // printf("instructionCount = %x\n", instructionCount);
2538   for (int i = 0; i < instructionCount; i++) {
2539     nop();
2540   }
2541 
2542   memcpy(start, (void*)aarch64_stub_prolog, byteCount);
2543 
  // write the address of the setup routine and the call format at the
  // end of the copied code
2546   u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
2547   if (prolog_ptr)
2548     patch_end[-2] = (u_int64_t)prolog_ptr;
2549   patch_end[-1] = calltype;
2550 }
2551 #endif
2552 
2553 void MacroAssembler::push_call_clobbered_registers() {
2554   int step = 4 * wordSize;
2555   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2556   sub(sp, sp, step);
2557   mov(rscratch1, -step);
2558   // Push v0-v7, v16-v31.
2559   for (int i = 31; i>= 4; i -= 4) {
2560     if (i <= v7->encoding() || i >= v16->encoding())
2561       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2562           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2563   }
2564   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2565       as_FloatRegister(3), T1D, Address(sp));
2566 }
2567 
2568 void MacroAssembler::pop_call_clobbered_registers() {
2569   for (int i = 0; i < 32; i += 4) {
2570     if (i <= v7->encoding() || i >= v16->encoding())
2571       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2572           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2573   }
2574 
2575   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2576 }
2577 
2578 void MacroAssembler::push_CPU_state(bool save_vectors) {
2579   int step = (save_vectors ? 8 : 4) * wordSize;
2580   push(0x3fffffff, sp);         // integer registers except lr & sp
2581   mov(rscratch1, -step);
2582   sub(sp, sp, step);
2583   for (int i = 28; i >= 4; i -= 4) {
2584     st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2585         as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2586   }
2587   st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2588 }
2589 
2590 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2591   int step = (restore_vectors ? 8 : 4) * wordSize;
2592   for (int i = 0; i <= 28; i += 4)
2593     ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2594         as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2595   pop(0x3fffffff, sp);         // integer registers except lr & sp
2596 }
2597 
2598 /**
2599  * Helpers for multiply_to_len().
2600  */
2601 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2602                                      Register src1, Register src2) {
2603   adds(dest_lo, dest_lo, src1);
2604   adc(dest_hi, dest_hi, zr);
2605   adds(dest_lo, dest_lo, src2);
2606   adc(final_dest_hi, dest_hi, zr);
2607 }
2608 
2609 // Generate an address from (r + r1 extend offset).  "size" is the
2610 // size of the operand.  The result may be in rscratch2.
2611 Address MacroAssembler::offsetted_address(Register r, Register r1,
2612                                           Address::extend ext, int offset, int size) {
2613   if (offset || (ext.shift() % size != 0)) {
2614     lea(rscratch2, Address(r, r1, ext));
2615     return Address(rscratch2, offset);
2616   } else {
2617     return Address(r, r1, ext);
2618   }
2619 }
2620 
2621 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2622 {
2623   assert(offset >= 0, "spill to negative address?");
2624   // Offset reachable ?
2625   //   Not aligned - 9 bits signed offset
2626   //   Aligned - 12 bits unsigned offset shifted
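  // For example, an aligned offset of 0x21008 with size == 8 becomes
  // add(tmp, sp, 0x21000) followed by Address(tmp, 8).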
2627   Register base = sp;
2628   if ((offset & (size-1)) && offset >= (1<<8)) {
2629     add(tmp, base, offset & ((1<<12)-1));
2630     base = tmp;
2631     offset &= -1<<12;
2632   }
2633 
2634   if (offset >= (1<<12) * size) {
2635     add(tmp, base, offset & (((1<<12)-1)<<12));
2636     base = tmp;
2637     offset &= ~(((1<<12)-1)<<12);
2638   }
2639 
2640   return Address(base, offset);
2641 }
2642 
2643 // Checks whether offset is aligned.
2644 // Returns true if it is, else false.
2645 bool MacroAssembler::merge_alignment_check(Register base,
2646                                            size_t size,
2647                                            long cur_offset,
2648                                            long prev_offset) const {
2649   if (AvoidUnalignedAccesses) {
2650     if (base == sp) {
      // Checks whether the lower offset is aligned for a load/store pair.
2652       long pair_mask = size * 2 - 1;
2653       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2654       return (offset & pair_mask) == 0;
2655     } else { // If base is not sp, we can't guarantee the access is aligned.
2656       return false;
2657     }
2658   } else {
2659     long mask = size - 1;
    // Load/store pair instructions only support element-size-aligned offsets.
2661     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2662   }
2663 }
2664 
2665 // Checks whether current and previous loads/stores can be merged.
2666 // Returns true if it can be merged, else false.
2667 bool MacroAssembler::ldst_can_merge(Register rt,
2668                                     const Address &adr,
2669                                     size_t cur_size_in_bytes,
2670                                     bool is_store) const {
2671   address prev = pc() - NativeInstruction::instruction_size;
2672   address last = code()->last_insn();
2673 
2674   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2675     return false;
2676   }
2677 
2678   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2679     return false;
2680   }
2681 
2682   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2683   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2684 
2685   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2686   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2687 
2688   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2689     return false;
2690   }
2691 
2692   long max_offset = 63 * prev_size_in_bytes;
2693   long min_offset = -64 * prev_size_in_bytes;
2694 
  assert(prev_ldst->is_not_pre_post_index(), "pre-indexed or post-indexed accesses cannot be merged.");
2696 
2697   // Only same base can be merged.
2698   if (adr.base() != prev_ldst->base()) {
2699     return false;
2700   }
2701 
2702   long cur_offset = adr.offset();
2703   long prev_offset = prev_ldst->offset();
2704   size_t diff = abs(cur_offset - prev_offset);
2705   if (diff != prev_size_in_bytes) {
2706     return false;
2707   }
2708 
  // The following cases cannot be merged:
  // ldr x2, [x2, #8]
  // ldr x3, [x2, #16]
  // or:
  // ldr x2, [x3, #8]
  // ldr x2, [x3, #16]
  // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2716   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2717     return false;
2718   }
2719 
2720   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2721   // Offset range must be in ldp/stp instruction's range.
2722   if (low_offset > max_offset || low_offset < min_offset) {
2723     return false;
2724   }
2725 
2726   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2727     return true;
2728   }
2729 
2730   return false;
2731 }
2732 
2733 // Merge current load/store with previous load/store into ldp/stp.
2734 void MacroAssembler::merge_ldst(Register rt,
2735                                 const Address &adr,
2736                                 size_t cur_size_in_bytes,
2737                                 bool is_store) {
2738 
2739   assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged.");
2740 
2741   Register rt_low, rt_high;
2742   address prev = pc() - NativeInstruction::instruction_size;
2743   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2744 
2745   long offset;
2746 
2747   if (adr.offset() < prev_ldst->offset()) {
2748     offset = adr.offset();
2749     rt_low = rt;
2750     rt_high = prev_ldst->target();
2751   } else {
2752     offset = prev_ldst->offset();
2753     rt_low = prev_ldst->target();
2754     rt_high = rt;
2755   }
2756 
2757   Address adr_p = Address(prev_ldst->base(), offset);
2758   // Overwrite the previously emitted instruction.
2759   code_section()->set_end(prev);
2760 
2761   const int sz = prev_ldst->size_in_bytes();
2762   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
2763   if (!is_store) {
2764     BLOCK_COMMENT("merged ldr pair");
2765     if (sz == 8) {
2766       ldp(rt_low, rt_high, adr_p);
2767     } else {
2768       ldpw(rt_low, rt_high, adr_p);
2769     }
2770   } else {
2771     BLOCK_COMMENT("merged str pair");
2772     if (sz == 8) {
2773       stp(rt_low, rt_high, adr_p);
2774     } else {
2775       stpw(rt_low, rt_high, adr_p);
2776     }
2777   }
2778 }
2779 
2780 /**
2781  * Multiply 64 bit by 64 bit: first loop.
2782  */
2783 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2784                                            Register y, Register y_idx, Register z,
2785                                            Register carry, Register product,
2786                                            Register idx, Register kdx) {
2787   //
2788   //  jlong carry, x[], y[], z[];
2789   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2790   //    huge_128 product = y[idx] * x[xstart] + carry;
2791   //    z[kdx] = (jlong)product;
2792   //    carry  = (jlong)(product >>> 64);
2793   //  }
2794   //  z[xstart] = carry;
2795   //
2796 
2797   Label L_first_loop, L_first_loop_exit;
2798   Label L_one_x, L_one_y, L_multiply;
2799 
2800   subsw(xstart, xstart, 1);
2801   br(Assembler::MI, L_one_x);
2802 
2803   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2804   ldr(x_xstart, Address(rscratch1));
2805   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2806 
2807   bind(L_first_loop);
2808   subsw(idx, idx, 1);
2809   br(Assembler::MI, L_first_loop_exit);
2810   subsw(idx, idx, 1);
2811   br(Assembler::MI, L_one_y);
2812   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2813   ldr(y_idx, Address(rscratch1));
2814   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2815   bind(L_multiply);
2816 
2817   // AArch64 has a multiply-accumulate instruction that we can't use
2818   // here because it has no way to process carries, so we have to use
2819   // separate add and adc instructions.  Bah.
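       // In effect (C-like sketch of the four instructions below):
       //   unsigned __int128 p = (unsigned __int128)x_xstart * y_idx + carry;
       //   product = (uint64_t)p;           // low 64 bits
       //   carry   = (uint64_t)(p >> 64);   // high 64 bits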
2820   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2821   mul(product, x_xstart, y_idx);
2822   adds(product, product, carry);
2823   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2824 
2825   subw(kdx, kdx, 2);
2826   ror(product, product, 32); // back to big-endian
2827   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2828 
2829   b(L_first_loop);
2830 
2831   bind(L_one_y);
2832   ldrw(y_idx, Address(y,  0));
2833   b(L_multiply);
2834 
2835   bind(L_one_x);
2836   ldrw(x_xstart, Address(x,  0));
2837   b(L_first_loop);
2838 
2839   bind(L_first_loop_exit);
2840 }
2841 
2842 /**
2843  * Multiply 128 bit by 128 bit. Unrolled inner loop.
2844  *
2845  */
2846 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2847                                              Register carry, Register carry2,
2848                                              Register idx, Register jdx,
2849                                              Register yz_idx1, Register yz_idx2,
2850                                              Register tmp, Register tmp3, Register tmp4,
2851                                              Register tmp6, Register product_hi) {
2852 
2853   //   jlong carry, x[], y[], z[];
2854   //   int kdx = ystart+1;
2855   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2856   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2857   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2858   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2859   //     carry  = (jlong)(tmp4 >>> 64);
2860   //     z[kdx+idx+1] = (jlong)tmp3;
2861   //     z[kdx+idx] = (jlong)tmp4;
2862   //   }
2863   //   idx += 2;
2864   //   if (idx > 0) {
2865   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2866   //     z[kdx+idx] = (jlong)yz_idx1;
2867   //     carry  = (jlong)(yz_idx1 >>> 64);
2868   //   }
2869   //
2870 
2871   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2872 
2873   lsrw(jdx, idx, 2);
2874 
2875   bind(L_third_loop);
2876 
2877   subsw(jdx, jdx, 1);
2878   br(Assembler::MI, L_third_loop_exit);
2879   subw(idx, idx, 4);
2880 
2881   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2882 
2883   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2884 
2885   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2886 
2887   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2888   ror(yz_idx2, yz_idx2, 32);
2889 
2890   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2891 
2892   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2893   umulh(tmp4, product_hi, yz_idx1);
2894 
2895   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2896   ror(rscratch2, rscratch2, 32);
2897 
2898   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2899   umulh(carry2, product_hi, yz_idx2);
2900 
2901   // propagate sum of both multiplications into carry:tmp4:tmp3
2902   adds(tmp3, tmp3, carry);
2903   adc(tmp4, tmp4, zr);
2904   adds(tmp3, tmp3, rscratch1);
2905   adcs(tmp4, tmp4, tmp);
2906   adc(carry, carry2, zr);
2907   adds(tmp4, tmp4, rscratch2);
2908   adc(carry, carry, zr);
2909 
2910   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2911   ror(tmp4, tmp4, 32);
2912   stp(tmp4, tmp3, Address(tmp6, 0));
2913 
2914   b(L_third_loop);
2915   bind (L_third_loop_exit);
2916 
2917   andw (idx, idx, 0x3);
2918   cbz(idx, L_post_third_loop_done);
2919 
2920   Label L_check_1;
2921   subsw(idx, idx, 2);
2922   br(Assembler::MI, L_check_1);
2923 
2924   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2925   ldr(yz_idx1, Address(rscratch1, 0));
2926   ror(yz_idx1, yz_idx1, 32);
2927   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2928   umulh(tmp4, product_hi, yz_idx1);
2929   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2930   ldr(yz_idx2, Address(rscratch1, 0));
2931   ror(yz_idx2, yz_idx2, 32);
2932 
2933   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2934 
2935   ror(tmp3, tmp3, 32);
2936   str(tmp3, Address(rscratch1, 0));
2937 
2938   bind (L_check_1);
2939 
2940   andw (idx, idx, 0x1);
2941   subsw(idx, idx, 1);
2942   br(Assembler::MI, L_post_third_loop_done);
2943   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2944   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
2945   umulh(carry2, tmp4, product_hi);
2946   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2947 
2948   add2_with_carry(carry2, tmp3, tmp4, carry);
2949 
2950   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2951   extr(carry, carry2, tmp3, 32);
2952 
2953   bind(L_post_third_loop_done);
2954 }
2955 
2956 /**
2957  * Code for BigInteger::multiplyToLen() intrinsic.
2958  *
2959  * r0: x
2960  * r1: xlen
2961  * r2: y
2962  * r3: ylen
2963  * r4:  z
2964  * r5: zlen
2965  * r10: tmp1
2966  * r11: tmp2
2967  * r12: tmp3
2968  * r13: tmp4
2969  * r14: tmp5
2970  * r15: tmp6
2971  * r16: tmp7
2972  *
2973  */
2974 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
2975                                      Register z, Register zlen,
2976                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
2977                                      Register tmp5, Register tmp6, Register product_hi) {
2978 
2979   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2980 
2981   const Register idx = tmp1;
2982   const Register kdx = tmp2;
2983   const Register xstart = tmp3;
2984 
2985   const Register y_idx = tmp4;
2986   const Register carry = tmp5;
2987   const Register product  = xlen;
2988   const Register x_xstart = zlen;  // reuse register
2989 
2990   // First Loop.
2991   //
2992   //  final static long LONG_MASK = 0xffffffffL;
2993   //  int xstart = xlen - 1;
2994   //  int ystart = ylen - 1;
2995   //  long carry = 0;
2996   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2997   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
2998   //    z[kdx] = (int)product;
2999   //    carry = product >>> 32;
3000   //  }
3001   //  z[xstart] = (int)carry;
3002   //
3003 
3004   movw(idx, ylen);      // idx = ylen;
3005   movw(kdx, zlen);      // kdx = xlen+ylen;
3006   mov(carry, zr);       // carry = 0;
3007 
3008   Label L_done;
3009 
3010   movw(xstart, xlen);
3011   subsw(xstart, xstart, 1);
3012   br(Assembler::MI, L_done);
3013 
3014   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3015 
3016   Label L_second_loop;
3017   cbzw(kdx, L_second_loop);
3018 
3019   Label L_carry;
3020   subw(kdx, kdx, 1);
3021   cbzw(kdx, L_carry);
3022 
3023   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3024   lsr(carry, carry, 32);
3025   subw(kdx, kdx, 1);
3026 
3027   bind(L_carry);
3028   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3029 
3030   // Second and third (nested) loops.
3031   //
3032   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3033   //   carry = 0;
3034   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3035   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3036   //                    (z[k] & LONG_MASK) + carry;
3037   //     z[k] = (int)product;
3038   //     carry = product >>> 32;
3039   //   }
3040   //   z[i] = (int)carry;
3041   // }
3042   //
3043   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3044 
3045   const Register jdx = tmp1;
3046 
3047   bind(L_second_loop);
3048   mov(carry, zr);                // carry = 0;
3049   movw(jdx, ylen);               // j = ystart+1
3050 
3051   subsw(xstart, xstart, 1);      // i = xstart-1;
3052   br(Assembler::MI, L_done);
3053 
3054   str(z, Address(pre(sp, -4 * wordSize)));
3055 
3056   Label L_last_x;
3057   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3058   subsw(xstart, xstart, 1);       // i = xstart-1;
3059   br(Assembler::MI, L_last_x);
3060 
3061   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3062   ldr(product_hi, Address(rscratch1));
3063   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3064 
3065   Label L_third_loop_prologue;
3066   bind(L_third_loop_prologue);
3067 
3068   str(ylen, Address(sp, wordSize));
3069   stp(x, xstart, Address(sp, 2 * wordSize));
3070   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3071                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3072   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3073   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3074 
3075   addw(tmp3, xlen, 1);
3076   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3077   subsw(tmp3, tmp3, 1);
3078   br(Assembler::MI, L_done);
3079 
3080   lsr(carry, carry, 32);
3081   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3082   b(L_second_loop);
3083 
3084   // The following infrequent code has been moved out of the loops.
3085   bind(L_last_x);
3086   ldrw(product_hi, Address(x,  0));
3087   b(L_third_loop_prologue);
3088 
3089   bind(L_done);
3090 }
3091 
3092 // Code for BigInteger::mulAdd intrinsic
3093 // out     = r0
3094 // in      = r1
3095 // offset  = r2  (already out.length-offset)
3096 // len     = r3
3097 // k       = r4
3098 //
3099 // pseudo code from java implementation:
3100 // carry = 0;
3101 // offset = out.length-offset - 1;
3102 // for (int j=len-1; j >= 0; j--) {
3103 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3104 //     out[offset--] = (int)product;
3105 //     carry = product >>> 32;
3106 // }
3107 // return (int)carry;
3108 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3109       Register len, Register k) {
3110     Label LOOP, END;
3111     // pre-loop
3112     cmp(len, zr); // cmp, not cbz/cbnz: reuses the condition twice => fewer branches
3113     csel(out, zr, out, Assembler::EQ);
3114     br(Assembler::EQ, END);
3115     add(in, in, len, LSL, 2); // in[j+1] address
3116     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3117     mov(out, zr); // used to keep carry now
3118     BIND(LOOP);
3119     ldrw(rscratch1, Address(pre(in, -4)));
3120     madd(rscratch1, rscratch1, k, out);
3121     ldrw(rscratch2, Address(pre(offset, -4)));
3122     add(rscratch1, rscratch1, rscratch2);
3123     strw(rscratch1, Address(offset));
3124     lsr(out, rscratch1, 32);
3125     subs(len, len, 1);
3126     br(Assembler::NE, LOOP);
3127     BIND(END);
3128 }
3129 
3130 /**
3131  * Emits code to update CRC-32 with a byte value according to constants in table
3132  *
3133  * @param [in,out]crc   Register containing the crc.
3134  * @param [in]val       Register containing the byte to fold into the CRC.
3135  * @param [in]table     Register containing the table of crc constants.
3136  *
3137  * uint32_t crc;
3138  * val = crc_table[(val ^ crc) & 0xFF];
3139  * crc = val ^ (crc >> 8);
3140  *
3141  */
3142 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3143   eor(val, val, crc);
3144   andr(val, val, 0xff);
3145   ldrw(val, Address(table, val, Address::lsl(2)));
3146   eor(crc, val, crc, Assembler::LSR, 8);
3147 }
3148 
3149 /**
3150  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3151  *
3152  * @param [in,out]crc   Register containing the crc.
3153  * @param [in]v         Register containing the 32-bit value to fold into the CRC.
3154  * @param [in]table0    Register containing table 0 of crc constants.
3155  * @param [in]table1    Register containing table 1 of crc constants.
3156  * @param [in]table2    Register containing table 2 of crc constants.
3157  * @param [in]table3    Register containing table 3 of crc constants.
3158  *
3159  * uint32_t crc;
3160  *   v = crc ^ v
3161  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3162  *
3163  */
3164 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3165         Register table0, Register table1, Register table2, Register table3,
3166         bool upper) {
3167   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3168   uxtb(tmp, v);
3169   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3170   ubfx(tmp, v, 8, 8);
3171   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3172   eor(crc, crc, tmp);
3173   ubfx(tmp, v, 16, 8);
3174   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3175   eor(crc, crc, tmp);
3176   ubfx(tmp, v, 24, 8);
3177   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3178   eor(crc, crc, tmp);
3179 }
3180 
3181 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3182         Register len, Register tmp0, Register tmp1, Register tmp2,
3183         Register tmp3) {
3184     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3185     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3186 
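         // CRC-32 is defined with the crc value pre- and post-inverted;
         // mvnw applies the bitwise NOT here and again at L_exit.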
3187     mvnw(crc, crc);
3188 
3189     subs(len, len, 128);
3190     br(Assembler::GE, CRC_by64_pre);
3191   BIND(CRC_less64);
3192     adds(len, len, 128-32);
3193     br(Assembler::GE, CRC_by32_loop);
3194   BIND(CRC_less32);
3195     adds(len, len, 32-4);
3196     br(Assembler::GE, CRC_by4_loop);
3197     adds(len, len, 4);
3198     br(Assembler::GT, CRC_by1_loop);
3199     b(L_exit);
3200 
3201   BIND(CRC_by32_loop);
3202     ldp(tmp0, tmp1, Address(post(buf, 16)));
3203     subs(len, len, 32);
3204     crc32x(crc, crc, tmp0);
3205     ldr(tmp2, Address(post(buf, 8)));
3206     crc32x(crc, crc, tmp1);
3207     ldr(tmp3, Address(post(buf, 8)));
3208     crc32x(crc, crc, tmp2);
3209     crc32x(crc, crc, tmp3);
3210     br(Assembler::GE, CRC_by32_loop);
3211     cmn(len, 32);
3212     br(Assembler::NE, CRC_less32);
3213     b(L_exit);
3214 
3215   BIND(CRC_by4_loop);
3216     ldrw(tmp0, Address(post(buf, 4)));
3217     subs(len, len, 4);
3218     crc32w(crc, crc, tmp0);
3219     br(Assembler::GE, CRC_by4_loop);
3220     adds(len, len, 4);
3221     br(Assembler::LE, L_exit);
3222   BIND(CRC_by1_loop);
3223     ldrb(tmp0, Address(post(buf, 1)));
3224     subs(len, len, 1);
3225     crc32b(crc, crc, tmp0);
3226     br(Assembler::GT, CRC_by1_loop);
3227     b(L_exit);
3228 
3229   BIND(CRC_by64_pre);
3230     sub(buf, buf, 8);
3231     ldp(tmp0, tmp1, Address(buf, 8));
3232     crc32x(crc, crc, tmp0);
3233     ldr(tmp2, Address(buf, 24));
3234     crc32x(crc, crc, tmp1);
3235     ldr(tmp3, Address(buf, 32));
3236     crc32x(crc, crc, tmp2);
3237     ldr(tmp0, Address(buf, 40));
3238     crc32x(crc, crc, tmp3);
3239     ldr(tmp1, Address(buf, 48));
3240     crc32x(crc, crc, tmp0);
3241     ldr(tmp2, Address(buf, 56));
3242     crc32x(crc, crc, tmp1);
3243     ldr(tmp3, Address(pre(buf, 64)));
3244 
3245     b(CRC_by64_loop);
3246 
3247     align(CodeEntryAlignment);
3248   BIND(CRC_by64_loop);
3249     subs(len, len, 64);
3250     crc32x(crc, crc, tmp2);
3251     ldr(tmp0, Address(buf, 8));
3252     crc32x(crc, crc, tmp3);
3253     ldr(tmp1, Address(buf, 16));
3254     crc32x(crc, crc, tmp0);
3255     ldr(tmp2, Address(buf, 24));
3256     crc32x(crc, crc, tmp1);
3257     ldr(tmp3, Address(buf, 32));
3258     crc32x(crc, crc, tmp2);
3259     ldr(tmp0, Address(buf, 40));
3260     crc32x(crc, crc, tmp3);
3261     ldr(tmp1, Address(buf, 48));
3262     crc32x(crc, crc, tmp0);
3263     ldr(tmp2, Address(buf, 56));
3264     crc32x(crc, crc, tmp1);
3265     ldr(tmp3, Address(pre(buf, 64)));
3266     br(Assembler::GE, CRC_by64_loop);
3267 
3268     // post-loop
3269     crc32x(crc, crc, tmp2);
3270     crc32x(crc, crc, tmp3);
3271 
3272     sub(len, len, 64);
3273     add(buf, buf, 8);
3274     cmn(len, 128);
3275     br(Assembler::NE, CRC_less64);
3276   BIND(L_exit);
3277     mvnw(crc, crc);
3278 }
3279 
3280 /**
3281  * @param crc   register containing existing CRC (32-bit)
3282  * @param buf   register pointing to input byte buffer (byte*)
3283  * @param len   register containing number of bytes
3284  * @param table0..table3 registers that will contain the CRC table addresses
3285  * @param tmp, tmp2, tmp3  scratch registers
3286  */
3287 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3288         Register table0, Register table1, Register table2, Register table3,
3289         Register tmp, Register tmp2, Register tmp3) {
3290   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3291   unsigned long offset;
3292 
3293   if (UseCRC32) {
3294       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3295       return;
3296   }
3297 
3298     mvnw(crc, crc);
3299 
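         // table0..table3 implement a slicing-by-4 CRC: update_word_crc32
         // folds one byte of each 32-bit word through each of the four
         // 256-entry tables.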
3300     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3301     if (offset) add(table0, table0, offset);
3302     add(table1, table0, 1*256*sizeof(juint));
3303     add(table2, table0, 2*256*sizeof(juint));
3304     add(table3, table0, 3*256*sizeof(juint));
3305 
3306   if (UseNeon) {
3307       cmp(len, 64);
3308       br(Assembler::LT, L_by16);
3309       eor(v16, T16B, v16, v16);
3310 
3311     Label L_fold;
3312 
3313       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3314 
3315       ld1(v0, v1, T2D, post(buf, 32));
3316       ld1r(v4, T2D, post(tmp, 8));
3317       ld1r(v5, T2D, post(tmp, 8));
3318       ld1r(v6, T2D, post(tmp, 8));
3319       ld1r(v7, T2D, post(tmp, 8));
3320       mov(v16, T4S, 0, crc);
3321 
3322       eor(v0, T16B, v0, v16);
3323       sub(len, len, 64);
3324 
3325     BIND(L_fold);
3326       pmull(v22, T8H, v0, v5, T8B);
3327       pmull(v20, T8H, v0, v7, T8B);
3328       pmull(v23, T8H, v0, v4, T8B);
3329       pmull(v21, T8H, v0, v6, T8B);
3330 
3331       pmull2(v18, T8H, v0, v5, T16B);
3332       pmull2(v16, T8H, v0, v7, T16B);
3333       pmull2(v19, T8H, v0, v4, T16B);
3334       pmull2(v17, T8H, v0, v6, T16B);
3335 
3336       uzp1(v24, T8H, v20, v22);
3337       uzp2(v25, T8H, v20, v22);
3338       eor(v20, T16B, v24, v25);
3339 
3340       uzp1(v26, T8H, v16, v18);
3341       uzp2(v27, T8H, v16, v18);
3342       eor(v16, T16B, v26, v27);
3343 
3344       ushll2(v22, T4S, v20, T8H, 8);
3345       ushll(v20, T4S, v20, T4H, 8);
3346 
3347       ushll2(v18, T4S, v16, T8H, 8);
3348       ushll(v16, T4S, v16, T4H, 8);
3349 
3350       eor(v22, T16B, v23, v22);
3351       eor(v18, T16B, v19, v18);
3352       eor(v20, T16B, v21, v20);
3353       eor(v16, T16B, v17, v16);
3354 
3355       uzp1(v17, T2D, v16, v20);
3356       uzp2(v21, T2D, v16, v20);
3357       eor(v17, T16B, v17, v21);
3358 
3359       ushll2(v20, T2D, v17, T4S, 16);
3360       ushll(v16, T2D, v17, T2S, 16);
3361 
3362       eor(v20, T16B, v20, v22);
3363       eor(v16, T16B, v16, v18);
3364 
3365       uzp1(v17, T2D, v20, v16);
3366       uzp2(v21, T2D, v20, v16);
3367       eor(v28, T16B, v17, v21);
3368 
3369       pmull(v22, T8H, v1, v5, T8B);
3370       pmull(v20, T8H, v1, v7, T8B);
3371       pmull(v23, T8H, v1, v4, T8B);
3372       pmull(v21, T8H, v1, v6, T8B);
3373 
3374       pmull2(v18, T8H, v1, v5, T16B);
3375       pmull2(v16, T8H, v1, v7, T16B);
3376       pmull2(v19, T8H, v1, v4, T16B);
3377       pmull2(v17, T8H, v1, v6, T16B);
3378 
3379       ld1(v0, v1, T2D, post(buf, 32));
3380 
3381       uzp1(v24, T8H, v20, v22);
3382       uzp2(v25, T8H, v20, v22);
3383       eor(v20, T16B, v24, v25);
3384 
3385       uzp1(v26, T8H, v16, v18);
3386       uzp2(v27, T8H, v16, v18);
3387       eor(v16, T16B, v26, v27);
3388 
3389       ushll2(v22, T4S, v20, T8H, 8);
3390       ushll(v20, T4S, v20, T4H, 8);
3391 
3392       ushll2(v18, T4S, v16, T8H, 8);
3393       ushll(v16, T4S, v16, T4H, 8);
3394 
3395       eor(v22, T16B, v23, v22);
3396       eor(v18, T16B, v19, v18);
3397       eor(v20, T16B, v21, v20);
3398       eor(v16, T16B, v17, v16);
3399 
3400       uzp1(v17, T2D, v16, v20);
3401       uzp2(v21, T2D, v16, v20);
3402       eor(v16, T16B, v17, v21);
3403 
3404       ushll2(v20, T2D, v16, T4S, 16);
3405       ushll(v16, T2D, v16, T2S, 16);
3406 
3407       eor(v20, T16B, v22, v20);
3408       eor(v16, T16B, v16, v18);
3409 
3410       uzp1(v17, T2D, v20, v16);
3411       uzp2(v21, T2D, v20, v16);
3412       eor(v20, T16B, v17, v21);
3413 
3414       shl(v16, T2D, v28, 1);
3415       shl(v17, T2D, v20, 1);
3416 
3417       eor(v0, T16B, v0, v16);
3418       eor(v1, T16B, v1, v17);
3419 
3420       subs(len, len, 32);
3421       br(Assembler::GE, L_fold);
3422 
3423       mov(crc, 0);
3424       mov(tmp, v0, T1D, 0);
3425       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3426       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3427       mov(tmp, v0, T1D, 1);
3428       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3429       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3430       mov(tmp, v1, T1D, 0);
3431       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3432       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3433       mov(tmp, v1, T1D, 1);
3434       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3435       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3436 
3437       add(len, len, 32);
3438   }
3439 
3440   BIND(L_by16);
3441     subs(len, len, 16);
3442     br(Assembler::GE, L_by16_loop);
3443     adds(len, len, 16-4);
3444     br(Assembler::GE, L_by4_loop);
3445     adds(len, len, 4);
3446     br(Assembler::GT, L_by1_loop);
3447     b(L_exit);
3448 
3449   BIND(L_by4_loop);
3450     ldrw(tmp, Address(post(buf, 4)));
3451     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3452     subs(len, len, 4);
3453     br(Assembler::GE, L_by4_loop);
3454     adds(len, len, 4);
3455     br(Assembler::LE, L_exit);
3456   BIND(L_by1_loop);
3457     subs(len, len, 1);
3458     ldrb(tmp, Address(post(buf, 1)));
3459     update_byte_crc32(crc, tmp, table0);
3460     br(Assembler::GT, L_by1_loop);
3461     b(L_exit);
3462 
3463     align(CodeEntryAlignment);
3464   BIND(L_by16_loop);
3465     subs(len, len, 16);
3466     ldp(tmp, tmp3, Address(post(buf, 16)));
3467     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3468     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3469     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3470     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3471     br(Assembler::GE, L_by16_loop);
3472     adds(len, len, 16-4);
3473     br(Assembler::GE, L_by4_loop);
3474     adds(len, len, 4);
3475     br(Assembler::GT, L_by1_loop);
3476   BIND(L_exit);
3477     mvnw(crc, crc);
3478 }
3479 
3480 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3481         Register len, Register tmp0, Register tmp1, Register tmp2,
3482         Register tmp3) {
3483     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3484     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3485 
3486     subs(len, len, 128);
3487     br(Assembler::GE, CRC_by64_pre);
3488   BIND(CRC_less64);
3489     adds(len, len, 128-32);
3490     br(Assembler::GE, CRC_by32_loop);
3491   BIND(CRC_less32);
3492     adds(len, len, 32-4);
3493     br(Assembler::GE, CRC_by4_loop);
3494     adds(len, len, 4);
3495     br(Assembler::GT, CRC_by1_loop);
3496     b(L_exit);
3497 
3498   BIND(CRC_by32_loop);
3499     ldp(tmp0, tmp1, Address(post(buf, 16)));
3500     subs(len, len, 32);
3501     crc32cx(crc, crc, tmp0);
3502     ldr(tmp2, Address(post(buf, 8)));
3503     crc32cx(crc, crc, tmp1);
3504     ldr(tmp3, Address(post(buf, 8)));
3505     crc32cx(crc, crc, tmp2);
3506     crc32cx(crc, crc, tmp3);
3507     br(Assembler::GE, CRC_by32_loop);
3508     cmn(len, 32);
3509     br(Assembler::NE, CRC_less32);
3510     b(L_exit);
3511 
3512   BIND(CRC_by4_loop);
3513     ldrw(tmp0, Address(post(buf, 4)));
3514     subs(len, len, 4);
3515     crc32cw(crc, crc, tmp0);
3516     br(Assembler::GE, CRC_by4_loop);
3517     adds(len, len, 4);
3518     br(Assembler::LE, L_exit);
3519   BIND(CRC_by1_loop);
3520     ldrb(tmp0, Address(post(buf, 1)));
3521     subs(len, len, 1);
3522     crc32cb(crc, crc, tmp0);
3523     br(Assembler::GT, CRC_by1_loop);
3524     b(L_exit);
3525 
3526   BIND(CRC_by64_pre);
3527     sub(buf, buf, 8);
3528     ldp(tmp0, tmp1, Address(buf, 8));
3529     crc32cx(crc, crc, tmp0);
3530     ldr(tmp2, Address(buf, 24));
3531     crc32cx(crc, crc, tmp1);
3532     ldr(tmp3, Address(buf, 32));
3533     crc32cx(crc, crc, tmp2);
3534     ldr(tmp0, Address(buf, 40));
3535     crc32cx(crc, crc, tmp3);
3536     ldr(tmp1, Address(buf, 48));
3537     crc32cx(crc, crc, tmp0);
3538     ldr(tmp2, Address(buf, 56));
3539     crc32cx(crc, crc, tmp1);
3540     ldr(tmp3, Address(pre(buf, 64)));
3541 
3542     b(CRC_by64_loop);
3543 
3544     align(CodeEntryAlignment);
3545   BIND(CRC_by64_loop);
3546     subs(len, len, 64);
3547     crc32cx(crc, crc, tmp2);
3548     ldr(tmp0, Address(buf, 8));
3549     crc32cx(crc, crc, tmp3);
3550     ldr(tmp1, Address(buf, 16));
3551     crc32cx(crc, crc, tmp0);
3552     ldr(tmp2, Address(buf, 24));
3553     crc32cx(crc, crc, tmp1);
3554     ldr(tmp3, Address(buf, 32));
3555     crc32cx(crc, crc, tmp2);
3556     ldr(tmp0, Address(buf, 40));
3557     crc32cx(crc, crc, tmp3);
3558     ldr(tmp1, Address(buf, 48));
3559     crc32cx(crc, crc, tmp0);
3560     ldr(tmp2, Address(buf, 56));
3561     crc32cx(crc, crc, tmp1);
3562     ldr(tmp3, Address(pre(buf, 64)));
3563     br(Assembler::GE, CRC_by64_loop);
3564 
3565     // post-loop
3566     crc32cx(crc, crc, tmp2);
3567     crc32cx(crc, crc, tmp3);
3568 
3569     sub(len, len, 64);
3570     add(buf, buf, 8);
3571     cmn(len, 128);
3572     br(Assembler::NE, CRC_less64);
3573   BIND(L_exit);
3574 }
3575 
3576 /**
3577  * @param crc   register containing existing CRC (32-bit)
3578  * @param buf   register pointing to input byte buffer (byte*)
3579  * @param len   register containing number of bytes
3580  * @param table0..table3 registers for the CRC tables (used as scratch here)
3581  * @param tmp, tmp2, tmp3  scratch registers
3582  */
3583 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3584         Register table0, Register table1, Register table2, Register table3,
3585         Register tmp, Register tmp2, Register tmp3) {
3586   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3587 }
3588 
3589 
3590 SkipIfEqual::SkipIfEqual(
3591     MacroAssembler* masm, const bool* flag_addr, bool value) {
3592   _masm = masm;
3593   unsigned long offset;
3594   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3595   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3596   _masm->cbzw(rscratch1, _label);
3597 }
3598 
3599 SkipIfEqual::~SkipIfEqual() {
3600   _masm->bind(_label);
3601 }
3602 
3603 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3604   Address adr;
3605   switch(dst.getMode()) {
3606   case Address::base_plus_offset:
3607     // This is the expected mode, although we allow all the other
3608     // forms below.
3609     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3610     break;
3611   default:
3612     lea(rscratch2, dst);
3613     adr = Address(rscratch2);
3614     break;
3615   }
3616   ldr(rscratch1, adr);
3617   add(rscratch1, rscratch1, src);
3618   str(rscratch1, adr);
3619 }
3620 
3621 void MacroAssembler::cmpptr(Register src1, Address src2) {
3622   unsigned long offset;
3623   adrp(rscratch1, src2, offset);
3624   ldr(rscratch1, Address(rscratch1, offset));
3625   cmp(src1, rscratch1);
3626 }
3627 
3628 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3629   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3630   bs->obj_equals(this, obj1, obj2);
3631 }
3632 
3633 void MacroAssembler::load_klass(Register dst, Register src) {
3634   if (UseCompressedClassPointers) {
3635     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3636     decode_klass_not_null(dst);
3637   } else {
3638     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3639   }
3640 }
3641 
3642 // ((OopHandle)result).resolve();
3643 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3644   // OopHandle::resolve is an indirection.
3645   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3646 }
3647 
3648 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3649   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3650   ldr(dst, Address(rmethod, Method::const_offset()));
3651   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3652   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3653   ldr(dst, Address(dst, mirror_offset));
3654   resolve_oop_handle(dst, tmp);
3655 }
3656 
3657 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3658   if (UseCompressedClassPointers) {
3659     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3660     if (Universe::narrow_klass_base() == NULL) {
3661       cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
3662       return;
3663     } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3664                && Universe::narrow_klass_shift() == 0) {
3665       // Only the bottom 32 bits matter
3666       cmpw(trial_klass, tmp);
3667       return;
3668     }
3669     decode_klass_not_null(tmp);
3670   } else {
3671     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3672   }
3673   cmp(trial_klass, tmp);
3674 }
3675 
3676 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3677   load_klass(dst, src);
3678   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3679 }
3680 
3681 void MacroAssembler::store_klass(Register dst, Register src) {
3682   // FIXME: Should this be a store release?  Concurrent GCs assume the
3683   // klass length is valid if the klass field is not null.
3684   if (UseCompressedClassPointers) {
3685     encode_klass_not_null(src);
3686     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3687   } else {
3688     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3689   }
3690 }
3691 
3692 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3693   if (UseCompressedClassPointers) {
3694     // Store to klass gap in destination
3695     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3696   }
3697 }
3698 
3699 // Algorithm must match CompressedOops::encode.
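     // In effect (sketch): d = (s == NULL) ? 0 : (s - heap_base) >> shift,
     // with the simpler cases below when the base is NULL or the shift is 0.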
3700 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3701 #ifdef ASSERT
3702   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3703 #endif
3704   verify_oop(s, "broken oop in encode_heap_oop");
3705   if (Universe::narrow_oop_base() == NULL) {
3706     if (Universe::narrow_oop_shift() != 0) {
3707       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3708       lsr(d, s, LogMinObjAlignmentInBytes);
3709     } else {
3710       mov(d, s);
3711     }
3712   } else {
3713     subs(d, s, rheapbase);
3714     csel(d, d, zr, Assembler::HS);
3715     lsr(d, d, LogMinObjAlignmentInBytes);
3716 
3717     /*  Old algorithm: is this any worse?
3718     Label nonnull;
3719     cbnz(r, nonnull);
3720     sub(r, r, rheapbase);
3721     bind(nonnull);
3722     lsr(r, r, LogMinObjAlignmentInBytes);
3723     */
3724   }
3725 }
3726 
3727 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3728 #ifdef ASSERT
3729   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3730   if (CheckCompressedOops) {
3731     Label ok;
3732     cbnz(r, ok);
3733     stop("null oop passed to encode_heap_oop_not_null");
3734     bind(ok);
3735   }
3736 #endif
3737   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3738   if (Universe::narrow_oop_base() != NULL) {
3739     sub(r, r, rheapbase);
3740   }
3741   if (Universe::narrow_oop_shift() != 0) {
3742     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3743     lsr(r, r, LogMinObjAlignmentInBytes);
3744   }
3745 }
3746 
3747 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3748 #ifdef ASSERT
3749   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3750   if (CheckCompressedOops) {
3751     Label ok;
3752     cbnz(src, ok);
3753     stop("null oop passed to encode_heap_oop_not_null2");
3754     bind(ok);
3755   }
3756 #endif
3757   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3758 
3759   Register data = src;
3760   if (Universe::narrow_oop_base() != NULL) {
3761     sub(dst, src, rheapbase);
3762     data = dst;
3763   }
3764   if (Universe::narrow_oop_shift() != 0) {
3765     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3766     lsr(dst, data, LogMinObjAlignmentInBytes);
3767     data = dst;
3768   }
3769   if (data == src)
3770     mov(dst, src);
3771 }
3772 
3773 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3774 #ifdef ASSERT
3775   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3776 #endif
3777   if (Universe::narrow_oop_base() == NULL) {
3778     if (Universe::narrow_oop_shift() != 0 || d != s) {
3779       lsl(d, s, Universe::narrow_oop_shift());
3780     }
3781   } else {
3782     Label done;
3783     if (d != s)
3784       mov(d, s);
3785     cbz(s, done);
3786     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3787     bind(done);
3788   }
3789   verify_oop(d, "broken oop in decode_heap_oop");
3790 }
3791 
3792 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3793   assert (UseCompressedOops, "should only be used for compressed headers");
3794   assert (Universe::heap() != NULL, "java heap should be initialized");
3795   // Cannot assert, unverified entry point counts instructions (see .ad file)
3796   // vtableStubs also counts instructions in pd_code_size_limit.
3797   // Also do not verify_oop as this is called by verify_oop.
3798   if (Universe::narrow_oop_shift() != 0) {
3799     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3800     if (Universe::narrow_oop_base() != NULL) {
3801       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3802     } else {
3803       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3804     }
3805   } else {
3806     assert (Universe::narrow_oop_base() == NULL, "sanity");
3807   }
3808 }
3809 
3810 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3811   assert (UseCompressedOops, "should only be used for compressed headers");
3812   assert (Universe::heap() != NULL, "java heap should be initialized");
3813   // Cannot assert, unverified entry point counts instructions (see .ad file)
3814   // vtableStubs also counts instructions in pd_code_size_limit.
3815   // Also do not verify_oop as this is called by verify_oop.
3816   if (Universe::narrow_oop_shift() != 0) {
3817     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3818     if (Universe::narrow_oop_base() != NULL) {
3819       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3820     } else {
3821       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3822     }
3823   } else {
3824     assert (Universe::narrow_oop_base() == NULL, "sanity");
3825     if (dst != src) {
3826       mov(dst, src);
3827     }
3828   }
3829 }
3830 
3831 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3832   if (Universe::narrow_klass_base() == NULL) {
3833     if (Universe::narrow_klass_shift() != 0) {
3834       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3835       lsr(dst, src, LogKlassAlignmentInBytes);
3836     } else {
3837       if (dst != src) mov(dst, src);
3838     }
3839     return;
3840   }
3841 
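       // The XOR trick: when the klass base shares no bits with any offset
       // into the class space, subtracting the base is equivalent to XORing
       // it, and the base can be folded into an eor immediate (no scratch
       // register needed).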
3842   if (use_XOR_for_compressed_class_base) {
3843     if (Universe::narrow_klass_shift() != 0) {
3844       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3845       lsr(dst, dst, LogKlassAlignmentInBytes);
3846     } else {
3847       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3848     }
3849     return;
3850   }
3851 
3852   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3853       && Universe::narrow_klass_shift() == 0) {
3854     movw(dst, src);
3855     return;
3856   }
3857 
3858 #ifdef ASSERT
3859   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3860 #endif
3861 
3862   Register rbase = dst;
3863   if (dst == src) rbase = rheapbase;
3864   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3865   sub(dst, src, rbase);
3866   if (Universe::narrow_klass_shift() != 0) {
3867     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3868     lsr(dst, dst, LogKlassAlignmentInBytes);
3869   }
3870   if (dst == src) reinit_heapbase();
3871 }
3872 
3873 void MacroAssembler::encode_klass_not_null(Register r) {
3874   encode_klass_not_null(r, r);
3875 }
3876 
3877 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3878   Register rbase = dst;
3879   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3880 
3881   if (Universe::narrow_klass_base() == NULL) {
3882     if (Universe::narrow_klass_shift() != 0) {
3883       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3884       lsl(dst, src, LogKlassAlignmentInBytes);
3885     } else {
3886       if (dst != src) mov(dst, src);
3887     }
3888     return;
3889   }
3890 
3891   if (use_XOR_for_compressed_class_base) {
3892     if (Universe::narrow_klass_shift() != 0) {
3893       lsl(dst, src, LogKlassAlignmentInBytes);
3894       eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
3895     } else {
3896       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3897     }
3898     return;
3899   }
3900 
3901   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3902       && Universe::narrow_klass_shift() == 0) {
3903     if (dst != src)
3904       movw(dst, src);
3905     movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
3906     return;
3907   }
3908 
3909   // Cannot assert, unverified entry point counts instructions (see .ad file)
3910   // vtableStubs also counts instructions in pd_code_size_limit.
3911   // Also do not verify_oop as this is called by verify_oop.
3912   if (dst == src) rbase = rheapbase;
3913   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3914   if (Universe::narrow_klass_shift() != 0) {
3915     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3916     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
3917   } else {
3918     add(dst, rbase, src);
3919   }
3920   if (dst == src) reinit_heapbase();
3921 }
3922 
3923 void  MacroAssembler::decode_klass_not_null(Register r) {
3924   decode_klass_not_null(r, r);
3925 }
3926 
3927 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3928 #ifdef ASSERT
3929   {
3930     ThreadInVMfromUnknown tiv;
3931     assert (UseCompressedOops, "should only be used for compressed oops");
3932     assert (Universe::heap() != NULL, "java heap should be initialized");
3933     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3934     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3935   }
3936 #endif
3937   int oop_index = oop_recorder()->find_index(obj);
3938   InstructionMark im(this);
3939   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3940   code_section()->relocate(inst_mark(), rspec);
3941   movz(dst, 0xDEAD, 16);
3942   movk(dst, 0xBEEF);
3943 }
3944 
3945 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3946   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3947   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3948   int index = oop_recorder()->find_index(k);
3949   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
3950 
3951   InstructionMark im(this);
3952   RelocationHolder rspec = metadata_Relocation::spec(index);
3953   code_section()->relocate(inst_mark(), rspec);
3954   narrowKlass nk = Klass::encode_klass(k);
3955   movz(dst, (nk >> 16), 16);
3956   movk(dst, nk & 0xffff);
3957 }
3958 
3959 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
3960                                     Register dst, Address src,
3961                                     Register tmp1, Register thread_tmp) {
3962   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3963   decorators = AccessInternal::decorator_fixup(decorators);
3964   bool as_raw = (decorators & AS_RAW) != 0;
3965   if (as_raw) {
3966     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3967   } else {
3968     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3969   }
3970 }
3971 
3972 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
3973                                      Address dst, Register src,
3974                                      Register tmp1, Register thread_tmp) {
3975   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3976   decorators = AccessInternal::decorator_fixup(decorators);
3977   bool as_raw = (decorators & AS_RAW) != 0;
3978   if (as_raw) {
3979     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3980   } else {
3981     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3982   }
3983 }
3984 
3985 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
3986   if ((decorators & ACCESS_READ) == 0) {
3987     decorators |= ACCESS_WRITE;
3988   }
3989   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3990   return bs->resolve(this, decorators, obj);
3991 }
3992 
3993 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
3994                                    Register thread_tmp, DecoratorSet decorators) {
3995   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
3996 }
3997 
3998 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
3999                                             Register thread_tmp, DecoratorSet decorators) {
4000   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4001 }
4002 
4003 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4004                                     Register thread_tmp, DecoratorSet decorators) {
4005   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4006 }
4007 
4008 // Used for storing NULLs.
4009 void MacroAssembler::store_heap_oop_null(Address dst) {
4010   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4011 }
4012 
4013 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4014   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4015   int index = oop_recorder()->allocate_metadata_index(obj);
4016   RelocationHolder rspec = metadata_Relocation::spec(index);
4017   return Address((address)obj, rspec);
4018 }
4019 
4020 // Move an oop into a register.  immediate is true if we want
4021 // immediate instructions, i.e. we are not going to patch this
4022 // instruction while the code is being executed by another thread.  In
4023 // that case we can use move immediates rather than the constant pool.
4024 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4025   int oop_index;
4026   if (obj == NULL) {
4027     oop_index = oop_recorder()->allocate_oop_index(obj);
4028   } else {
4029 #ifdef ASSERT
4030     {
4031       ThreadInVMfromUnknown tiv;
4032       assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
4033     }
4034 #endif
4035     oop_index = oop_recorder()->find_index(obj);
4036   }
4037   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4038   if (! immediate) {
4039     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4040     ldr_constant(dst, Address(dummy, rspec));
4041   } else
4042     mov(dst, Address((address)obj, rspec));
4043 }
4044 
4045 // Move a metadata address into a register.
4046 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4047   int oop_index;
4048   if (obj == NULL) {
4049     oop_index = oop_recorder()->allocate_metadata_index(obj);
4050   } else {
4051     oop_index = oop_recorder()->find_index(obj);
4052   }
4053   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4054   mov(dst, Address((address)obj, rspec));
4055 }
4056 
4057 Address MacroAssembler::constant_oop_address(jobject obj) {
4058 #ifdef ASSERT
4059   {
4060     ThreadInVMfromUnknown tiv;
4061     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4062     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
4063   }
4064 #endif
4065   int oop_index = oop_recorder()->find_index(obj);
4066   return Address((address)obj, oop_Relocation::spec(oop_index));
4067 }
4068 
4069 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4070 void MacroAssembler::tlab_allocate(Register obj,
4071                                    Register var_size_in_bytes,
4072                                    int con_size_in_bytes,
4073                                    Register t1,
4074                                    Register t2,
4075                                    Label& slow_case) {
4076   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4077   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4078 }
4079 
4080 // Defines obj, preserves var_size_in_bytes
4081 void MacroAssembler::eden_allocate(Register obj,
4082                                    Register var_size_in_bytes,
4083                                    int con_size_in_bytes,
4084                                    Register t1,
4085                                    Label& slow_case) {
4086   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4087   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4088 }
4089 
4090 // Zero words; len is in bytes
4091 // Destroys all registers except addr
4092 // len must be a nonzero multiple of wordSize
4093 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4094   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4095 
4096 #ifdef ASSERT
4097   { Label L;
4098     tst(len, BytesPerWord - 1);
4099     br(Assembler::EQ, L);
4100     stop("len is not a multiple of BytesPerWord");
4101     bind(L);
4102   }
4103 #endif
4104 
4105 #ifndef PRODUCT
4106   block_comment("zero memory");
4107 #endif
4108 
4109   Label loop;
4110   Label entry;
4111 
4112 //  Algorithm:
4113 //
4114 //    scratch1 = cnt & 7;
4115 //    cnt -= scratch1;
4116 //    p += scratch1;
4117 //    switch (scratch1) {
4118 //      do {
4119 //        cnt -= 8;
4120 //          p[-8] = 0;
4121 //        case 7:
4122 //          p[-7] = 0;
4123 //        case 6:
4124 //          p[-6] = 0;
4125 //          // ...
4126 //        case 1:
4127 //          p[-1] = 0;
4128 //        case 0:
4129 //          p += 8;
4130 //      } while (cnt);
4131 //    }
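     //
     // This is a Duff's-device-style unrolled loop: the computed branch below
     // (adr/sub/br) jumps into the middle of the unrolled block of str
     // instructions, so the cnt % 8 remainder is zeroed first.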
4132 
4133   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4134 
4135   lsr(len, len, LogBytesPerWord);
4136   andr(rscratch1, len, unroll - 1);  // tmp1 = cnt % unroll
4137   sub(len, len, rscratch1);      // cnt -= unroll
4138   // t1 always points to the end of the region we're about to zero
4139   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
4140   adr(rscratch2, entry);
4141   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4142   br(rscratch2);
4143   bind(loop);
4144   sub(len, len, unroll);
4145   for (int i = -unroll; i < 0; i++)
4146     Assembler::str(zr, Address(t1, i * wordSize));
4147   bind(entry);
4148   add(t1, t1, unroll * wordSize);
4149   cbnz(len, loop);
4150 }
4151 
4152 void MacroAssembler::verify_tlab() {
4153 #ifdef ASSERT
4154   if (UseTLAB && VerifyOops) {
4155     Label next, ok;
4156 
4157     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4158 
4159     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4160     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4161     cmp(rscratch2, rscratch1);
4162     br(Assembler::HS, next);
4163     STOP("assert(top >= start)");
4164     should_not_reach_here();
4165 
4166     bind(next);
4167     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4168     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4169     cmp(rscratch2, rscratch1);
4170     br(Assembler::HS, ok);
4171     STOP("assert(top <= end)");
4172     should_not_reach_here();
4173 
4174     bind(ok);
4175     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4176   }
4177 #endif
4178 }
4179 
4180 // Writes to successive stack pages until the given offset is reached, to
4181 // check for stack overflow plus shadow pages.  This clobbers tmp.
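     // Roughly (C-like sketch of the first loop; illustrative only):
     //   for (char* p = sp; size > 0; size -= page_size) {
     //     p -= page_size;
     //     *(int*)p = size;   // touch one word per page
     //   }
     // The shadow pages are then banged the same way below.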
4182 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4183   assert_different_registers(tmp, size, rscratch1);
4184   mov(tmp, sp);
4185   // Bang stack for total size given plus shadow page size.
4186   // Bang one page at a time because large size can bang beyond yellow and
4187   // red zones.
4188   Label loop;
4189   mov(rscratch1, os::vm_page_size());
4190   bind(loop);
4191   lea(tmp, Address(tmp, -os::vm_page_size()));
4192   subsw(size, size, rscratch1);
4193   str(size, Address(tmp));
4194   br(Assembler::GT, loop);
4195 
4196   // Bang down shadow pages too.
4197   // At this point, (tmp-0) is the last address touched, so don't
4198   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4199   // was post-decremented.)  Skip this address by starting at i=1, and
4200   // touch a few more pages below.  N.B.  It is important to touch all
4201   // the way down to and including i=StackShadowPages.
4202   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
4203     // This could be any sized move, but since it can serve as a debugging
4204     // crumb, the bigger the better.
4205     lea(tmp, Address(tmp, -os::vm_page_size()));
4206     str(size, Address(tmp));
4207   }
4208 }
4209 
4210 
4211 // Move the address of the polling page into dest.
4212 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4213   if (SafepointMechanism::uses_thread_local_poll()) {
4214     ldr(dest, Address(rthread, Thread::polling_page_offset()));
4215   } else {
4216     unsigned long off;
4217     adrp(dest, Address(page, rtype), off);
4218     assert(off == 0, "polling page must be page aligned");
4219   }
4220 }
4221 
4222 // Move the address of the polling page into r, then read the polling
4223 // page.
4224 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4225   get_polling_page(r, page, rtype);
4226   return read_polling_page(r, rtype);
4227 }
4228 
4229 // Read the polling page.  The address of the polling page must
4230 // already be in r.
4231 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4232   InstructionMark im(this);
4233   code_section()->relocate(inst_mark(), rtype);
4234   ldrw(zr, Address(r, 0));
4235   return inst_mark();
4236 }
4237 
4238 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4239   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4240   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4241   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4242   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4243   long offset_low = dest_page - low_page;
4244   long offset_high = dest_page - high_page;
4245 
4246   assert(is_valid_AArch64_address(dest.target()), "bad address");
4247   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4248 
4249   InstructionMark im(this);
4250   code_section()->relocate(inst_mark(), dest.rspec());
4251   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4252   // the code cache, so that if it is relocated we know it will still reach the target
4253   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4254     _adrp(reg1, dest.target());
4255   } else {
4256     unsigned long target = (unsigned long)dest.target();
4257     unsigned long adrp_target
4258       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4259 
4260     _adrp(reg1, (address)adrp_target);
4261     movk(reg1, target >> 32, 32);
4262   }
4263   byte_offset = (unsigned long)dest.target() & 0xfff;
4264 }
4265 
4266 void MacroAssembler::load_byte_map_base(Register reg) {
4267   jbyte *byte_map_base =
4268     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4269 
4270   if (is_valid_AArch64_address((address)byte_map_base)) {
4271     // Strictly speaking the byte_map_base isn't an address at all,
4272     // and it might even be negative.
4273     unsigned long offset;
4274     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4275     // We expect offset to be zero with most collectors.
4276     if (offset != 0) {
4277       add(reg, reg, offset);
4278     }
4279   } else {
4280     mov(reg, (uint64_t)byte_map_base);
4281   }
4282 }
4283 
4284 void MacroAssembler::build_frame(int framesize) {
4285   assert(framesize > 0, "framesize must be > 0");
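  // The two size thresholds below follow the AArch64 immediate
  // encodings: an stp scaled offset must stay below 1 << 9 bytes, and
  // an add/sub immediate is limited to 12 bits, so frames too large
  // for either encoding build the offset in rscratch1 first.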
4286   if (framesize < ((1 << 9) + 2 * wordSize)) {
4287     sub(sp, sp, framesize);
4288     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4289     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4290   } else {
4291     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4292     if (PreserveFramePointer) mov(rfp, sp);
4293     if (framesize < ((1 << 12) + 2 * wordSize))
4294       sub(sp, sp, framesize - 2 * wordSize);
4295     else {
4296       mov(rscratch1, framesize - 2 * wordSize);
4297       sub(sp, sp, rscratch1);
4298     }
4299   }
4300 }
4301 
4302 void MacroAssembler::remove_frame(int framesize) {
4303   assert(framesize > 0, "framesize must be > 0");
4304   if (framesize < ((1 << 9) + 2 * wordSize)) {
4305     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4306     add(sp, sp, framesize);
4307   } else {
4308     if (framesize < ((1 << 12) + 2 * wordSize))
4309       add(sp, sp, framesize - 2 * wordSize);
4310     else {
4311       mov(rscratch1, framesize - 2 * wordSize);
4312       add(sp, sp, rscratch1);
4313     }
4314     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4315   }
4316 }
4317 
4318 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4319 
4320 // Search for str1 in str2 and return index or -1
4321 void MacroAssembler::string_indexof(Register str2, Register str1,
4322                                     Register cnt2, Register cnt1,
4323                                     Register tmp1, Register tmp2,
4324                                     Register tmp3, Register tmp4,
4325                                     Register tmp5, Register tmp6,
4326                                     int icnt1, Register result, int ae) {
4327   // NOTE: tmp5, tmp6 can be zr depending on specific method version
4328   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4329 
4330   Register ch1 = rscratch1;
4331   Register ch2 = rscratch2;
4332   Register cnt1tmp = tmp1;
4333   Register cnt2tmp = tmp2;
4334   Register cnt1_neg = cnt1;
4335   Register cnt2_neg = cnt2;
4336   Register result_tmp = tmp4;
4337 
4338   bool isL = ae == StrIntrinsicNode::LL;
4339 
4340   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4341   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4342   int str1_chr_shift = str1_isL ? 0:1;
4343   int str2_chr_shift = str2_isL ? 0:1;
4344   int str1_chr_size = str1_isL ? 1:2;
4345   int str2_chr_size = str2_isL ? 1:2;
4346   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4347                                       (chr_insn)&MacroAssembler::ldrh;
4348   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4349                                       (chr_insn)&MacroAssembler::ldrh;
4350   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4351   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4352 
4353   // Note, inline_string_indexOf() generates checks:
4354   // if (substr.count > string.count) return -1;
4355   // if (substr.count == 0) return 0;
4356 
4357   // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the first occurrence of pattern in source or return -1.
4359 
  // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
  // With a small pattern and source we use a linear scan.
4362 
4363   if (icnt1 == -1) {
4364     sub(result_tmp, cnt2, cnt1);
4365     cmp(cnt1, 8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4366     br(LT, LINEARSEARCH);
4367     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
4368     cmp(cnt1, 256);
4369     lsr(tmp1, cnt2, 2);
4370     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
4371     br(GE, LINEARSTUB);
4372   }
4373 
// The Boyer-Moore algorithm is based on the description here:-
4375 //
4376 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4377 //
// This describes an algorithm with 2 shift rules: the 'Bad Character' rule
4379 // and the 'Good Suffix' rule.
4380 //
4381 // These rules are essentially heuristics for how far we can shift the
4382 // pattern along the search string.
4383 //
4384 // The implementation here uses the 'Bad Character' rule only because of the
4385 // complexity of initialisation for the 'Good Suffix' rule.
4386 //
4387 // This is also known as the Boyer-Moore-Horspool algorithm:-
4388 //
4389 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4390 //
// This particular implementation has a few Java-specific optimizations.
4392 //
4393 // #define ASIZE 256
4394 //
4395 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4396 //       int i, j;
4397 //       unsigned c;
4398 //       unsigned char bc[ASIZE];
4399 //
4400 //       /* Preprocessing */
4401 //       for (i = 0; i < ASIZE; ++i)
4402 //          bc[i] = m;
4403 //       for (i = 0; i < m - 1; ) {
4404 //          c = x[i];
4405 //          ++i;
4406 //          // c < 256 for Latin1 string, so, no need for branch
4407 //          #ifdef PATTERN_STRING_IS_LATIN1
4408 //          bc[c] = m - i;
4409 //          #else
4410 //          if (c < ASIZE) bc[c] = m - i;
4411 //          #endif
4412 //       }
4413 //
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[j+m-1];
//          if (x[m-1] == c) {
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//            if (i < 0) return j;
//          }
//          // c < 256 for a Latin1 string, so no need for a branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) is always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifdef PATTERN_STRING_IS_UTF
//          // UU case: need if (c < ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1;
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c < ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m;
//          #endif
//       }
//       return -1;
//    }
4442 
4443   if (icnt1 == -1) {
4444     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4445         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4446     Register cnt1end = tmp2;
4447     Register str2end = cnt2;
4448     Register skipch = tmp2;
4449 
    // str1 length is >= 8, so we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU chars) and half a
    // register for the UL case. We'll re-read the last character in the inner
    // pre-loop code to have a single outer pre-loop load.
4454     const int firstStep = isL ? 7 : 3;
4455 
4456     const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
4458     sub(sp, sp, ASIZE);
4459     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4460     mov(ch1, sp);
4461     BIND(BM_INIT_LOOP);
4462       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4463       subs(tmp5, tmp5, 1);
4464       br(GT, BM_INIT_LOOP);
4465 
4466       sub(cnt1tmp, cnt1, 1);
4467       mov(tmp5, str2);
4468       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4469       sub(ch2, cnt1, 1);
4470       mov(tmp3, str1);
4471     BIND(BCLOOP);
4472       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4473       if (!str1_isL) {
4474         cmp(ch1, ASIZE);
4475         br(HS, BCSKIP);
4476       }
4477       strb(ch2, Address(sp, ch1));
4478     BIND(BCSKIP);
4479       subs(ch2, ch2, 1);
4480       br(GT, BCLOOP);
4481 
4482       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4483       if (str1_isL == str2_isL) {
4484         // load last 8 bytes (8LL/4UU symbols)
4485         ldr(tmp6, Address(tmp6, -wordSize));
4486       } else {
4487         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
4488         // convert Latin1 to UTF. We'll have to wait until load completed, but
4489         // it's still faster than per-character loads+checks
4490         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4491         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4492         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4493         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4494         orr(ch2, ch1, ch2, LSL, 16);
4495         orr(tmp6, tmp6, tmp3, LSL, 48);
4496         orr(tmp6, tmp6, ch2, LSL, 16);
4497       }
4498     BIND(BMLOOPSTR2);
4499       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4500       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4501       if (str1_isL == str2_isL) {
        // Re-init tmp3. This is effectively free because it executes in
        // parallel with the load above. The alternative is to initialize it
        // before the loop, but that would hurt performance on in-order
        // systems with 2 or more ld/st pipelines.
4505         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4506       }
4507       if (!isL) { // UU/UL case
4508         lsl(ch2, cnt1tmp, 1); // offset in bytes
4509       }
4510       cmp(tmp3, skipch);
4511       br(NE, BMSKIP);
4512       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4513       mov(ch1, tmp6);
4514       if (isL) {
4515         b(BMLOOPSTR1_AFTER_LOAD);
4516       } else {
4517         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4518         b(BMLOOPSTR1_CMP);
4519       }
4520     BIND(BMLOOPSTR1);
4521       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4522       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4523     BIND(BMLOOPSTR1_AFTER_LOAD);
4524       subs(cnt1tmp, cnt1tmp, 1);
4525       br(LT, BMLOOPSTR1_LASTCMP);
4526     BIND(BMLOOPSTR1_CMP);
4527       cmp(ch1, ch2);
4528       br(EQ, BMLOOPSTR1);
4529     BIND(BMSKIP);
4530       if (!isL) {
        // If we've met a UTF symbol while searching for a Latin1 pattern,
        // then we can skip cnt1 symbols.
4533         if (str1_isL != str2_isL) {
4534           mov(result_tmp, cnt1);
4535         } else {
4536           mov(result_tmp, 1);
4537         }
4538         cmp(skipch, ASIZE);
4539         br(HS, BMADV);
4540       }
4541       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4542     BIND(BMADV);
4543       sub(cnt1tmp, cnt1, 1);
4544       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4545       cmp(str2, str2end);
4546       br(LE, BMLOOPSTR2);
4547       add(sp, sp, ASIZE);
4548       b(NOMATCH);
4549     BIND(BMLOOPSTR1_LASTCMP);
4550       cmp(ch1, ch2);
4551       br(NE, BMSKIP);
4552     BIND(BMMATCH);
4553       sub(result, str2, tmp5);
4554       if (!str2_isL) lsr(result, result, 1);
4555       add(sp, sp, ASIZE);
4556       b(DONE);
4557 
4558     BIND(LINEARSTUB);
4559     cmp(cnt1, 16); // small patterns still should be handled by simple algorithm
4560     br(LT, LINEAR_MEDIUM);
4561     mov(result, zr);
4562     RuntimeAddress stub = NULL;
4563     if (isL) {
4564       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4565       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4566     } else if (str1_isL) {
4567       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4569     } else {
4570       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4571       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4572     }
4573     trampoline_call(stub);
4574     b(DONE);
4575   }
4576 
4577   BIND(LINEARSEARCH);
4578   {
4579     Label DO1, DO2, DO3;
4580 
4581     Register str2tmp = tmp2;
4582     Register first = tmp3;
4583 
4584     if (icnt1 == -1)
4585     {
4586         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4587 
4588         cmp(cnt1, str1_isL == str2_isL ? 4 : 2);
4589         br(LT, DOSHORT);
4590       BIND(LINEAR_MEDIUM);
4591         (this->*str1_load_1chr)(first, Address(str1));
4592         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4593         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4594         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4595         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4596 
4597       BIND(FIRST_LOOP);
4598         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4599         cmp(first, ch2);
4600         br(EQ, STR1_LOOP);
4601       BIND(STR2_NEXT);
4602         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4603         br(LE, FIRST_LOOP);
4604         b(NOMATCH);
4605 
4606       BIND(STR1_LOOP);
4607         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4608         add(cnt2tmp, cnt2_neg, str2_chr_size);
4609         br(GE, MATCH);
4610 
4611       BIND(STR1_NEXT);
4612         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4613         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4614         cmp(ch1, ch2);
4615         br(NE, STR2_NEXT);
4616         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4617         add(cnt2tmp, cnt2tmp, str2_chr_size);
4618         br(LT, STR1_NEXT);
4619         b(MATCH);
4620 
4621       BIND(DOSHORT);
4622       if (str1_isL == str2_isL) {
4623         cmp(cnt1, 2);
4624         br(LT, DO1);
4625         br(GT, DO3);
4626       }
4627     }
4628 
4629     if (icnt1 == 4) {
4630       Label CH1_LOOP;
4631 
4632         (this->*load_4chr)(ch1, str1);
4633         sub(result_tmp, cnt2, 4);
4634         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4635         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4636 
4637       BIND(CH1_LOOP);
4638         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4639         cmp(ch1, ch2);
4640         br(EQ, MATCH);
4641         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4642         br(LE, CH1_LOOP);
4643         b(NOMATCH);
4644       }
4645 
4646     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4647       Label CH1_LOOP;
4648 
4649       BIND(DO2);
4650         (this->*load_2chr)(ch1, str1);
4651         if (icnt1 == 2) {
4652           sub(result_tmp, cnt2, 2);
4653         }
4654         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4655         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4656       BIND(CH1_LOOP);
4657         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4658         cmp(ch1, ch2);
4659         br(EQ, MATCH);
4660         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4661         br(LE, CH1_LOOP);
4662         b(NOMATCH);
4663     }
4664 
4665     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4666       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4667 
4668       BIND(DO3);
4669         (this->*load_2chr)(first, str1);
4670         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4671         if (icnt1 == 3) {
4672           sub(result_tmp, cnt2, 3);
4673         }
4674         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4675         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4676       BIND(FIRST_LOOP);
4677         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4678         cmpw(first, ch2);
4679         br(EQ, STR1_LOOP);
4680       BIND(STR2_NEXT);
4681         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4682         br(LE, FIRST_LOOP);
4683         b(NOMATCH);
4684 
4685       BIND(STR1_LOOP);
4686         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4687         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4688         cmp(ch1, ch2);
4689         br(NE, STR2_NEXT);
4690         b(MATCH);
4691     }
4692 
4693     if (icnt1 == -1 || icnt1 == 1) {
4694       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4695 
4696       BIND(DO1);
4697         (this->*str1_load_1chr)(ch1, str1);
4698         cmp(cnt2, 8);
4699         br(LT, DO1_SHORT);
4700 
4701         sub(result_tmp, cnt2, 8/str2_chr_size);
4702         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4703         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4704         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4705 
4706         if (str2_isL) {
4707           orr(ch1, ch1, ch1, LSL, 8);
4708         }
4709         orr(ch1, ch1, ch1, LSL, 16);
4710         orr(ch1, ch1, ch1, LSL, 32);
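        // The loop below uses the classic SWAR zero-detect trick: after
        // the eor, a matching character is an all-zero byte (L) or
        // halfword (U) lane in ch2, and
        //   (ch2 - 0x01..01) & ~(ch2 | 0x7f..7f)
        // (computed here by the sub, orr and bics) sets the top bit of
        // exactly those lanes: the subtract borrows into a lane's top
        // bit only when the lane is zero, and the orr/bics masks out
        // lanes whose top bit was already set.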
4711       BIND(CH1_LOOP);
4712         ldr(ch2, Address(str2, cnt2_neg));
4713         eor(ch2, ch1, ch2);
4714         sub(tmp1, ch2, tmp3);
4715         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4716         bics(tmp1, tmp1, tmp2);
4717         br(NE, HAS_ZERO);
4718         adds(cnt2_neg, cnt2_neg, 8);
4719         br(LT, CH1_LOOP);
4720 
4721         cmp(cnt2_neg, 8);
4722         mov(cnt2_neg, 0);
4723         br(LT, CH1_LOOP);
4724         b(NOMATCH);
4725 
4726       BIND(HAS_ZERO);
4727         rev(tmp1, tmp1);
4728         clz(tmp1, tmp1);
4729         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4730         b(MATCH);
4731 
4732       BIND(DO1_SHORT);
4733         mov(result_tmp, cnt2);
4734         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4735         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4736       BIND(DO1_LOOP);
4737         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4738         cmpw(ch1, ch2);
4739         br(EQ, MATCH);
4740         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4741         br(LT, DO1_LOOP);
4742     }
4743   }
4744   BIND(NOMATCH);
4745     mov(result, -1);
4746     b(DONE);
4747   BIND(MATCH);
4748     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4749   BIND(DONE);
4750 }
4751 
4752 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4753 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4754 
4755 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4756                                          Register ch, Register result,
4757                                          Register tmp1, Register tmp2, Register tmp3)
4758 {
4759   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4760   Register cnt1_neg = cnt1;
4761   Register ch1 = rscratch1;
4762   Register result_tmp = rscratch2;
4763 
4764   cmp(cnt1, 4);
4765   br(LT, DO1_SHORT);
4766 
4767   orr(ch, ch, ch, LSL, 16);
4768   orr(ch, ch, ch, LSL, 32);
4769 
4770   sub(cnt1, cnt1, 4);
4771   mov(result_tmp, cnt1);
4772   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4773   sub(cnt1_neg, zr, cnt1, LSL, 1);
4774 
4775   mov(tmp3, 0x0001000100010001);
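  // Same SWAR zero-detect idea as in string_indexof's DO1 case: after
  // eor with the replicated char, a match is an all-zero halfword,
  // detected by (x - 0x0001..) & ~(x | 0x7fff..).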
4776 
4777   BIND(CH1_LOOP);
4778     ldr(ch1, Address(str1, cnt1_neg));
4779     eor(ch1, ch, ch1);
4780     sub(tmp1, ch1, tmp3);
4781     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4782     bics(tmp1, tmp1, tmp2);
4783     br(NE, HAS_ZERO);
4784     adds(cnt1_neg, cnt1_neg, 8);
4785     br(LT, CH1_LOOP);
4786 
4787     cmp(cnt1_neg, 8);
4788     mov(cnt1_neg, 0);
4789     br(LT, CH1_LOOP);
4790     b(NOMATCH);
4791 
4792   BIND(HAS_ZERO);
4793     rev(tmp1, tmp1);
4794     clz(tmp1, tmp1);
4795     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4796     b(MATCH);
4797 
4798   BIND(DO1_SHORT);
4799     mov(result_tmp, cnt1);
4800     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4801     sub(cnt1_neg, zr, cnt1, LSL, 1);
4802   BIND(DO1_LOOP);
4803     ldrh(ch1, Address(str1, cnt1_neg));
4804     cmpw(ch, ch1);
4805     br(EQ, MATCH);
4806     adds(cnt1_neg, cnt1_neg, 2);
4807     br(LT, DO1_LOOP);
4808   BIND(NOMATCH);
4809     mov(result, -1);
4810     b(DONE);
4811   BIND(MATCH);
4812     add(result, result_tmp, cnt1_neg, ASR, 1);
4813   BIND(DONE);
4814 }
4815 
4816 // Compare strings.
4817 void MacroAssembler::string_compare(Register str1, Register str2,
4818     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4819     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4820   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4821       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4822       SHORT_LOOP_START, TAIL_CHECK;
4823 
4824   const int STUB_THRESHOLD = 64 + 8;
4825   bool isLL = ae == StrIntrinsicNode::LL;
4826   bool isLU = ae == StrIntrinsicNode::LU;
4827   bool isUL = ae == StrIntrinsicNode::UL;
4828 
4829   bool str1_isL = isLL || isLU;
4830   bool str2_isL = isLL || isUL;
4831 
4832   int str1_chr_shift = str1_isL ? 0 : 1;
4833   int str2_chr_shift = str2_isL ? 0 : 1;
4834   int str1_chr_size = str1_isL ? 1 : 2;
4835   int str2_chr_size = str2_isL ? 1 : 2;
4836   int minCharsInWord = isLL ? wordSize : wordSize/2;
4837 
4838   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4839   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4840                                       (chr_insn)&MacroAssembler::ldrh;
4841   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4842                                       (chr_insn)&MacroAssembler::ldrh;
4843   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4844                             (uxt_insn)&MacroAssembler::uxthw;
4845 
4846   BLOCK_COMMENT("string_compare {");
4847 
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
4850   if (!str1_isL) asrw(cnt1, cnt1, 1);
4851   if (!str2_isL) asrw(cnt2, cnt2, 1);
4852 
4853   // Compute the minimum of the string lengths and save the difference.
4854   subsw(result, cnt1, cnt2);
4855   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4856 
4857   // A very short string
4858   cmpw(cnt2, minCharsInWord);
4859   br(Assembler::LT, SHORT_STRING);
4860 
4861   // Compare longwords
4862   // load first parts of strings and finish initialization while loading
4863   {
4864     if (str1_isL == str2_isL) { // LL or UU
4865       ldr(tmp1, Address(str1));
4866       cmp(str1, str2);
4867       br(Assembler::EQ, DONE);
4868       ldr(tmp2, Address(str2));
4869       cmp(cnt2, STUB_THRESHOLD);
4870       br(GE, STUB);
4871       subsw(cnt2, cnt2, minCharsInWord);
4872       br(EQ, TAIL_CHECK);
4873       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4874       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4875       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4876     } else if (isLU) {
4877       ldrs(vtmp, Address(str1));
4878       cmp(str1, str2);
4879       br(Assembler::EQ, DONE);
4880       ldr(tmp2, Address(str2));
4881       cmp(cnt2, STUB_THRESHOLD);
4882       br(GE, STUB);
4883       subsw(cnt2, cnt2, 4);
4884       br(EQ, TAIL_CHECK);
4885       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4886       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4887       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4888       zip1(vtmp, T8B, vtmp, vtmpZ);
4889       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4890       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4891       add(cnt1, cnt1, 4);
4892       fmovd(tmp1, vtmp);
4893     } else { // UL case
4894       ldr(tmp1, Address(str1));
4895       cmp(str1, str2);
4896       br(Assembler::EQ, DONE);
4897       ldrs(vtmp, Address(str2));
4898       cmp(cnt2, STUB_THRESHOLD);
4899       br(GE, STUB);
4900       subsw(cnt2, cnt2, 4);
4901       br(EQ, TAIL_CHECK);
4902       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4903       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4904       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4905       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4906       zip1(vtmp, T8B, vtmp, vtmpZ);
4907       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4908       add(cnt1, cnt1, 8);
4909       fmovd(tmp2, vtmp);
4910     }
4911     adds(cnt2, cnt2, isUL ? 4 : 8);
4912     br(GE, TAIL);
4913     eor(rscratch2, tmp1, tmp2);
4914     cbnz(rscratch2, DIFFERENCE);
4915     // main loop
4916     bind(NEXT_WORD);
4917     if (str1_isL == str2_isL) {
4918       ldr(tmp1, Address(str1, cnt2));
4919       ldr(tmp2, Address(str2, cnt2));
4920       adds(cnt2, cnt2, 8);
4921     } else if (isLU) {
4922       ldrs(vtmp, Address(str1, cnt1));
4923       ldr(tmp2, Address(str2, cnt2));
4924       add(cnt1, cnt1, 4);
4925       zip1(vtmp, T8B, vtmp, vtmpZ);
4926       fmovd(tmp1, vtmp);
4927       adds(cnt2, cnt2, 8);
4928     } else { // UL
4929       ldrs(vtmp, Address(str2, cnt2));
4930       ldr(tmp1, Address(str1, cnt1));
4931       zip1(vtmp, T8B, vtmp, vtmpZ);
4932       add(cnt1, cnt1, 8);
4933       fmovd(tmp2, vtmp);
4934       adds(cnt2, cnt2, 4);
4935     }
4936     br(GE, TAIL);
4937 
4938     eor(rscratch2, tmp1, tmp2);
4939     cbz(rscratch2, NEXT_WORD);
4940     b(DIFFERENCE);
4941     bind(TAIL);
4942     eor(rscratch2, tmp1, tmp2);
4943     cbnz(rscratch2, DIFFERENCE);
4944     // Last longword.  In the case where length == 4 we compare the
4945     // same longword twice, but that's still faster than another
4946     // conditional branch.
4947     if (str1_isL == str2_isL) {
4948       ldr(tmp1, Address(str1));
4949       ldr(tmp2, Address(str2));
4950     } else if (isLU) {
4951       ldrs(vtmp, Address(str1));
4952       ldr(tmp2, Address(str2));
4953       zip1(vtmp, T8B, vtmp, vtmpZ);
4954       fmovd(tmp1, vtmp);
4955     } else { // UL
4956       ldrs(vtmp, Address(str2));
4957       ldr(tmp1, Address(str1));
4958       zip1(vtmp, T8B, vtmp, vtmpZ);
4959       fmovd(tmp2, vtmp);
4960     }
4961     bind(TAIL_CHECK);
4962     eor(rscratch2, tmp1, tmp2);
4963     cbz(rscratch2, DONE);
4964 
4965     // Find the first different characters in the longwords and
4966     // compute their difference.
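    // A sketch of the trick below: rscratch2 == tmp1 ^ tmp2 is nonzero.
    // rev byte-reverses it so that clz counts the bits below the first
    // (lowest-addressed, little-endian) differing byte; masking with -8
    // (LL) or -16 (otherwise) rounds down to a character boundary, and
    // the lsrv's shift the first differing character of each word into
    // the low bits for the subtraction.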
4967     bind(DIFFERENCE);
4968     rev(rscratch2, rscratch2);
4969     clz(rscratch2, rscratch2);
4970     andr(rscratch2, rscratch2, isLL ? -8 : -16);
4971     lsrv(tmp1, tmp1, rscratch2);
4972     (this->*ext_chr)(tmp1, tmp1);
4973     lsrv(tmp2, tmp2, rscratch2);
4974     (this->*ext_chr)(tmp2, tmp2);
4975     subw(result, tmp1, tmp2);
4976     b(DONE);
4977   }
4978 
4979   bind(STUB);
4980     RuntimeAddress stub = NULL;
4981     switch(ae) {
4982       case StrIntrinsicNode::LL:
4983         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
4984         break;
4985       case StrIntrinsicNode::UU:
4986         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
4987         break;
4988       case StrIntrinsicNode::LU:
4989         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
4990         break;
4991       case StrIntrinsicNode::UL:
4992         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
4993         break;
4994       default:
4995         ShouldNotReachHere();
    }
4997     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
4998     trampoline_call(stub);
4999     b(DONE);
5000 
5001   bind(SHORT_STRING);
5002   // Is the minimum length zero?
5003   cbz(cnt2, DONE);
  // Arrange the code to do most branches while characters are loading,
  // and to load the next characters while the previous ones compare.
5006   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5007   subs(cnt2, cnt2, 1);
5008   br(EQ, SHORT_LAST_INIT);
5009   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5010   b(SHORT_LOOP_START);
5011   bind(SHORT_LOOP);
5012   subs(cnt2, cnt2, 1);
5013   br(EQ, SHORT_LAST);
5014   bind(SHORT_LOOP_START);
5015   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
5016   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
5017   cmp(tmp1, cnt1);
5018   br(NE, SHORT_LOOP_TAIL);
5019   subs(cnt2, cnt2, 1);
5020   br(EQ, SHORT_LAST2);
5021   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5022   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5023   cmp(tmp2, rscratch1);
5024   br(EQ, SHORT_LOOP);
5025   sub(result, tmp2, rscratch1);
5026   b(DONE);
5027   bind(SHORT_LOOP_TAIL);
5028   sub(result, tmp1, cnt1);
5029   b(DONE);
5030   bind(SHORT_LAST2);
5031   cmp(tmp2, rscratch1);
5032   br(EQ, DONE);
5033   sub(result, tmp2, rscratch1);
5034 
5035   b(DONE);
5036   bind(SHORT_LAST_INIT);
5037   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5038   bind(SHORT_LAST);
5039   cmp(tmp1, cnt1);
5040   br(EQ, DONE);
5041   sub(result, tmp1, cnt1);
5042 
5043   bind(DONE);
5044 
5045   BLOCK_COMMENT("} string_compare");
5046 }
5047 
// This method checks whether the provided byte array contains a byte with the highest bit set.
5049 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
    // The simple and most common case, a small aligned array that is not
    // at the end of a memory page, is handled here. All other cases are
    // handled in the stub.
5052     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
5053     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5054     assert_different_registers(ary1, len, result);
5055 
5056     cmpw(len, 0);
5057     br(LE, SET_RESULT);
5058     cmpw(len, 4 * wordSize);
5059     br(GE, STUB_LONG); // size > 32 then go to stub
5060 
5061     int shift = 64 - exact_log2(os::vm_page_size());
5062     lsl(rscratch1, ary1, shift);
5063     mov(rscratch2, (size_t)(4 * wordSize) << shift);
5064     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
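    // How the test above works: the lsl moved ary1's in-page offset into
    // the top bits of rscratch1, so the adds carries out (CS) exactly
    // when (ary1 % page_size) + 4 * wordSize >= page_size, i.e. when the
    // reads below might run onto the next, possibly unmapped, page.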
5065     br(CS, STUB); // at the end of page then go to stub
5066     subs(len, len, wordSize);
5067     br(LT, END);
5068 
5069   BIND(LOOP);
5070     ldr(rscratch1, Address(post(ary1, wordSize)));
5071     tst(rscratch1, UPPER_BIT_MASK);
5072     br(NE, SET_RESULT);
5073     subs(len, len, wordSize);
5074     br(GE, LOOP);
5075     cmpw(len, -wordSize);
5076     br(EQ, SET_RESULT);
5077 
5078   BIND(END);
5079     ldr(result, Address(ary1));
5080     sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5081     lslv(result, result, len);
5082     tst(result, UPPER_BIT_MASK);
5083     b(SET_RESULT);
5084 
5085   BIND(STUB);
5086     RuntimeAddress has_neg =  RuntimeAddress(StubRoutines::aarch64::has_negatives());
5087     assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5088     trampoline_call(has_neg);
5089     b(DONE);
5090 
5091   BIND(STUB_LONG);
5092     RuntimeAddress has_neg_long =  RuntimeAddress(
5093             StubRoutines::aarch64::has_negatives_long());
5094     assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
5095     trampoline_call(has_neg_long);
5096     b(DONE);
5097 
5098   BIND(SET_RESULT);
5099     cset(result, NE); // set true or false
5100 
5101   BIND(DONE);
5102 }
5103 
5104 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5105                                    Register tmp4, Register tmp5, Register result,
5106                                    Register cnt1, int elem_size) {
5107   Label DONE, SAME;
5108   Register tmp1 = rscratch1;
5109   Register tmp2 = rscratch2;
5110   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5111   int elem_per_word = wordSize/elem_size;
5112   int log_elem_size = exact_log2(elem_size);
5113   int length_offset = arrayOopDesc::length_offset_in_bytes();
5114   int base_offset
5115     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5116   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5117 
5118   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5119   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5120 
5121 #ifndef PRODUCT
5122   {
5123     const char kind = (elem_size == 2) ? 'U' : 'L';
5124     char comment[64];
5125     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5126     BLOCK_COMMENT(comment);
5127   }
5128 #endif
5129 
5130   // if (a1 == a2)
5131   //     return true;
5132   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5133   br(EQ, SAME);
5134 
5135   if (UseSimpleArrayEquals) {
5136     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5137     // if (a1 == null || a2 == null)
5138     //     return false;
    // (a1 & a2) == 0 means that at least one pointer is null, or (very
    // rarely, and probably never for real oops) that two non-null
    // pointers happen to share no set bits, so we can save one branch
    // in most cases.
5142     tst(a1, a2);
5143     mov(result, false);
5144     br(EQ, A_MIGHT_BE_NULL);
5145     // if (a1.length != a2.length)
5146     //      return false;
5147     bind(A_IS_NOT_NULL);
5148     ldrw(cnt1, Address(a1, length_offset));
5149     ldrw(cnt2, Address(a2, length_offset));
5150     eorw(tmp5, cnt1, cnt2);
5151     cbnzw(tmp5, DONE);
5152     lea(a1, Address(a1, base_offset));
5153     lea(a2, Address(a2, base_offset));
5154     // Check for short strings, i.e. smaller than wordSize.
5155     subs(cnt1, cnt1, elem_per_word);
5156     br(Assembler::LT, SHORT);
5157     // Main 8 byte comparison loop.
5158     bind(NEXT_WORD); {
5159       ldr(tmp1, Address(post(a1, wordSize)));
5160       ldr(tmp2, Address(post(a2, wordSize)));
5161       subs(cnt1, cnt1, elem_per_word);
5162       eor(tmp5, tmp1, tmp2);
5163       cbnz(tmp5, DONE);
5164     } br(GT, NEXT_WORD);
5165     // Last longword.  In the case where length == 4 we compare the
5166     // same longword twice, but that's still faster than another
5167     // conditional branch.
5168     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5169     // length == 4.
5170     if (log_elem_size > 0)
5171       lsl(cnt1, cnt1, log_elem_size);
5172     ldr(tmp3, Address(a1, cnt1));
5173     ldr(tmp4, Address(a2, cnt1));
5174     eor(tmp5, tmp3, tmp4);
5175     cbnz(tmp5, DONE);
5176     b(SAME);
5177     bind(A_MIGHT_BE_NULL);
5178     // in case both a1 and a2 are not-null, proceed with loads
5179     cbz(a1, DONE);
5180     cbz(a2, DONE);
5181     b(A_IS_NOT_NULL);
5182     bind(SHORT);
5183 
5184     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5185     {
5186       ldrw(tmp1, Address(post(a1, 4)));
5187       ldrw(tmp2, Address(post(a2, 4)));
5188       eorw(tmp5, tmp1, tmp2);
5189       cbnzw(tmp5, DONE);
5190     }
5191     bind(TAIL03);
5192     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5193     {
5194       ldrh(tmp3, Address(post(a1, 2)));
5195       ldrh(tmp4, Address(post(a2, 2)));
5196       eorw(tmp5, tmp3, tmp4);
5197       cbnzw(tmp5, DONE);
5198     }
5199     bind(TAIL01);
5200     if (elem_size == 1) { // Only needed when comparing byte arrays.
5201       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5202       {
5203         ldrb(tmp1, a1);
5204         ldrb(tmp2, a2);
5205         eorw(tmp5, tmp1, tmp2);
5206         cbnzw(tmp5, DONE);
5207       }
5208     }
5209   } else {
5210     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
5211         CSET_EQ, LAST_CHECK;
5212     mov(result, false);
5213     cbz(a1, DONE);
5214     ldrw(cnt1, Address(a1, length_offset));
5215     cbz(a2, DONE);
5216     ldrw(cnt2, Address(a2, length_offset));
    // on most CPUs a2 is, surprisingly, still tied up by the ldrw above,
    // so it's faster to perform another branch before comparing a1 and a2
5219     cmp(cnt1, elem_per_word);
5220     br(LE, SHORT); // short or same
5221     ldr(tmp3, Address(pre(a1, base_offset)));
5222     cmp(cnt1, stubBytesThreshold);
5223     br(GE, STUB);
5224     ldr(tmp4, Address(pre(a2, base_offset)));
5225     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5226     cmp(cnt2, cnt1);
5227     br(NE, DONE);
5228 
5229     // Main 16 byte comparison loop with 2 exits
5230     bind(NEXT_DWORD); {
5231       ldr(tmp1, Address(pre(a1, wordSize)));
5232       ldr(tmp2, Address(pre(a2, wordSize)));
5233       subs(cnt1, cnt1, 2 * elem_per_word);
5234       br(LE, TAIL);
5235       eor(tmp4, tmp3, tmp4);
5236       cbnz(tmp4, DONE);
5237       ldr(tmp3, Address(pre(a1, wordSize)));
5238       ldr(tmp4, Address(pre(a2, wordSize)));
5239       cmp(cnt1, elem_per_word);
5240       br(LE, TAIL2);
5241       cmp(tmp1, tmp2);
5242     } br(EQ, NEXT_DWORD);
5243     b(DONE);
5244 
5245     bind(TAIL);
5246     eor(tmp4, tmp3, tmp4);
5247     eor(tmp2, tmp1, tmp2);
5248     lslv(tmp2, tmp2, tmp5);
5249     orr(tmp5, tmp4, tmp2);
5250     cmp(tmp5, zr);
5251     b(CSET_EQ);
5252 
5253     bind(TAIL2);
5254     eor(tmp2, tmp1, tmp2);
5255     cbnz(tmp2, DONE);
5256     b(LAST_CHECK);
5257 
5258     bind(STUB);
5259     ldr(tmp4, Address(pre(a2, base_offset)));
5260     cmp(cnt2, cnt1);
5261     br(NE, DONE);
5262     if (elem_size == 2) { // convert to byte counter
5263       lsl(cnt1, cnt1, 1);
5264     }
5265     eor(tmp5, tmp3, tmp4);
5266     cbnz(tmp5, DONE);
5267     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5268     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5269     trampoline_call(stub);
5270     b(DONE);
5271 
5272     bind(EARLY_OUT);
    // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2),
    // so if a2 == null we return false (0); otherwise we return true. Either
    // way we can simply return a2.
5275     mov(result, a2);
5276     b(DONE);
5277     bind(SHORT);
5278     cmp(cnt2, cnt1);
5279     br(NE, DONE);
5280     cbz(cnt1, SAME);
5281     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5282     ldr(tmp3, Address(a1, base_offset));
5283     ldr(tmp4, Address(a2, base_offset));
5284     bind(LAST_CHECK);
5285     eor(tmp4, tmp3, tmp4);
5286     lslv(tmp5, tmp4, tmp5);
5287     cmp(tmp5, zr);
5288     bind(CSET_EQ);
5289     cset(result, EQ);
5290     b(DONE);
5291   }
5292 
5293   bind(SAME);
5294   mov(result, true);
5295   // That's it.
5296   bind(DONE);
5297 
5298   BLOCK_COMMENT("} array_equals");
5299 }
5300 
5301 // Compare Strings
5302 
5303 // For Strings we're passed the address of the first characters in a1
5304 // and a2 and the length in cnt1.
5305 // elem_size is the element size in bytes: either 1 or 2.
5306 // There are two implementations.  For arrays >= 8 bytes, all
5307 // comparisons (including the final one, which may overlap) are
// performed 8 bytes at a time.  For strings < 8 bytes, we compare a
// word, then a halfword, and then a byte.
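// For example (sizes illustrative): with 12 Latin-1 bytes, the main loop
// compares bytes 0..7 and the final load at offset cnt1 == -4 compares
// bytes 4..11, re-checking bytes 4..7; the overlap is harmless when
// testing equality.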
5310 
5311 void MacroAssembler::string_equals(Register a1, Register a2,
5312                                    Register result, Register cnt1, int elem_size)
5313 {
5314   Label SAME, DONE, SHORT, NEXT_WORD;
5315   Register tmp1 = rscratch1;
5316   Register tmp2 = rscratch2;
5317   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5318 
5319   assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
5320   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5321 
5322 #ifndef PRODUCT
5323   {
5324     const char kind = (elem_size == 2) ? 'U' : 'L';
5325     char comment[64];
5326     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5327     BLOCK_COMMENT(comment);
5328   }
5329 #endif
5330 
5331   mov(result, false);
5332 
5333   // Check for short strings, i.e. smaller than wordSize.
5334   subs(cnt1, cnt1, wordSize);
5335   br(Assembler::LT, SHORT);
5336   // Main 8 byte comparison loop.
5337   bind(NEXT_WORD); {
5338     ldr(tmp1, Address(post(a1, wordSize)));
5339     ldr(tmp2, Address(post(a2, wordSize)));
5340     subs(cnt1, cnt1, wordSize);
5341     eor(tmp1, tmp1, tmp2);
5342     cbnz(tmp1, DONE);
5343   } br(GT, NEXT_WORD);
5344   // Last longword.  In the case where length == 4 we compare the
5345   // same longword twice, but that's still faster than another
5346   // conditional branch.
5347   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5348   // length == 4.
5349   ldr(tmp1, Address(a1, cnt1));
5350   ldr(tmp2, Address(a2, cnt1));
5351   eor(tmp2, tmp1, tmp2);
5352   cbnz(tmp2, DONE);
5353   b(SAME);
5354 
5355   bind(SHORT);
5356   Label TAIL03, TAIL01;
5357 
5358   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5359   {
5360     ldrw(tmp1, Address(post(a1, 4)));
5361     ldrw(tmp2, Address(post(a2, 4)));
5362     eorw(tmp1, tmp1, tmp2);
5363     cbnzw(tmp1, DONE);
5364   }
5365   bind(TAIL03);
5366   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5367   {
5368     ldrh(tmp1, Address(post(a1, 2)));
5369     ldrh(tmp2, Address(post(a2, 2)));
5370     eorw(tmp1, tmp1, tmp2);
5371     cbnzw(tmp1, DONE);
5372   }
5373   bind(TAIL01);
5374   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5375     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5376     {
5377       ldrb(tmp1, a1);
5378       ldrb(tmp2, a2);
5379       eorw(tmp1, tmp1, tmp2);
5380       cbnzw(tmp1, DONE);
5381     }
5382   }
5383   // Arrays are equal.
5384   bind(SAME);
5385   mov(result, true);
5386 
5387   // That's it.
5388   bind(DONE);
5389   BLOCK_COMMENT("} string_equals");
5390 }
5391 
5392 
5393 // The size of the blocks erased by the zero_blocks stub.  We must
5394 // handle anything smaller than this ourselves in zero_words().
5395 const int MacroAssembler::zero_words_block_size = 8;
5396 
5397 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5398 // possible, handling small word counts locally and delegating
5399 // anything larger to the zero_blocks stub.  It is expanded many times
5400 // in compiled code, so it is important to keep it short.
5401 
5402 // ptr:   Address of a buffer to be zeroed.
5403 // cnt:   Count in HeapWords.
5404 //
5405 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5406 void MacroAssembler::zero_words(Register ptr, Register cnt)
5407 {
5408   assert(is_power_of_2(zero_words_block_size), "adjust this");
5409   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5410 
5411   BLOCK_COMMENT("zero_words {");
5412   cmp(cnt, zero_words_block_size);
5413   Label around, done, done16;
5414   br(LO, around);
5415   {
5416     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5417     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5418     if (StubRoutines::aarch64::complete()) {
5419       trampoline_call(zero_blocks);
5420     } else {
5421       bl(zero_blocks);
5422     }
5423   }
5424   bind(around);
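  // The tail below peels off the remaining 0..7 words using the binary
  // digits of cnt, roughly:
  //   if (cnt & 4) { stp; stp; }   // 4 words
  //   if (cnt & 2) { stp; }        // 2 words
  //   if (cnt & 1) { str; }        // 1 word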
5425   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5426     Label l;
5427     tbz(cnt, exact_log2(i), l);
5428     for (int j = 0; j < i; j += 2) {
5429       stp(zr, zr, post(ptr, 16));
5430     }
5431     bind(l);
5432   }
5433   {
5434     Label l;
5435     tbz(cnt, 0, l);
5436     str(zr, Address(ptr));
5437     bind(l);
5438   }
5439   BLOCK_COMMENT("} zero_words");
5440 }
5441 
5442 // base:         Address of a buffer to be zeroed, 8 bytes aligned.
5443 // cnt:          Immediate count in HeapWords.
5444 #define SmallArraySize (18 * BytesPerLong)
5445 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
5446 {
5447   BLOCK_COMMENT("zero_words {");
5448   int i = cnt & 1;  // store any odd word to start
5449   if (i) str(zr, Address(base));
5450 
5451   if (cnt <= SmallArraySize / BytesPerLong) {
5452     for (; i < (int)cnt; i += 2)
5453       stp(zr, zr, Address(base, i * wordSize));
5454   } else {
5455     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
5456     int remainder = cnt % (2 * unroll);
5457     for (; i < remainder; i += 2)
5458       stp(zr, zr, Address(base, i * wordSize));
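    // Worked example (illustrative): cnt == 23 stores the odd word 0
    // above, remainder == 23 % 8 == 7 clears words 1..6 here, and the
    // loop below clears the remaining 16 words in two passes of
    // 2 * unroll == 8 words each.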
5459 
5460     Label loop;
5461     Register cnt_reg = rscratch1;
5462     Register loop_base = rscratch2;
5463     cnt = cnt - remainder;
5464     mov(cnt_reg, cnt);
5465     // adjust base and prebias by -2 * wordSize so we can pre-increment
5466     add(loop_base, base, (remainder - 2) * wordSize);
5467     bind(loop);
5468     sub(cnt_reg, cnt_reg, 2 * unroll);
5469     for (i = 1; i < unroll; i++)
5470       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
5471     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
5472     cbnz(cnt_reg, loop);
5473   }
5474   BLOCK_COMMENT("} zero_words");
5475 }
5476 
5477 // Zero blocks of memory by using DC ZVA.
5478 //
// Aligns the base address first sufficiently for DC ZVA, then uses
5480 // DC ZVA repeatedly for every full block.  cnt is the size to be
5481 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5482 // in cnt.
5483 //
5484 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5485 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
5486 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5487   Register tmp = rscratch1;
5488   Register tmp2 = rscratch2;
5489   int zva_length = VM_Version::zva_length();
5490   Label initial_table_end, loop_zva;
5491   Label fini;
5492 
  // Base must be 16-byte aligned. If not, just return and let the caller handle it
5494   tst(base, 0x0f);
5495   br(Assembler::NE, fini);
5496   // Align base with ZVA length.
5497   neg(tmp, base);
5498   andr(tmp, tmp, zva_length - 1);
5499 
5500   // tmp: the number of bytes to be filled to align the base with ZVA length.
5501   add(base, base, tmp);
5502   sub(cnt, cnt, tmp, Assembler::ASR, 3);
5503   adr(tmp2, initial_table_end);
5504   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5505   br(tmp2);
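  // Each stp below is a 4-byte instruction that zeroes 16 bytes, so
  // branching to initial_table_end - tmp/4 executes exactly tmp/16 of
  // them: a computed jump into an unrolled store table, in the style of
  // Duff's device.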
5506 
5507   for (int i = -zva_length + 16; i < 0; i += 16)
5508     stp(zr, zr, Address(base, i));
5509   bind(initial_table_end);
5510 
5511   sub(cnt, cnt, zva_length >> 3);
5512   bind(loop_zva);
5513   dc(Assembler::ZVA, base);
5514   subs(cnt, cnt, zva_length >> 3);
5515   add(base, base, zva_length);
5516   br(Assembler::GE, loop_zva);
5517   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5518   bind(fini);
5519 }
5520 
5521 // base:   Address of a buffer to be filled, 8 bytes aligned.
5522 // cnt:    Count in 8-byte unit.
5523 // value:  Value to be filled with.
5524 // base will point to the end of the buffer after filling.
5525 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5526 {
5527 //  Algorithm:
5528 //
5529 //    scratch1 = cnt & 7;
5530 //    cnt -= scratch1;
5531 //    p += scratch1;
5532 //    switch (scratch1) {
5533 //      do {
5534 //        cnt -= 8;
5535 //          p[-8] = v;
5536 //        case 7:
5537 //          p[-7] = v;
5538 //        case 6:
5539 //          p[-6] = v;
5540 //          // ...
5541 //        case 1:
5542 //          p[-1] = v;
5543 //        case 0:
5544 //          p += 8;
5545 //      } while (cnt);
5546 //    }
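//  A worked example against the actual code (values illustrative):
//  cnt == 11 gives rscratch1 == (11 & 14) == 10, so the computed branch
//  enters the unrolled block five stp's (10 words) before 'entry'; the
//  remaining cnt == 1 fails the loop test (1 - 16 < 0) and the final
//  odd word is stored by the tbz tail.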
5547 
5548   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5549 
5550   Label fini, skip, entry, loop;
5551   const int unroll = 8; // Number of stp instructions we'll unroll
5552 
5553   cbz(cnt, fini);
5554   tbz(base, 3, skip);
5555   str(value, Address(post(base, 8)));
5556   sub(cnt, cnt, 1);
5557   bind(skip);
5558 
5559   andr(rscratch1, cnt, (unroll-1) * 2);
5560   sub(cnt, cnt, rscratch1);
5561   add(base, base, rscratch1, Assembler::LSL, 3);
5562   adr(rscratch2, entry);
5563   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5564   br(rscratch2);
5565 
5566   bind(loop);
5567   add(base, base, unroll * 16);
5568   for (int i = -unroll; i < 0; i++)
5569     stp(value, value, Address(base, i * 16));
5570   bind(entry);
5571   subs(cnt, cnt, unroll * 2);
5572   br(Assembler::GE, loop);
5573 
5574   tbz(cnt, 0, fini);
5575   str(value, Address(post(base, 8)));
5576   bind(fini);
5577 }
5578 
5579 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5580 // java/lang/StringUTF16.compress.
5581 void MacroAssembler::encode_iso_array(Register src, Register dst,
5582                       Register len, Register result,
5583                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5584                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5585 {
5586     Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
5587         NEXT_32_START, NEXT_32_PRFM_START;
5588     Register tmp1 = rscratch1, tmp2 = rscratch2;
5589 
5590       mov(result, len); // Save initial len
5591 
5592 #ifndef BUILTIN_SIM
5593       cmp(len, 8); // handle shortest strings first
5594       br(LT, LOOP_1);
5595       cmp(len, 32);
5596       br(LT, NEXT_8);
5597       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
5598       // to convert chars to bytes
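      // uzp1 concatenates the even-indexed bytes of its two sources,
      // i.e. the low (Latin-1) byte of each little-endian char, while
      // uzp2 gathers the odd-indexed (high) bytes, which must all be
      // zero for the encoding to succeed.  For example, the chars
      // {0x0041, 0x00e9} are the bytes 41 00 e9 00 in memory:
      // uzp1 -> 41 e9 (encoded output), uzp2 -> 00 00 (check lanes).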
5599       if (SoftwarePrefetchHintDistance >= 0) {
5600         ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5601         cmp(len, SoftwarePrefetchHintDistance/2 + 16);
5602         br(LE, NEXT_32_START);
5603         b(NEXT_32_PRFM_START);
5604         BIND(NEXT_32_PRFM);
5605           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5606         BIND(NEXT_32_PRFM_START);
5607           prfm(Address(src, SoftwarePrefetchHintDistance));
5608           orr(v4, T16B, Vtmp1, Vtmp2);
5609           orr(v5, T16B, Vtmp3, Vtmp4);
5610           uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
5611           uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
5612           stpq(Vtmp1, Vtmp3, dst);
5613           uzp2(v5, T16B, v4, v5); // high bytes
5614           umov(tmp2, v5, D, 1);
5615           fmovd(tmp1, v5);
5616           orr(tmp1, tmp1, tmp2);
5617           cbnz(tmp1, LOOP_8);
5618           sub(len, len, 32);
5619           add(dst, dst, 32);
5620           add(src, src, 64);
5621           cmp(len, SoftwarePrefetchHintDistance/2 + 16);
5622           br(GE, NEXT_32_PRFM);
5623           cmp(len, 32);
5624           br(LT, LOOP_8);
5625         BIND(NEXT_32);
5626           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5627         BIND(NEXT_32_START);
5628       } else {
5629         BIND(NEXT_32);
5630           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5631       }
5632       prfm(Address(src, SoftwarePrefetchHintDistance));
5633       uzp1(v4, T16B, Vtmp1, Vtmp2);
5634       uzp1(v5, T16B, Vtmp3, Vtmp4);
5635       stpq(v4, v5, dst);
5636       orr(Vtmp1, T16B, Vtmp1, Vtmp2);
5637       orr(Vtmp3, T16B, Vtmp3, Vtmp4);
5638       uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
5639       umov(tmp2, Vtmp1, D, 1);
5640       fmovd(tmp1, Vtmp1);
5641       orr(tmp1, tmp1, tmp2);
5642       cbnz(tmp1, LOOP_8);
5643       sub(len, len, 32);
5644       add(dst, dst, 32);
5645       add(src, src, 64);
5646       cmp(len, 32);
5647       br(GE, NEXT_32);
5648       cbz(len, DONE);
5649 
5650     BIND(LOOP_8);
5651       cmp(len, 8);
5652       br(LT, LOOP_1);
5653     BIND(NEXT_8);
5654       ld1(Vtmp1, T8H, src);
5655       uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
5656       uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
5657       strd(Vtmp2, dst);
5658       fmovd(tmp1, Vtmp3);
5659       cbnz(tmp1, NEXT_1);
5660 
5661       sub(len, len, 8);
5662       add(dst, dst, 8);
5663       add(src, src, 16);
5664       cmp(len, 8);
5665       br(GE, NEXT_8);
5666 
5667     BIND(LOOP_1);
5668 #endif
5669     cbz(len, DONE);
5670     BIND(NEXT_1);
5671       ldrh(tmp1, Address(post(src, 2)));
5672       strb(tmp1, Address(post(dst, 1)));
5673       tst(tmp1, 0xff00);
5674       br(NE, SET_RESULT);
5675       subs(len, len, 1);
5676       br(GT, NEXT_1);
5677 
5678     BIND(SET_RESULT);
5679       sub(result, result, len); // Return index where we stopped
5680                                 // Return len == 0 if we processed all
5681                                 // characters
5682     BIND(DONE);
5683 }
5684 
5685 
5686 // Inflate byte[] array to char[].
5687 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5688                                         FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
5689                                         Register tmp4) {
5690   Label big, done, after_init, to_stub;
5691 
5692   assert_different_registers(src, dst, len, tmp4, rscratch1);
5693 
5694   fmovd(vtmp1, zr);
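  // The inflation itself is done with zip1 against the zero vector in
  // vtmp1: interleaving bytes b0 b1 ... with 00 yields b0 00 b1 00 ...,
  // which is exactly the little-endian UTF-16 encoding of the Latin-1
  // chars.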
5695   lsrw(tmp4, len, 3);
5696   bind(after_init);
5697   cbnzw(tmp4, big);
5698   // Short string: less than 8 bytes.
5699   {
5700     Label loop, tiny;
5701 
5702     cmpw(len, 4);
5703     br(LT, tiny);
5704     // Use SIMD to do 4 bytes.
5705     ldrs(vtmp2, post(src, 4));
5706     zip1(vtmp3, T8B, vtmp2, vtmp1);
5707     subw(len, len, 4);
5708     strd(vtmp3, post(dst, 8));
5709 
5710     cbzw(len, done);
5711 
5712     // Do the remaining bytes by steam.
5713     bind(loop);
5714     ldrb(tmp4, post(src, 1));
5715     strh(tmp4, post(dst, 2));
5716     subw(len, len, 1);
5717 
5718     bind(tiny);
5719     cbnz(len, loop);
5720 
5721     b(done);
5722   }
5723 
5724   if (SoftwarePrefetchHintDistance >= 0) {
5725     bind(to_stub);
5726       RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5727       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5728       trampoline_call(stub);
5729       b(after_init);
5730   }
5731 
5732   // Unpack the bytes 8 at a time.
5733   bind(big);
5734   {
5735     Label loop, around, loop_last, loop_start;
5736 
5737     if (SoftwarePrefetchHintDistance >= 0) {
5738       const int large_loop_threshold = (64 + 16)/8;
5739       ldrd(vtmp2, post(src, 8));
5740       andw(len, len, 7);
5741       cmp(tmp4, large_loop_threshold);
5742       br(GE, to_stub);
5743       b(loop_start);
5744 
5745       bind(loop);
5746       ldrd(vtmp2, post(src, 8));
5747       bind(loop_start);
5748       subs(tmp4, tmp4, 1);
5749       br(EQ, loop_last);
5750       zip1(vtmp2, T16B, vtmp2, vtmp1);
5751       ldrd(vtmp3, post(src, 8));
5752       st1(vtmp2, T8H, post(dst, 16));
5753       subs(tmp4, tmp4, 1);
5754       zip1(vtmp3, T16B, vtmp3, vtmp1);
5755       st1(vtmp3, T8H, post(dst, 16));
5756       br(NE, loop);
5757       b(around);
5758       bind(loop_last);
5759       zip1(vtmp2, T16B, vtmp2, vtmp1);
5760       st1(vtmp2, T8H, post(dst, 16));
5761       bind(around);
5762       cbz(len, done);
5763     } else {
5764       andw(len, len, 7);
5765       bind(loop);
5766       ldrd(vtmp2, post(src, 8));
5767       sub(tmp4, tmp4, 1);
5768       zip1(vtmp3, T16B, vtmp2, vtmp1);
5769       st1(vtmp3, T8H, post(dst, 16));
5770       cbnz(tmp4, loop);
5771     }
5772   }
5773 
5774   // Do the tail of up to 8 bytes.
5775   add(src, src, len);
5776   ldrd(vtmp3, Address(src, -8));
5777   add(dst, dst, len, ext::uxtw, 1);
5778   zip1(vtmp3, T16B, vtmp3, vtmp1);
5779   strq(vtmp3, Address(dst, -16));
5780 
5781   bind(done);
5782 }
5783 
5784 // Compress char[] array to byte[].
5785 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5786                                          FloatRegister tmp1Reg, FloatRegister tmp2Reg,
5787                                          FloatRegister tmp3Reg, FloatRegister tmp4Reg,
5788                                          Register result) {
5789   encode_iso_array(src, dst, len, result,
5790                    tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
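  // encode_iso_array leaves len == 0 only if every char was Latin-1;
  // the compress contract requires returning 0 (failure) rather than a
  // partial count when some char didn't fit, hence the csel below.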
5791   cmp(len, zr);
5792   csel(result, result, zr, EQ);
5793 }
5794 
5795 // get_thread() can be called anywhere inside generated code so we
5796 // need to save whatever non-callee save context might get clobbered
5797 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5798 // the call setup code.
5799 //
5800 // aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5801 //
5802 void MacroAssembler::get_thread(Register dst) {
5803   RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
5804   push(saved_regs, sp);
5805 
5806   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5807   blrt(lr, 1, 0, 1);
5808   if (dst != c_rarg0) {
5809     mov(dst, c_rarg0);
5810   }
5811 
5812   pop(saved_regs, sp);
5813 }