1 /*
   2  * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include <sys/types.h>
  27 
  28 #include "precompiled.hpp"
  29 #include "jvm.h"
  30 #include "asm/assembler.hpp"
  31 #include "asm/assembler.inline.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/cardTable.hpp"
  34 #include "gc/shared/barrierSetAssembler.hpp"
  35 #include "gc/shared/cardTableBarrierSet.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "compiler/disassembler.hpp"
  38 #include "memory/resourceArea.hpp"
  39 #include "nativeInst_aarch64.hpp"
  40 #include "oops/accessDecorators.hpp"
  41 #include "oops/compressedOops.inline.hpp"
  42 #include "oops/klass.inline.hpp"
  43 #include "oops/oop.hpp"
  44 #include "opto/compile.hpp"
  45 #include "opto/intrinsicnode.hpp"
  46 #include "opto/node.hpp"
  47 #include "runtime/biasedLocking.hpp"
  48 #include "runtime/icache.hpp"
  49 #include "runtime/interfaceSupport.inline.hpp"
  50 #include "runtime/jniHandles.inline.hpp"
  51 #include "runtime/sharedRuntime.hpp"
  52 #include "runtime/thread.hpp"
  53 
  54 #ifdef PRODUCT
  55 #define BLOCK_COMMENT(str) /* nothing */
  56 #define STOP(error) stop(error)
  57 #else
  58 #define BLOCK_COMMENT(str) block_comment(str)
  59 #define STOP(error) block_comment(error); stop(error)
  60 #endif
  61 
  62 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  63 
  64 // Patch any kind of instruction; there may be several instructions.
  65 // Return the total length (in bytes) of the instructions.
  66 int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  67   int instructions = 1;
  68   assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  69   long offset = (target - branch) >> 2;
  70   unsigned insn = *(unsigned*)branch;
  71   if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
  72     // Load register (literal)
  73     Instruction_aarch64::spatch(branch, 23, 5, offset);
  74   } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
  75     // Unconditional branch (immediate)
  76     Instruction_aarch64::spatch(branch, 25, 0, offset);
  77   } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
  78     // Conditional branch (immediate)
  79     Instruction_aarch64::spatch(branch, 23, 5, offset);
  80   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
  81     // Compare & branch (immediate)
  82     Instruction_aarch64::spatch(branch, 23, 5, offset);
  83   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
  84     // Test & branch (immediate)
  85     Instruction_aarch64::spatch(branch, 18, 5, offset);
  86   } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
  87     // PC-rel. addressing
  88     offset = target-branch;
  89     int shift = Instruction_aarch64::extract(insn, 31, 31);
  90     if (shift) {
  91       u_int64_t dest = (u_int64_t)target;
  92       uint64_t pc_page = (uint64_t)branch >> 12;
  93       uint64_t adr_page = (uint64_t)target >> 12;
  94       unsigned offset_lo = dest & 0xfff;
  95       offset = adr_page - pc_page;
  96 
  97       // We handle 4 types of PC relative addressing
  98       //   1 - adrp    Rx, target_page
  99       //       ldr/str Ry, [Rx, #offset_in_page]
 100       //   2 - adrp    Rx, target_page
 101       //       add     Ry, Rx, #offset_in_page
 102       //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 103       //       movk    Rx, #imm16<<32
 104       //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 105       // In the first 3 cases we must check that Rx is the same in the adrp and the
 106       // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
 107       // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
 108       // to be followed by a random unrelated ldr/str, add or movk instruction.
 109       //
 110       unsigned insn2 = ((unsigned*)branch)[1];
 111       if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
 112                 Instruction_aarch64::extract(insn, 4, 0) ==
 113                         Instruction_aarch64::extract(insn2, 9, 5)) {
 114         // Load/store register (unsigned immediate)
 115         unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
 116         Instruction_aarch64::patch(branch + sizeof (unsigned),
 117                                     21, 10, offset_lo >> size);
 118         guarantee(((dest >> size) << size) == dest, "misaligned target");
 119         instructions = 2;
 120       } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
 121                 Instruction_aarch64::extract(insn, 4, 0) ==
 122                         Instruction_aarch64::extract(insn2, 4, 0)) {
 123         // add (immediate)
 124         Instruction_aarch64::patch(branch + sizeof (unsigned),
 125                                    21, 10, offset_lo);
 126         instructions = 2;
 127       } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
 128                    Instruction_aarch64::extract(insn, 4, 0) ==
 129                      Instruction_aarch64::extract(insn2, 4, 0)) {
 130         // movk #imm16<<32
 131         Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
 132         long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
 133         long pc_page = (long)branch >> 12;
 134         long adr_page = (long)dest >> 12;
 135         offset = adr_page - pc_page;
 136         instructions = 2;
 137       }
 138     }
 139     int offset_lo = offset & 3;
 140     offset >>= 2;
 141     Instruction_aarch64::spatch(branch, 23, 5, offset);
 142     Instruction_aarch64::patch(branch, 30, 29, offset_lo);
 143   } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
 144     u_int64_t dest = (u_int64_t)target;
 145     // Move wide constant
 146     assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
 147     assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
 148     Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
 149     Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
 150     Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
 151     assert(target_addr_for_insn(branch) == target, "should be");
 152     instructions = 3;
 153   } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
 154              Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
 155     // nothing to do
 156     assert(target == 0, "did not expect to relocate target for polling page load");
 157   } else {
 158     ShouldNotReachHere();
 159   }
 160   return instructions * NativeInstruction::instruction_size;
 161 }
 162 
 163 int MacroAssembler::patch_oop(address insn_addr, address o) {
 164   int instructions;
 165   unsigned insn = *(unsigned*)insn_addr;
 166   assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
 167 
 168   // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
 169   // narrow OOPs by setting the upper 16 bits in the first
 170   // instruction.
 171   if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
 172     // Move narrow OOP
 173     narrowOop n = CompressedOops::encode((oop)o);
 174     Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
 175     Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
 176     instructions = 2;
 177   } else {
 178     // Move wide OOP
 179     assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
 180     uintptr_t dest = (uintptr_t)o;
 181     Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
 182     Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
 183     Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
 184     instructions = 3;
 185   }
 186   return instructions * NativeInstruction::instruction_size;
 187 }
 188 
 189 int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
 190   // Metatdata pointers are either narrow (32 bits) or wide (48 bits).
 191   // We encode narrow ones by setting the upper 16 bits in the first
 192   // instruction.
 193   NativeInstruction *insn = nativeInstruction_at(insn_addr);
 194   assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
 195          nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
 196 
 197   Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
 198   Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
 199   return 2 * NativeInstruction::instruction_size;
 200 }
 201 
 202 address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
 203   long offset = 0;
 204   if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
 205     // Load register (literal)
 206     offset = Instruction_aarch64::sextract(insn, 23, 5);
 207     return address(((uint64_t)insn_addr + (offset << 2)));
 208   } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
 209     // Unconditional branch (immediate)
 210     offset = Instruction_aarch64::sextract(insn, 25, 0);
 211   } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
 212     // Conditional branch (immediate)
 213     offset = Instruction_aarch64::sextract(insn, 23, 5);
 214   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
 215     // Compare & branch (immediate)
 216     offset = Instruction_aarch64::sextract(insn, 23, 5);
 217    } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
 218     // Test & branch (immediate)
 219     offset = Instruction_aarch64::sextract(insn, 18, 5);
 220   } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
 221     // PC-rel. addressing
 222     offset = Instruction_aarch64::extract(insn, 30, 29);
 223     offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
 224     int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
 225     if (shift) {
 226       offset <<= shift;
 227       uint64_t target_page = ((uint64_t)insn_addr) + offset;
 228       target_page &= ((uint64_t)-1) << shift;
 229       // Return the target address for the following sequences
 230       //   1 - adrp    Rx, target_page
 231       //       ldr/str Ry, [Rx, #offset_in_page]
 232       //   2 - adrp    Rx, target_page
 233       //       add     Ry, Rx, #offset_in_page
 234       //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 235       //       movk    Rx, #imm12<<32
 236       //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 237       //
 238       // In the first two cases  we check that the register is the same and
 239       // return the target_page + the offset within the page.
 240       // Otherwise we assume it is a page aligned relocation and return
 241       // the target page only.
 242       //
 243       unsigned insn2 = ((unsigned*)insn_addr)[1];
 244       if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
 245                 Instruction_aarch64::extract(insn, 4, 0) ==
 246                         Instruction_aarch64::extract(insn2, 9, 5)) {
 247         // Load/store register (unsigned immediate)
 248         unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
 249         unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
 250         return address(target_page + (byte_offset << size));
 251       } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
 252                 Instruction_aarch64::extract(insn, 4, 0) ==
 253                         Instruction_aarch64::extract(insn2, 4, 0)) {
 254         // add (immediate)
 255         unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
 256         return address(target_page + byte_offset);
 257       } else {
 258         if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110  &&
 259                Instruction_aarch64::extract(insn, 4, 0) ==
 260                  Instruction_aarch64::extract(insn2, 4, 0)) {
 261           target_page = (target_page & 0xffffffff) |
 262                          ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
 263         }
 264         return (address)target_page;
 265       }
 266     } else {
 267       ShouldNotReachHere();
 268     }
 269   } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
 270     u_int32_t *insns = (u_int32_t *)insn_addr;
 271     // Move wide constant: movz, movk, movk.  See movptr().
 272     assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
 273     assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
 274     return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
 275                    + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
 276                    + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
 277   } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
 278              Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
 279     return 0;
 280   } else {
 281     ShouldNotReachHere();
 282   }
 283   return address(((uint64_t)insn_addr + (offset << 2)));
 284 }
 285 
 286 void MacroAssembler::serialize_memory(Register thread, Register tmp) {
 287   dsb(Assembler::SY);
 288 }
 289 
 290 void MacroAssembler::safepoint_poll(Label& slow_path) {
 291   if (SafepointMechanism::uses_thread_local_poll()) {
 292     ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
 293     tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
 294   } else {
 295     unsigned long offset;
 296     adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
 297     ldrw(rscratch1, Address(rscratch1, offset));
 298     assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
 299     cbnz(rscratch1, slow_path);
 300   }
 301 }
 302 
 303 // Just like safepoint_poll, but use an acquiring load for thread-
 304 // local polling.
 305 //
 306 // We need an acquire here to ensure that any subsequent load of the
 307 // global SafepointSynchronize::_state flag is ordered after this load
 308 // of the local Thread::_polling page.  We don't want this poll to
 309 // return false (i.e. not safepointing) and a later poll of the global
 310 // SafepointSynchronize::_state spuriously to return true.
 311 //
 312 // This is to avoid a race when we're in a native->Java transition
 313 // racing the code which wakes up from a safepoint.
 314 //
 315 void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
 316   if (SafepointMechanism::uses_thread_local_poll()) {
 317     lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
 318     ldar(rscratch1, rscratch1);
 319     tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
 320   } else {
 321     safepoint_poll(slow_path);
 322   }
 323 }
 324 
 325 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 326   // we must set sp to zero to clear frame
 327   str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));
 328 
 329   // must clear fp, so that compiled frames are not confused; it is
 330   // possible that we need it only for debugging
 331   if (clear_fp) {
 332     str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
 333   }
 334 
 335   // Always clear the pc because it could have been set by make_walkable()
 336   str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
 337 }
 338 
 339 // Calls to C land
 340 //
 341 // When entering C land, the rfp, & resp of the last Java frame have to be recorded
 342 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
 343 // has to be reset to 0. This is required to allow proper stack traversal.
 344 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 345                                          Register last_java_fp,
 346                                          Register last_java_pc,
 347                                          Register scratch) {
 348 
 349   if (last_java_pc->is_valid()) {
 350       str(last_java_pc, Address(rthread,
 351                                 JavaThread::frame_anchor_offset()
 352                                 + JavaFrameAnchor::last_Java_pc_offset()));
 353     }
 354 
 355   // determine last_java_sp register
 356   if (last_java_sp == sp) {
 357     mov(scratch, sp);
 358     last_java_sp = scratch;
 359   } else if (!last_java_sp->is_valid()) {
 360     last_java_sp = esp;
 361   }
 362 
 363   str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));
 364 
 365   // last_java_fp is optional
 366   if (last_java_fp->is_valid()) {
 367     str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
 368   }
 369 }
 370 
 371 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 372                                          Register last_java_fp,
 373                                          address  last_java_pc,
 374                                          Register scratch) {
 375   if (last_java_pc != NULL) {
 376     adr(scratch, last_java_pc);
 377   } else {
 378     // FIXME: This is almost never correct.  We should delete all
 379     // cases of set_last_Java_frame with last_java_pc=NULL and use the
 380     // correct return address instead.
 381     adr(scratch, pc());
 382   }
 383 
 384   str(scratch, Address(rthread,
 385                        JavaThread::frame_anchor_offset()
 386                        + JavaFrameAnchor::last_Java_pc_offset()));
 387 
 388   set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
 389 }
 390 
 391 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 392                                          Register last_java_fp,
 393                                          Label &L,
 394                                          Register scratch) {
 395   if (L.is_bound()) {
 396     set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
 397   } else {
 398     InstructionMark im(this);
 399     L.add_patch_at(code(), locator());
 400     set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
 401   }
 402 }
 403 
 404 void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
 405   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
 406   assert(CodeCache::find_blob(entry.target()) != NULL,
 407          "destination of far call not found in code cache");
 408   if (far_branches()) {
 409     unsigned long offset;
 410     // We can use ADRP here because we know that the total size of
 411     // the code cache cannot exceed 2Gb.
 412     adrp(tmp, entry, offset);
 413     add(tmp, tmp, offset);
 414     if (cbuf) cbuf->set_insts_mark();
 415     blr(tmp);
 416   } else {
 417     if (cbuf) cbuf->set_insts_mark();
 418     bl(entry);
 419   }
 420 }
 421 
 422 void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
 423   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
 424   assert(CodeCache::find_blob(entry.target()) != NULL,
 425          "destination of far call not found in code cache");
 426   if (far_branches()) {
 427     unsigned long offset;
 428     // We can use ADRP here because we know that the total size of
 429     // the code cache cannot exceed 2Gb.
 430     adrp(tmp, entry, offset);
 431     add(tmp, tmp, offset);
 432     if (cbuf) cbuf->set_insts_mark();
 433     br(tmp);
 434   } else {
 435     if (cbuf) cbuf->set_insts_mark();
 436     b(entry);
 437   }
 438 }
 439 
 440 void MacroAssembler::reserved_stack_check() {
 441     // testing if reserved zone needs to be enabled
 442     Label no_reserved_zone_enabling;
 443 
 444     ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
 445     cmp(sp, rscratch1);
 446     br(Assembler::LO, no_reserved_zone_enabling);
 447 
 448     enter();   // LR and FP are live.
 449     lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
 450     mov(c_rarg0, rthread);
 451     blr(rscratch1);
 452     leave();
 453 
 454     // We have already removed our own frame.
 455     // throw_delayed_StackOverflowError will think that it's been
 456     // called by our caller.
 457     lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
 458     br(rscratch1);
 459     should_not_reach_here();
 460 
 461     bind(no_reserved_zone_enabling);
 462 }
 463 
 464 int MacroAssembler::biased_locking_enter(Register lock_reg,
 465                                          Register obj_reg,
 466                                          Register swap_reg,
 467                                          Register tmp_reg,
 468                                          bool swap_reg_contains_mark,
 469                                          Label& done,
 470                                          Label* slow_case,
 471                                          BiasedLockingCounters* counters) {
 472   assert(UseBiasedLocking, "why call this otherwise?");
 473   assert_different_registers(lock_reg, obj_reg, swap_reg);
 474 
 475   if (PrintBiasedLockingStatistics && counters == NULL)
 476     counters = BiasedLocking::counters();
 477 
 478   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
 479   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
 480   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
 481   Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
 482   Address saved_mark_addr(lock_reg, 0);
 483 
 484   // Biased locking
 485   // See whether the lock is currently biased toward our thread and
 486   // whether the epoch is still valid
 487   // Note that the runtime guarantees sufficient alignment of JavaThread
 488   // pointers to allow age to be placed into low bits
 489   // First check to see whether biasing is even enabled for this object
 490   Label cas_label;
 491   int null_check_offset = -1;
 492   if (!swap_reg_contains_mark) {
 493     null_check_offset = offset();
 494     ldr(swap_reg, mark_addr);
 495   }
 496   andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
 497   cmp(tmp_reg, markOopDesc::biased_lock_pattern);
 498   br(Assembler::NE, cas_label);
 499   // The bias pattern is present in the object's header. Need to check
 500   // whether the bias owner and the epoch are both still current.
 501   load_prototype_header(tmp_reg, obj_reg);
 502   orr(tmp_reg, tmp_reg, rthread);
 503   eor(tmp_reg, swap_reg, tmp_reg);
 504   andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
 505   if (counters != NULL) {
 506     Label around;
 507     cbnz(tmp_reg, around);
 508     atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
 509     b(done);
 510     bind(around);
 511   } else {
 512     cbz(tmp_reg, done);
 513   }
 514 
 515   Label try_revoke_bias;
 516   Label try_rebias;
 517 
 518   // At this point we know that the header has the bias pattern and
 519   // that we are not the bias owner in the current epoch. We need to
 520   // figure out more details about the state of the header in order to
 521   // know what operations can be legally performed on the object's
 522   // header.
 523 
 524   // If the low three bits in the xor result aren't clear, that means
 525   // the prototype header is no longer biased and we have to revoke
 526   // the bias on this object.
 527   andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
 528   cbnz(rscratch1, try_revoke_bias);
 529 
 530   // Biasing is still enabled for this data type. See whether the
 531   // epoch of the current bias is still valid, meaning that the epoch
 532   // bits of the mark word are equal to the epoch bits of the
 533   // prototype header. (Note that the prototype header's epoch bits
 534   // only change at a safepoint.) If not, attempt to rebias the object
 535   // toward the current thread. Note that we must be absolutely sure
 536   // that the current epoch is invalid in order to do this because
 537   // otherwise the manipulations it performs on the mark word are
 538   // illegal.
 539   andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
 540   cbnz(rscratch1, try_rebias);
 541 
 542   // The epoch of the current bias is still valid but we know nothing
 543   // about the owner; it might be set or it might be clear. Try to
 544   // acquire the bias of the object using an atomic operation. If this
 545   // fails we will go in to the runtime to revoke the object's bias.
 546   // Note that we first construct the presumed unbiased header so we
 547   // don't accidentally blow away another thread's valid bias.
 548   {
 549     Label here;
 550     mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
 551     andr(swap_reg, swap_reg, rscratch1);
 552     orr(tmp_reg, swap_reg, rthread);
 553     cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
 554     // If the biasing toward our thread failed, this means that
 555     // another thread succeeded in biasing it toward itself and we
 556     // need to revoke that bias. The revocation will occur in the
 557     // interpreter runtime in the slow case.
 558     bind(here);
 559     if (counters != NULL) {
 560       atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
 561                   tmp_reg, rscratch1, rscratch2);
 562     }
 563   }
 564   b(done);
 565 
 566   bind(try_rebias);
 567   // At this point we know the epoch has expired, meaning that the
 568   // current "bias owner", if any, is actually invalid. Under these
 569   // circumstances _only_, we are allowed to use the current header's
 570   // value as the comparison value when doing the cas to acquire the
 571   // bias in the current epoch. In other words, we allow transfer of
 572   // the bias from one thread to another directly in this situation.
 573   //
 574   // FIXME: due to a lack of registers we currently blow away the age
 575   // bits in this situation. Should attempt to preserve them.
 576   {
 577     Label here;
 578     load_prototype_header(tmp_reg, obj_reg);
 579     orr(tmp_reg, rthread, tmp_reg);
 580     cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
 581     // If the biasing toward our thread failed, then another thread
 582     // succeeded in biasing it toward itself and we need to revoke that
 583     // bias. The revocation will occur in the runtime in the slow case.
 584     bind(here);
 585     if (counters != NULL) {
 586       atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
 587                   tmp_reg, rscratch1, rscratch2);
 588     }
 589   }
 590   b(done);
 591 
 592   bind(try_revoke_bias);
 593   // The prototype mark in the klass doesn't have the bias bit set any
 594   // more, indicating that objects of this data type are not supposed
 595   // to be biased any more. We are going to try to reset the mark of
 596   // this object to the prototype value and fall through to the
 597   // CAS-based locking scheme. Note that if our CAS fails, it means
 598   // that another thread raced us for the privilege of revoking the
 599   // bias of this particular object, so it's okay to continue in the
 600   // normal locking code.
 601   //
 602   // FIXME: due to a lack of registers we currently blow away the age
 603   // bits in this situation. Should attempt to preserve them.
 604   {
 605     Label here, nope;
 606     load_prototype_header(tmp_reg, obj_reg);
 607     cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
 608     bind(here);
 609 
 610     // Fall through to the normal CAS-based lock, because no matter what
 611     // the result of the above CAS, some thread must have succeeded in
 612     // removing the bias bit from the object's header.
 613     if (counters != NULL) {
 614       atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
 615                   rscratch1, rscratch2);
 616     }
 617     bind(nope);
 618   }
 619 
 620   bind(cas_label);
 621 
 622   return null_check_offset;
 623 }
 624 
 625 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
 626   assert(UseBiasedLocking, "why call this otherwise?");
 627 
 628   // Check for biased locking unlock case, which is a no-op
 629   // Note: we do not have to check the thread ID for two reasons.
 630   // First, the interpreter checks for IllegalMonitorStateException at
 631   // a higher level. Second, if the bias was revoked while we held the
 632   // lock, the object could not be rebiased toward another thread, so
 633   // the bias bit would be clear.
 634   ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
 635   andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
 636   cmp(temp_reg, markOopDesc::biased_lock_pattern);
 637   br(Assembler::EQ, done);
 638 }
 639 
 640 static void pass_arg0(MacroAssembler* masm, Register arg) {
 641   if (c_rarg0 != arg ) {
 642     masm->mov(c_rarg0, arg);
 643   }
 644 }
 645 
 646 static void pass_arg1(MacroAssembler* masm, Register arg) {
 647   if (c_rarg1 != arg ) {
 648     masm->mov(c_rarg1, arg);
 649   }
 650 }
 651 
 652 static void pass_arg2(MacroAssembler* masm, Register arg) {
 653   if (c_rarg2 != arg ) {
 654     masm->mov(c_rarg2, arg);
 655   }
 656 }
 657 
 658 static void pass_arg3(MacroAssembler* masm, Register arg) {
 659   if (c_rarg3 != arg ) {
 660     masm->mov(c_rarg3, arg);
 661   }
 662 }
 663 
 664 void MacroAssembler::call_VM_base(Register oop_result,
 665                                   Register java_thread,
 666                                   Register last_java_sp,
 667                                   address  entry_point,
 668                                   int      number_of_arguments,
 669                                   bool     check_exceptions) {
 670    // determine java_thread register
 671   if (!java_thread->is_valid()) {
 672     java_thread = rthread;
 673   }
 674 
 675   // determine last_java_sp register
 676   if (!last_java_sp->is_valid()) {
 677     last_java_sp = esp;
 678   }
 679 
 680   // debugging support
 681   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
 682   assert(java_thread == rthread, "unexpected register");
 683 #ifdef ASSERT
 684   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
 685   // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
 686 #endif // ASSERT
 687 
 688   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
 689   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
 690 
 691   // push java thread (becomes first argument of C function)
 692 
 693   mov(c_rarg0, java_thread);
 694 
 695   // set last Java frame before call
 696   assert(last_java_sp != rfp, "can't use rfp");
 697 
 698   Label l;
 699   set_last_Java_frame(last_java_sp, rfp, l, rscratch1);
 700 
 701   // do the call, remove parameters
 702   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
 703 
 704   // reset last Java frame
 705   // Only interpreter should have to clear fp
 706   reset_last_Java_frame(true);
 707 
 708    // C++ interp handles this in the interpreter
 709   check_and_handle_popframe(java_thread);
 710   check_and_handle_earlyret(java_thread);
 711 
 712   if (check_exceptions) {
 713     // check for pending exceptions (java_thread is set upon return)
 714     ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
 715     Label ok;
 716     cbz(rscratch1, ok);
 717     lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
 718     br(rscratch1);
 719     bind(ok);
 720   }
 721 
 722   // get oop result if there is one and reset the value in the thread
 723   if (oop_result->is_valid()) {
 724     get_vm_result(oop_result, java_thread);
 725   }
 726 }
 727 
 728 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
 729   call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
 730 }
 731 
 732 // Maybe emit a call via a trampoline.  If the code cache is small
 733 // trampolines won't be emitted.
 734 
 735 address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
 736   assert(JavaThread::current()->is_Compiler_thread(), "just checking");
 737   assert(entry.rspec().type() == relocInfo::runtime_call_type
 738          || entry.rspec().type() == relocInfo::opt_virtual_call_type
 739          || entry.rspec().type() == relocInfo::static_call_type
 740          || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
 741 
 742   unsigned int start_offset = offset();
 743   if (far_branches() && !Compile::current()->in_scratch_emit_size()) {
 744     address stub = emit_trampoline_stub(start_offset, entry.target());
 745     if (stub == NULL) {
 746       return NULL; // CodeCache is full
 747     }
 748   }
 749 
 750   if (cbuf) cbuf->set_insts_mark();
 751   relocate(entry.rspec());
 752   if (!far_branches()) {
 753     bl(entry.target());
 754   } else {
 755     bl(pc());
 756   }
 757   // just need to return a non-null address
 758   return pc();
 759 }
 760 
 761 
 762 // Emit a trampoline stub for a call to a target which is too far away.
 763 //
 764 // code sequences:
 765 //
 766 // call-site:
 767 //   branch-and-link to <destination> or <trampoline stub>
 768 //
 769 // Related trampoline stub for this call site in the stub section:
 770 //   load the call target from the constant pool
 771 //   branch (LR still points to the call site above)
 772 
 773 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
 774                                              address dest) {
 775   address stub = start_a_stub(Compile::MAX_stubs_size/2);
 776   if (stub == NULL) {
 777     return NULL;  // CodeBuffer::expand failed
 778   }
 779 
 780   // Create a trampoline stub relocation which relates this trampoline stub
 781   // with the call instruction at insts_call_instruction_offset in the
 782   // instructions code-section.
 783   align(wordSize);
 784   relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
 785                                             + insts_call_instruction_offset));
 786   const int stub_start_offset = offset();
 787 
 788   // Now, create the trampoline stub's code:
 789   // - load the call
 790   // - call
 791   Label target;
 792   ldr(rscratch1, target);
 793   br(rscratch1);
 794   bind(target);
 795   assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
 796          "should be");
 797   emit_int64((int64_t)dest);
 798 
 799   const address stub_start_addr = addr_at(stub_start_offset);
 800 
 801   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
 802 
 803   end_a_stub();
 804   return stub_start_addr;
 805 }
 806 
 807 address MacroAssembler::ic_call(address entry, jint method_index) {
 808   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
 809   // address const_ptr = long_constant((jlong)Universe::non_oop_word());
 810   // unsigned long offset;
 811   // ldr_constant(rscratch2, const_ptr);
 812   movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
 813   return trampoline_call(Address(entry, rh));
 814 }
 815 
 816 // Implementation of call_VM versions
 817 
 818 void MacroAssembler::call_VM(Register oop_result,
 819                              address entry_point,
 820                              bool check_exceptions) {
 821   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
 822 }
 823 
 824 void MacroAssembler::call_VM(Register oop_result,
 825                              address entry_point,
 826                              Register arg_1,
 827                              bool check_exceptions) {
 828   pass_arg1(this, arg_1);
 829   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
 830 }
 831 
 832 void MacroAssembler::call_VM(Register oop_result,
 833                              address entry_point,
 834                              Register arg_1,
 835                              Register arg_2,
 836                              bool check_exceptions) {
 837   assert(arg_1 != c_rarg2, "smashed arg");
 838   pass_arg2(this, arg_2);
 839   pass_arg1(this, arg_1);
 840   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
 841 }
 842 
 843 void MacroAssembler::call_VM(Register oop_result,
 844                              address entry_point,
 845                              Register arg_1,
 846                              Register arg_2,
 847                              Register arg_3,
 848                              bool check_exceptions) {
 849   assert(arg_1 != c_rarg3, "smashed arg");
 850   assert(arg_2 != c_rarg3, "smashed arg");
 851   pass_arg3(this, arg_3);
 852 
 853   assert(arg_1 != c_rarg2, "smashed arg");
 854   pass_arg2(this, arg_2);
 855 
 856   pass_arg1(this, arg_1);
 857   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
 858 }
 859 
 860 void MacroAssembler::call_VM(Register oop_result,
 861                              Register last_java_sp,
 862                              address entry_point,
 863                              int number_of_arguments,
 864                              bool check_exceptions) {
 865   call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
 866 }
 867 
 868 void MacroAssembler::call_VM(Register oop_result,
 869                              Register last_java_sp,
 870                              address entry_point,
 871                              Register arg_1,
 872                              bool check_exceptions) {
 873   pass_arg1(this, arg_1);
 874   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
 875 }
 876 
 877 void MacroAssembler::call_VM(Register oop_result,
 878                              Register last_java_sp,
 879                              address entry_point,
 880                              Register arg_1,
 881                              Register arg_2,
 882                              bool check_exceptions) {
 883 
 884   assert(arg_1 != c_rarg2, "smashed arg");
 885   pass_arg2(this, arg_2);
 886   pass_arg1(this, arg_1);
 887   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
 888 }
 889 
 890 void MacroAssembler::call_VM(Register oop_result,
 891                              Register last_java_sp,
 892                              address entry_point,
 893                              Register arg_1,
 894                              Register arg_2,
 895                              Register arg_3,
 896                              bool check_exceptions) {
 897   assert(arg_1 != c_rarg3, "smashed arg");
 898   assert(arg_2 != c_rarg3, "smashed arg");
 899   pass_arg3(this, arg_3);
 900   assert(arg_1 != c_rarg2, "smashed arg");
 901   pass_arg2(this, arg_2);
 902   pass_arg1(this, arg_1);
 903   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
 904 }
 905 
 906 
 907 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
 908   ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
 909   str(zr, Address(java_thread, JavaThread::vm_result_offset()));
 910   verify_oop(oop_result, "broken oop in call_VM_base");
 911 }
 912 
 913 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
 914   ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
 915   str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
 916 }
 917 
 918 void MacroAssembler::align(int modulus) {
 919   while (offset() % modulus != 0) nop();
 920 }
 921 
 922 // these are no-ops overridden by InterpreterMacroAssembler
 923 
 924 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }
 925 
 926 void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
 927 
 928 
 929 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 930                                                       Register tmp,
 931                                                       int offset) {
 932   intptr_t value = *delayed_value_addr;
 933   if (value != 0)
 934     return RegisterOrConstant(value + offset);
 935 
 936   // load indirectly to solve generation ordering problem
 937   ldr(tmp, ExternalAddress((address) delayed_value_addr));
 938 
 939   if (offset != 0)
 940     add(tmp, tmp, offset);
 941 
 942   return RegisterOrConstant(tmp);
 943 }
 944 
 945 
 946 void MacroAssembler:: notify(int type) {
 947   if (type == bytecode_start) {
 948     // set_last_Java_frame(esp, rfp, (address)NULL);
 949     Assembler:: notify(type);
 950     // reset_last_Java_frame(true);
 951   }
 952   else
 953     Assembler:: notify(type);
 954 }
 955 
 956 // Look up the method for a megamorphic invokeinterface call.
 957 // The target method is determined by <intf_klass, itable_index>.
 958 // The receiver klass is in recv_klass.
 959 // On success, the result will be in method_result, and execution falls through.
 960 // On failure, execution transfers to the given label.
 961 void MacroAssembler::lookup_interface_method(Register recv_klass,
 962                                              Register intf_klass,
 963                                              RegisterOrConstant itable_index,
 964                                              Register method_result,
 965                                              Register scan_temp,
 966                                              Label& L_no_such_interface,
 967                          bool return_method) {
 968   assert_different_registers(recv_klass, intf_klass, scan_temp);
 969   assert_different_registers(method_result, intf_klass, scan_temp);
 970   assert(recv_klass != method_result || !return_method,
 971      "recv_klass can be destroyed when method isn't needed");
 972   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
 973          "caller must use same register for non-constant itable index as for method");
 974 
 975   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
 976   int vtable_base = in_bytes(Klass::vtable_start_offset());
 977   int itentry_off = itableMethodEntry::method_offset_in_bytes();
 978   int scan_step   = itableOffsetEntry::size() * wordSize;
 979   int vte_size    = vtableEntry::size_in_bytes();
 980   assert(vte_size == wordSize, "else adjust times_vte_scale");
 981 
 982   ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
 983 
 984   // %%% Could store the aligned, prescaled offset in the klassoop.
 985   // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
 986   lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
 987   add(scan_temp, scan_temp, vtable_base);
 988 
 989   if (return_method) {
 990     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
 991     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
 992     // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
 993     lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
 994     if (itentry_off)
 995       add(recv_klass, recv_klass, itentry_off);
 996   }
 997 
 998   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
 999   //   if (scan->interface() == intf) {
1000   //     result = (klass + scan->offset() + itable_index);
1001   //   }
1002   // }
1003   Label search, found_method;
1004 
1005   for (int peel = 1; peel >= 0; peel--) {
1006     ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
1007     cmp(intf_klass, method_result);
1008 
1009     if (peel) {
1010       br(Assembler::EQ, found_method);
1011     } else {
1012       br(Assembler::NE, search);
1013       // (invert the test to fall through to found_method...)
1014     }
1015 
1016     if (!peel)  break;
1017 
1018     bind(search);
1019 
1020     // Check that the previous entry is non-null.  A null entry means that
1021     // the receiver class doesn't implement the interface, and wasn't the
1022     // same as when the caller was compiled.
1023     cbz(method_result, L_no_such_interface);
1024     add(scan_temp, scan_temp, scan_step);
1025   }
1026 
1027   bind(found_method);
1028 
1029   // Got a hit.
1030   if (return_method) {
1031     ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
1032     ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
1033   }
1034 }
1035 
1036 // virtual method calling
1037 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1038                                            RegisterOrConstant vtable_index,
1039                                            Register method_result) {
1040   const int base = in_bytes(Klass::vtable_start_offset());
1041   assert(vtableEntry::size() * wordSize == 8,
1042          "adjust the scaling in the code below");
1043   int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();
1044 
1045   if (vtable_index.is_register()) {
1046     lea(method_result, Address(recv_klass,
1047                                vtable_index.as_register(),
1048                                Address::lsl(LogBytesPerWord)));
1049     ldr(method_result, Address(method_result, vtable_offset_in_bytes));
1050   } else {
1051     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
1052     ldr(method_result,
1053         form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
1054   }
1055 }
1056 
1057 void MacroAssembler::check_klass_subtype(Register sub_klass,
1058                            Register super_klass,
1059                            Register temp_reg,
1060                            Label& L_success) {
1061   Label L_failure;
1062   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
1063   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
1064   bind(L_failure);
1065 }
1066 
1067 
1068 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1069                                                    Register super_klass,
1070                                                    Register temp_reg,
1071                                                    Label* L_success,
1072                                                    Label* L_failure,
1073                                                    Label* L_slow_path,
1074                                         RegisterOrConstant super_check_offset) {
1075   assert_different_registers(sub_klass, super_klass, temp_reg);
1076   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1077   if (super_check_offset.is_register()) {
1078     assert_different_registers(sub_klass, super_klass,
1079                                super_check_offset.as_register());
1080   } else if (must_load_sco) {
1081     assert(temp_reg != noreg, "supply either a temp or a register offset");
1082   }
1083 
1084   Label L_fallthrough;
1085   int label_nulls = 0;
1086   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1087   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1088   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1089   assert(label_nulls <= 1, "at most one NULL in the batch");
1090 
1091   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1092   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1093   Address super_check_offset_addr(super_klass, sco_offset);
1094 
1095   // Hacked jmp, which may only be used just before L_fallthrough.
1096 #define final_jmp(label)                                                \
1097   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
1098   else                            b(label)                /*omit semi*/
1099 
1100   // If the pointers are equal, we are done (e.g., String[] elements).
1101   // This self-check enables sharing of secondary supertype arrays among
1102   // non-primary types such as array-of-interface.  Otherwise, each such
1103   // type would need its own customized SSA.
1104   // We move this check to the front of the fast path because many
1105   // type checks are in fact trivially successful in this manner,
1106   // so we get a nicely predicted branch right at the start of the check.
1107   cmp(sub_klass, super_klass);
1108   br(Assembler::EQ, *L_success);
1109 
1110   // Check the supertype display:
1111   if (must_load_sco) {
1112     ldrw(temp_reg, super_check_offset_addr);
1113     super_check_offset = RegisterOrConstant(temp_reg);
1114   }
1115   Address super_check_addr(sub_klass, super_check_offset);
1116   ldr(rscratch1, super_check_addr);
1117   cmp(super_klass, rscratch1); // load displayed supertype
1118 
1119   // This check has worked decisively for primary supers.
1120   // Secondary supers are sought in the super_cache ('super_cache_addr').
1121   // (Secondary supers are interfaces and very deeply nested subtypes.)
1122   // This works in the same check above because of a tricky aliasing
1123   // between the super_cache and the primary super display elements.
1124   // (The 'super_check_addr' can address either, as the case requires.)
1125   // Note that the cache is updated below if it does not help us find
1126   // what we need immediately.
1127   // So if it was a primary super, we can just fail immediately.
1128   // Otherwise, it's the slow path for us (no success at this point).
1129 
1130   if (super_check_offset.is_register()) {
1131     br(Assembler::EQ, *L_success);
1132     cmp(super_check_offset.as_register(), sc_offset);
1133     if (L_failure == &L_fallthrough) {
1134       br(Assembler::EQ, *L_slow_path);
1135     } else {
1136       br(Assembler::NE, *L_failure);
1137       final_jmp(*L_slow_path);
1138     }
1139   } else if (super_check_offset.as_constant() == sc_offset) {
1140     // Need a slow path; fast failure is impossible.
1141     if (L_slow_path == &L_fallthrough) {
1142       br(Assembler::EQ, *L_success);
1143     } else {
1144       br(Assembler::NE, *L_slow_path);
1145       final_jmp(*L_success);
1146     }
1147   } else {
1148     // No slow path; it's a fast decision.
1149     if (L_failure == &L_fallthrough) {
1150       br(Assembler::EQ, *L_success);
1151     } else {
1152       br(Assembler::NE, *L_failure);
1153       final_jmp(*L_success);
1154     }
1155   }
1156 
1157   bind(L_fallthrough);
1158 
1159 #undef final_jmp
1160 }
1161 
1162 // These two are taken from x86, but they look generally useful
1163 
1164 // scans count pointer sized words at [addr] for occurence of value,
1165 // generic
1166 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
1167                                 Register scratch) {
1168   Label Lloop, Lexit;
1169   cbz(count, Lexit);
1170   bind(Lloop);
1171   ldr(scratch, post(addr, wordSize));
1172   cmp(value, scratch);
1173   br(EQ, Lexit);
1174   sub(count, count, 1);
1175   cbnz(count, Lloop);
1176   bind(Lexit);
1177 }
1178 
1179 // scans count 4 byte words at [addr] for occurence of value,
1180 // generic
1181 void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
1182                                 Register scratch) {
1183   Label Lloop, Lexit;
1184   cbz(count, Lexit);
1185   bind(Lloop);
1186   ldrw(scratch, post(addr, wordSize));
1187   cmpw(value, scratch);
1188   br(EQ, Lexit);
1189   sub(count, count, 1);
1190   cbnz(count, Lloop);
1191   bind(Lexit);
1192 }
1193 
1194 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1195                                                    Register super_klass,
1196                                                    Register temp_reg,
1197                                                    Register temp2_reg,
1198                                                    Label* L_success,
1199                                                    Label* L_failure,
1200                                                    bool set_cond_codes) {
1201   assert_different_registers(sub_klass, super_klass, temp_reg);
1202   if (temp2_reg != noreg)
1203     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1204 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1205 
1206   Label L_fallthrough;
1207   int label_nulls = 0;
1208   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1209   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1210   assert(label_nulls <= 1, "at most one NULL in the batch");
1211 
1212   // a couple of useful fields in sub_klass:
1213   int ss_offset = in_bytes(Klass::secondary_supers_offset());
1214   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1215   Address secondary_supers_addr(sub_klass, ss_offset);
1216   Address super_cache_addr(     sub_klass, sc_offset);
1217 
1218   BLOCK_COMMENT("check_klass_subtype_slow_path");
1219 
1220   // Do a linear scan of the secondary super-klass chain.
1221   // This code is rarely used, so simplicity is a virtue here.
1222   // The repne_scan instruction uses fixed registers, which we must spill.
1223   // Don't worry too much about pre-existing connections with the input regs.
1224 
1225   assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
1226   assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)
1227 
1228   RegSet pushed_registers;
1229   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1230   if (!IS_A_TEMP(r5))    pushed_registers += r5;
1231 
1232   if (super_klass != r0 || UseCompressedOops) {
1233     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1234   }
1235 
1236   push(pushed_registers, sp);
1237 
1238   // Get super_klass value into r0 (even if it was in r5 or r2).
1239   if (super_klass != r0) {
1240     mov(r0, super_klass);
1241   }
1242 
1243 #ifndef PRODUCT
1244   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1245   Address pst_counter_addr(rscratch2);
1246   ldr(rscratch1, pst_counter_addr);
1247   add(rscratch1, rscratch1, 1);
1248   str(rscratch1, pst_counter_addr);
1249 #endif //PRODUCT
1250 
1251   // We will consult the secondary-super array.
1252   ldr(r5, secondary_supers_addr);
1253   // Load the array length.
1254   ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1255   // Skip to start of data.
1256   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1257 
1258   cmp(sp, zr); // Clear Z flag; SP is never zero
1259   // Scan R2 words at [R5] for an occurrence of R0.
1260   // Set NZ/Z based on last compare.
1261   repne_scan(r5, r0, r2, rscratch1);
1262 
1263   // Unspill the temp. registers:
1264   pop(pushed_registers, sp);
1265 
1266   br(Assembler::NE, *L_failure);
1267 
1268   // Success.  Cache the super we found and proceed in triumph.
1269   str(super_klass, super_cache_addr);
1270 
1271   if (L_success != &L_fallthrough) {
1272     b(*L_success);
1273   }
1274 
1275 #undef IS_A_TEMP
1276 
1277   bind(L_fallthrough);
1278 }
1279 
1280 
1281 void MacroAssembler::verify_oop(Register reg, const char* s) {
1282   if (!VerifyOops) return;
1283 
1284   // Pass register number to verify_oop_subroutine
1285   const char* b = NULL;
1286   {
1287     ResourceMark rm;
1288     stringStream ss;
1289     ss.print("verify_oop: %s: %s", reg->name(), s);
1290     b = code_string(ss.as_string());
1291   }
1292   BLOCK_COMMENT("verify_oop {");
1293 
1294   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1295   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1296 
1297   mov(r0, reg);
1298   mov(rscratch1, (address)b);
1299 
1300   // call indirectly to solve generation ordering problem
1301   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1302   ldr(rscratch2, Address(rscratch2));
1303   blr(rscratch2);
1304 
1305   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1306   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1307 
1308   BLOCK_COMMENT("} verify_oop");
1309 }
1310 
1311 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1312   if (!VerifyOops) return;
1313 
1314   const char* b = NULL;
1315   {
1316     ResourceMark rm;
1317     stringStream ss;
1318     ss.print("verify_oop_addr: %s", s);
1319     b = code_string(ss.as_string());
1320   }
1321   BLOCK_COMMENT("verify_oop_addr {");
1322 
1323   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1324   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1325 
1326   // addr may contain sp so we will have to adjust it based on the
1327   // pushes that we just did.
1328   if (addr.uses(sp)) {
1329     lea(r0, addr);
1330     ldr(r0, Address(r0, 4 * wordSize));
1331   } else {
1332     ldr(r0, addr);
1333   }
1334   mov(rscratch1, (address)b);
1335 
1336   // call indirectly to solve generation ordering problem
1337   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1338   ldr(rscratch2, Address(rscratch2));
1339   blr(rscratch2);
1340 
1341   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1342   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1343 
1344   BLOCK_COMMENT("} verify_oop_addr");
1345 }
1346 
1347 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1348                                          int extra_slot_offset) {
1349   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1350   int stackElementSize = Interpreter::stackElementSize;
1351   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1352 #ifdef ASSERT
1353   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1354   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1355 #endif
1356   if (arg_slot.is_constant()) {
1357     return Address(esp, arg_slot.as_constant() * stackElementSize
1358                    + offset);
1359   } else {
1360     add(rscratch1, esp, arg_slot.as_register(),
1361         ext::uxtx, exact_log2(stackElementSize));
1362     return Address(rscratch1, offset);
1363   }
1364 }
1365 
1366 void MacroAssembler::call_VM_leaf_base(address entry_point,
1367                                        int number_of_arguments,
1368                                        Label *retaddr) {
1369   call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
1370 }
1371 
1372 void MacroAssembler::call_VM_leaf_base1(address entry_point,
1373                                         int number_of_gp_arguments,
1374                                         int number_of_fp_arguments,
1375                                         ret_type type,
1376                                         Label *retaddr) {
1377   Label E, L;
1378 
1379   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1380 
1381   // We add 1 to number_of_arguments because the thread in arg0 is
1382   // not counted
1383   mov(rscratch1, entry_point);
1384   blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1385   if (retaddr)
1386     bind(*retaddr);
1387 
1388   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1389   maybe_isb();
1390 }
1391 
1392 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1393   call_VM_leaf_base(entry_point, number_of_arguments);
1394 }
1395 
1396 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1397   pass_arg0(this, arg_0);
1398   call_VM_leaf_base(entry_point, 1);
1399 }
1400 
1401 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1402   pass_arg0(this, arg_0);
1403   pass_arg1(this, arg_1);
1404   call_VM_leaf_base(entry_point, 2);
1405 }
1406 
1407 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1408                                   Register arg_1, Register arg_2) {
1409   pass_arg0(this, arg_0);
1410   pass_arg1(this, arg_1);
1411   pass_arg2(this, arg_2);
1412   call_VM_leaf_base(entry_point, 3);
1413 }
1414 
1415 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1416   pass_arg0(this, arg_0);
1417   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1418 }
1419 
1420 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1421 
1422   assert(arg_0 != c_rarg1, "smashed arg");
1423   pass_arg1(this, arg_1);
1424   pass_arg0(this, arg_0);
1425   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1426 }
1427 
1428 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1429   assert(arg_0 != c_rarg2, "smashed arg");
1430   assert(arg_1 != c_rarg2, "smashed arg");
1431   pass_arg2(this, arg_2);
1432   assert(arg_0 != c_rarg1, "smashed arg");
1433   pass_arg1(this, arg_1);
1434   pass_arg0(this, arg_0);
1435   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1436 }
1437 
1438 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1439   assert(arg_0 != c_rarg3, "smashed arg");
1440   assert(arg_1 != c_rarg3, "smashed arg");
1441   assert(arg_2 != c_rarg3, "smashed arg");
1442   pass_arg3(this, arg_3);
1443   assert(arg_0 != c_rarg2, "smashed arg");
1444   assert(arg_1 != c_rarg2, "smashed arg");
1445   pass_arg2(this, arg_2);
1446   assert(arg_0 != c_rarg1, "smashed arg");
1447   pass_arg1(this, arg_1);
1448   pass_arg0(this, arg_0);
1449   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1450 }
1451 
1452 void MacroAssembler::null_check(Register reg, int offset) {
1453   if (needs_explicit_null_check(offset)) {
1454     // provoke OS NULL exception if reg = NULL by
1455     // accessing M[reg] w/o changing any registers
1456     // NOTE: this is plenty to provoke a segv
1457     ldr(zr, Address(reg));
1458   } else {
1459     // nothing to do, (later) access of M[reg + offset]
1460     // will provoke OS NULL exception if reg = NULL
1461   }
1462 }
1463 
1464 // MacroAssembler protected routines needed to implement
1465 // public methods
1466 
1467 void MacroAssembler::mov(Register r, Address dest) {
1468   code_section()->relocate(pc(), dest.rspec());
1469   u_int64_t imm64 = (u_int64_t)dest.target();
1470   movptr(r, imm64);
1471 }
1472 
1473 // Move a constant pointer into r.  In AArch64 mode the virtual
1474 // address space is 48 bits in size, so we only need three
1475 // instructions to create a patchable instruction sequence that can
1476 // reach anywhere.
1477 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1478 #ifndef PRODUCT
1479   {
1480     char buffer[64];
1481     snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64);
1482     block_comment(buffer);
1483   }
1484 #endif
1485   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1486   movz(r, imm64 & 0xffff);
1487   imm64 >>= 16;
1488   movk(r, imm64 & 0xffff, 16);
1489   imm64 >>= 16;
1490   movk(r, imm64 & 0xffff, 32);
1491 }
1492 
1493 // Macro to mov replicated immediate to vector register.
1494 //  Vd will get the following values for different arrangements in T
1495 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1496 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1497 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1498 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1499 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1500 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1501 //   T1D/T2D: invalid
1502 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1503   assert(T != T1D && T != T2D, "invalid arrangement");
1504   if (T == T8B || T == T16B) {
1505     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1506     movi(Vd, T, imm32 & 0xff, 0);
1507     return;
1508   }
1509   u_int32_t nimm32 = ~imm32;
1510   if (T == T4H || T == T8H) {
1511     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1512     imm32 &= 0xffff;
1513     nimm32 &= 0xffff;
1514   }
1515   u_int32_t x = imm32;
1516   int movi_cnt = 0;
1517   int movn_cnt = 0;
1518   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1519   x = nimm32;
1520   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1521   if (movn_cnt < movi_cnt) imm32 = nimm32;
1522   unsigned lsl = 0;
1523   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1524   if (movn_cnt < movi_cnt)
1525     mvni(Vd, T, imm32 & 0xff, lsl);
1526   else
1527     movi(Vd, T, imm32 & 0xff, lsl);
1528   imm32 >>= 8; lsl += 8;
1529   while (imm32) {
1530     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1531     if (movn_cnt < movi_cnt)
1532       bici(Vd, T, imm32 & 0xff, lsl);
1533     else
1534       orri(Vd, T, imm32 & 0xff, lsl);
1535     lsl += 8; imm32 >>= 8;
1536   }
1537 }
1538 
1539 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1540 {
1541 #ifndef PRODUCT
1542   {
1543     char buffer[64];
1544     snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64);
1545     block_comment(buffer);
1546   }
1547 #endif
1548   if (operand_valid_for_logical_immediate(false, imm64)) {
1549     orr(dst, zr, imm64);
1550   } else {
1551     // we can use a combination of MOVZ or MOVN with
1552     // MOVK to build up the constant
1553     u_int64_t imm_h[4];
1554     int zero_count = 0;
1555     int neg_count = 0;
1556     int i;
1557     for (i = 0; i < 4; i++) {
1558       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1559       if (imm_h[i] == 0) {
1560         zero_count++;
1561       } else if (imm_h[i] == 0xffffL) {
1562         neg_count++;
1563       }
1564     }
1565     if (zero_count == 4) {
1566       // one MOVZ will do
1567       movz(dst, 0);
1568     } else if (neg_count == 4) {
1569       // one MOVN will do
1570       movn(dst, 0);
1571     } else if (zero_count == 3) {
1572       for (i = 0; i < 4; i++) {
1573         if (imm_h[i] != 0L) {
1574           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1575           break;
1576         }
1577       }
1578     } else if (neg_count == 3) {
1579       // one MOVN will do
1580       for (int i = 0; i < 4; i++) {
1581         if (imm_h[i] != 0xffffL) {
1582           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1583           break;
1584         }
1585       }
1586     } else if (zero_count == 2) {
1587       // one MOVZ and one MOVK will do
1588       for (i = 0; i < 3; i++) {
1589         if (imm_h[i] != 0L) {
1590           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1591           i++;
1592           break;
1593         }
1594       }
1595       for (;i < 4; i++) {
1596         if (imm_h[i] != 0L) {
1597           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1598         }
1599       }
1600     } else if (neg_count == 2) {
1601       // one MOVN and one MOVK will do
1602       for (i = 0; i < 4; i++) {
1603         if (imm_h[i] != 0xffffL) {
1604           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1605           i++;
1606           break;
1607         }
1608       }
1609       for (;i < 4; i++) {
1610         if (imm_h[i] != 0xffffL) {
1611           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1612         }
1613       }
1614     } else if (zero_count == 1) {
1615       // one MOVZ and two MOVKs will do
1616       for (i = 0; i < 4; i++) {
1617         if (imm_h[i] != 0L) {
1618           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1619           i++;
1620           break;
1621         }
1622       }
1623       for (;i < 4; i++) {
1624         if (imm_h[i] != 0x0L) {
1625           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1626         }
1627       }
1628     } else if (neg_count == 1) {
1629       // one MOVN and two MOVKs will do
1630       for (i = 0; i < 4; i++) {
1631         if (imm_h[i] != 0xffffL) {
1632           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1633           i++;
1634           break;
1635         }
1636       }
1637       for (;i < 4; i++) {
1638         if (imm_h[i] != 0xffffL) {
1639           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1640         }
1641       }
1642     } else {
1643       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1644       movz(dst, (u_int32_t)imm_h[0], 0);
1645       for (i = 1; i < 4; i++) {
1646         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1647       }
1648     }
1649   }
1650 }
1651 
1652 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1653 {
1654 #ifndef PRODUCT
1655     {
1656       char buffer[64];
1657       snprintf(buffer, sizeof(buffer), "0x%"PRIX32, imm32);
1658       block_comment(buffer);
1659     }
1660 #endif
1661   if (operand_valid_for_logical_immediate(true, imm32)) {
1662     orrw(dst, zr, imm32);
1663   } else {
1664     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1665     // constant
1666     u_int32_t imm_h[2];
1667     imm_h[0] = imm32 & 0xffff;
1668     imm_h[1] = ((imm32 >> 16) & 0xffff);
1669     if (imm_h[0] == 0) {
1670       movzw(dst, imm_h[1], 16);
1671     } else if (imm_h[0] == 0xffff) {
1672       movnw(dst, imm_h[1] ^ 0xffff, 16);
1673     } else if (imm_h[1] == 0) {
1674       movzw(dst, imm_h[0], 0);
1675     } else if (imm_h[1] == 0xffff) {
1676       movnw(dst, imm_h[0] ^ 0xffff, 0);
1677     } else {
1678       // use a MOVZ and MOVK (makes it easier to debug)
1679       movzw(dst, imm_h[0], 0);
1680       movkw(dst, imm_h[1], 16);
1681     }
1682   }
1683 }
1684 
1685 // Form an address from base + offset in Rd.  Rd may or may
1686 // not actually be used: you must use the Address that is returned.
1687 // It is up to you to ensure that the shift provided matches the size
1688 // of your data.
1689 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1690   if (Address::offset_ok_for_immed(byte_offset, shift))
1691     // It fits; no need for any heroics
1692     return Address(base, byte_offset);
1693 
1694   // Don't do anything clever with negative or misaligned offsets
1695   unsigned mask = (1 << shift) - 1;
1696   if (byte_offset < 0 || byte_offset & mask) {
1697     mov(Rd, byte_offset);
1698     add(Rd, base, Rd);
1699     return Address(Rd);
1700   }
1701 
1702   // See if we can do this with two 12-bit offsets
1703   {
1704     unsigned long word_offset = byte_offset >> shift;
1705     unsigned long masked_offset = word_offset & 0xfff000;
1706     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1707         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1708       add(Rd, base, masked_offset << shift);
1709       word_offset -= masked_offset;
1710       return Address(Rd, word_offset << shift);
1711     }
1712   }
1713 
1714   // Do it the hard way
1715   mov(Rd, byte_offset);
1716   add(Rd, base, Rd);
1717   return Address(Rd);
1718 }
1719 
1720 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1721   if (UseLSE) {
1722     mov(tmp, 1);
1723     ldadd(Assembler::word, tmp, zr, counter_addr);
1724     return;
1725   }
1726   Label retry_load;
1727   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1728     prfm(Address(counter_addr), PSTL1STRM);
1729   bind(retry_load);
1730   // flush and load exclusive from the memory location
1731   ldxrw(tmp, counter_addr);
1732   addw(tmp, tmp, 1);
1733   // if we store+flush with no intervening write tmp wil be zero
1734   stxrw(tmp2, tmp, counter_addr);
1735   cbnzw(tmp2, retry_load);
1736 }
1737 
1738 
1739 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1740                                     bool want_remainder, Register scratch)
1741 {
1742   // Full implementation of Java idiv and irem.  The function
1743   // returns the (pc) offset of the div instruction - may be needed
1744   // for implicit exceptions.
1745   //
1746   // constraint : ra/rb =/= scratch
1747   //         normal case
1748   //
1749   // input : ra: dividend
1750   //         rb: divisor
1751   //
1752   // result: either
1753   //         quotient  (= ra idiv rb)
1754   //         remainder (= ra irem rb)
1755 
1756   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1757 
1758   int idivl_offset = offset();
1759   if (! want_remainder) {
1760     sdivw(result, ra, rb);
1761   } else {
1762     sdivw(scratch, ra, rb);
1763     Assembler::msubw(result, scratch, rb, ra);
1764   }
1765 
1766   return idivl_offset;
1767 }
1768 
1769 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1770                                     bool want_remainder, Register scratch)
1771 {
1772   // Full implementation of Java ldiv and lrem.  The function
1773   // returns the (pc) offset of the div instruction - may be needed
1774   // for implicit exceptions.
1775   //
1776   // constraint : ra/rb =/= scratch
1777   //         normal case
1778   //
1779   // input : ra: dividend
1780   //         rb: divisor
1781   //
1782   // result: either
1783   //         quotient  (= ra idiv rb)
1784   //         remainder (= ra irem rb)
1785 
1786   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1787 
1788   int idivq_offset = offset();
1789   if (! want_remainder) {
1790     sdiv(result, ra, rb);
1791   } else {
1792     sdiv(scratch, ra, rb);
1793     Assembler::msub(result, scratch, rb, ra);
1794   }
1795 
1796   return idivq_offset;
1797 }
1798 
1799 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1800   address prev = pc() - NativeMembar::instruction_size;
1801   address last = code()->last_insn();
1802   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1803     NativeMembar *bar = NativeMembar_at(prev);
1804     // We are merging two memory barrier instructions.  On AArch64 we
1805     // can do this simply by ORing them together.
1806     bar->set_kind(bar->get_kind() | order_constraint);
1807     BLOCK_COMMENT("merged membar");
1808   } else {
1809     code()->set_last_insn(pc());
1810     dmb(Assembler::barrier(order_constraint));
1811   }
1812 }
1813 
1814 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1815   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1816     merge_ldst(rt, adr, size_in_bytes, is_store);
1817     code()->clear_last_insn();
1818     return true;
1819   } else {
1820     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
1821     const unsigned mask = size_in_bytes - 1;
1822     if (adr.getMode() == Address::base_plus_offset &&
1823         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1824       code()->set_last_insn(pc());
1825     }
1826     return false;
1827   }
1828 }
1829 
1830 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1831   // We always try to merge two adjacent loads into one ldp.
1832   if (!try_merge_ldst(Rx, adr, 8, false)) {
1833     Assembler::ldr(Rx, adr);
1834   }
1835 }
1836 
1837 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1838   // We always try to merge two adjacent loads into one ldp.
1839   if (!try_merge_ldst(Rw, adr, 4, false)) {
1840     Assembler::ldrw(Rw, adr);
1841   }
1842 }
1843 
1844 void MacroAssembler::str(Register Rx, const Address &adr) {
1845   // We always try to merge two adjacent stores into one stp.
1846   if (!try_merge_ldst(Rx, adr, 8, true)) {
1847     Assembler::str(Rx, adr);
1848   }
1849 }
1850 
1851 void MacroAssembler::strw(Register Rw, const Address &adr) {
1852   // We always try to merge two adjacent stores into one stp.
1853   if (!try_merge_ldst(Rw, adr, 4, true)) {
1854     Assembler::strw(Rw, adr);
1855   }
1856 }
1857 
1858 // MacroAssembler routines found actually to be needed
1859 
1860 void MacroAssembler::push(Register src)
1861 {
1862   str(src, Address(pre(esp, -1 * wordSize)));
1863 }
1864 
1865 void MacroAssembler::pop(Register dst)
1866 {
1867   ldr(dst, Address(post(esp, 1 * wordSize)));
1868 }
1869 
1870 // Note: load_unsigned_short used to be called load_unsigned_word.
1871 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1872   int off = offset();
1873   ldrh(dst, src);
1874   return off;
1875 }
1876 
1877 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1878   int off = offset();
1879   ldrb(dst, src);
1880   return off;
1881 }
1882 
1883 int MacroAssembler::load_signed_short(Register dst, Address src) {
1884   int off = offset();
1885   ldrsh(dst, src);
1886   return off;
1887 }
1888 
1889 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1890   int off = offset();
1891   ldrsb(dst, src);
1892   return off;
1893 }
1894 
1895 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1896   int off = offset();
1897   ldrshw(dst, src);
1898   return off;
1899 }
1900 
1901 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1902   int off = offset();
1903   ldrsbw(dst, src);
1904   return off;
1905 }
1906 
1907 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1908   switch (size_in_bytes) {
1909   case  8:  ldr(dst, src); break;
1910   case  4:  ldrw(dst, src); break;
1911   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1912   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1913   default:  ShouldNotReachHere();
1914   }
1915 }
1916 
1917 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1918   switch (size_in_bytes) {
1919   case  8:  str(src, dst); break;
1920   case  4:  strw(src, dst); break;
1921   case  2:  strh(src, dst); break;
1922   case  1:  strb(src, dst); break;
1923   default:  ShouldNotReachHere();
1924   }
1925 }
1926 
1927 void MacroAssembler::decrementw(Register reg, int value)
1928 {
1929   if (value < 0)  { incrementw(reg, -value);      return; }
1930   if (value == 0) {                               return; }
1931   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1932   /* else */ {
1933     guarantee(reg != rscratch2, "invalid dst for register decrement");
1934     movw(rscratch2, (unsigned)value);
1935     subw(reg, reg, rscratch2);
1936   }
1937 }
1938 
1939 void MacroAssembler::decrement(Register reg, int value)
1940 {
1941   if (value < 0)  { increment(reg, -value);      return; }
1942   if (value == 0) {                              return; }
1943   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1944   /* else */ {
1945     assert(reg != rscratch2, "invalid dst for register decrement");
1946     mov(rscratch2, (unsigned long)value);
1947     sub(reg, reg, rscratch2);
1948   }
1949 }
1950 
1951 void MacroAssembler::decrementw(Address dst, int value)
1952 {
1953   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1954   if (dst.getMode() == Address::literal) {
1955     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1956     lea(rscratch2, dst);
1957     dst = Address(rscratch2);
1958   }
1959   ldrw(rscratch1, dst);
1960   decrementw(rscratch1, value);
1961   strw(rscratch1, dst);
1962 }
1963 
1964 void MacroAssembler::decrement(Address dst, int value)
1965 {
1966   assert(!dst.uses(rscratch1), "invalid address for decrement");
1967   if (dst.getMode() == Address::literal) {
1968     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1969     lea(rscratch2, dst);
1970     dst = Address(rscratch2);
1971   }
1972   ldr(rscratch1, dst);
1973   decrement(rscratch1, value);
1974   str(rscratch1, dst);
1975 }
1976 
1977 void MacroAssembler::incrementw(Register reg, int value)
1978 {
1979   if (value < 0)  { decrementw(reg, -value);      return; }
1980   if (value == 0) {                               return; }
1981   if (value < (1 << 12)) { addw(reg, reg, value); return; }
1982   /* else */ {
1983     assert(reg != rscratch2, "invalid dst for register increment");
1984     movw(rscratch2, (unsigned)value);
1985     addw(reg, reg, rscratch2);
1986   }
1987 }
1988 
1989 void MacroAssembler::increment(Register reg, int value)
1990 {
1991   if (value < 0)  { decrement(reg, -value);      return; }
1992   if (value == 0) {                              return; }
1993   if (value < (1 << 12)) { add(reg, reg, value); return; }
1994   /* else */ {
1995     assert(reg != rscratch2, "invalid dst for register increment");
1996     movw(rscratch2, (unsigned)value);
1997     add(reg, reg, rscratch2);
1998   }
1999 }
2000 
2001 void MacroAssembler::incrementw(Address dst, int value)
2002 {
2003   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2004   if (dst.getMode() == Address::literal) {
2005     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2006     lea(rscratch2, dst);
2007     dst = Address(rscratch2);
2008   }
2009   ldrw(rscratch1, dst);
2010   incrementw(rscratch1, value);
2011   strw(rscratch1, dst);
2012 }
2013 
2014 void MacroAssembler::increment(Address dst, int value)
2015 {
2016   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2017   if (dst.getMode() == Address::literal) {
2018     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2019     lea(rscratch2, dst);
2020     dst = Address(rscratch2);
2021   }
2022   ldr(rscratch1, dst);
2023   increment(rscratch1, value);
2024   str(rscratch1, dst);
2025 }
2026 
2027 
2028 void MacroAssembler::pusha() {
2029   push(0x7fffffff, sp);
2030 }
2031 
2032 void MacroAssembler::popa() {
2033   pop(0x7fffffff, sp);
2034 }
2035 
2036 // Push lots of registers in the bit set supplied.  Don't push sp.
2037 // Return the number of words pushed
2038 int MacroAssembler::push(unsigned int bitset, Register stack) {
2039   int words_pushed = 0;
2040 
2041   // Scan bitset to accumulate register pairs
2042   unsigned char regs[32];
2043   int count = 0;
2044   for (int reg = 0; reg <= 30; reg++) {
2045     if (1 & bitset)
2046       regs[count++] = reg;
2047     bitset >>= 1;
2048   }
2049   regs[count++] = zr->encoding_nocheck();
2050   count &= ~1;  // Only push an even nuber of regs
2051 
2052   if (count) {
2053     stp(as_Register(regs[0]), as_Register(regs[1]),
2054        Address(pre(stack, -count * wordSize)));
2055     words_pushed += 2;
2056   }
2057   for (int i = 2; i < count; i += 2) {
2058     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2059        Address(stack, i * wordSize));
2060     words_pushed += 2;
2061   }
2062 
2063   assert(words_pushed == count, "oops, pushed != count");
2064 
2065   return count;
2066 }
2067 
2068 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2069   int words_pushed = 0;
2070 
2071   // Scan bitset to accumulate register pairs
2072   unsigned char regs[32];
2073   int count = 0;
2074   for (int reg = 0; reg <= 30; reg++) {
2075     if (1 & bitset)
2076       regs[count++] = reg;
2077     bitset >>= 1;
2078   }
2079   regs[count++] = zr->encoding_nocheck();
2080   count &= ~1;
2081 
2082   for (int i = 2; i < count; i += 2) {
2083     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2084        Address(stack, i * wordSize));
2085     words_pushed += 2;
2086   }
2087   if (count) {
2088     ldp(as_Register(regs[0]), as_Register(regs[1]),
2089        Address(post(stack, count * wordSize)));
2090     words_pushed += 2;
2091   }
2092 
2093   assert(words_pushed == count, "oops, pushed != count");
2094 
2095   return count;
2096 }
2097 #ifdef ASSERT
2098 void MacroAssembler::verify_heapbase(const char* msg) {
2099 #if 0
2100   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2101   assert (Universe::heap() != NULL, "java heap should be initialized");
2102   if (CheckCompressedOops) {
2103     Label ok;
2104     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2105     cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2106     br(Assembler::EQ, ok);
2107     stop(msg);
2108     bind(ok);
2109     pop(1 << rscratch1->encoding(), sp);
2110   }
2111 #endif
2112 }
2113 #endif
2114 
2115 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2116   Label done, not_weak;
2117   cbz(value, done);           // Use NULL as-is.
2118 
2119   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
2120   tbz(r0, 0, not_weak);    // Test for jweak tag.
2121 
2122   // Resolve jweak.
2123   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2124                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2125   verify_oop(value);
2126   b(done);
2127 
2128   bind(not_weak);
2129   // Resolve (untagged) jobject.
2130   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2131   verify_oop(value);
2132   bind(done);
2133 }
2134 
2135 void MacroAssembler::stop(const char* msg) {
2136   address ip = pc();
2137   pusha();
2138   mov(c_rarg0, (address)msg);
2139   mov(c_rarg1, (address)ip);
2140   mov(c_rarg2, sp);
2141   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2142   // call(c_rarg3);
2143   blrt(c_rarg3, 3, 0, 1);
2144   hlt(0);
2145 }
2146 
2147 void MacroAssembler::unimplemented(const char* what) {
2148   const char* buf = NULL;
2149   {
2150     ResourceMark rm;
2151     stringStream ss;
2152     ss.print("unimplemented: %s", what);
2153     buf = code_string(ss.as_string());
2154   }
2155   stop(buf);
2156 }
2157 
2158 // If a constant does not fit in an immediate field, generate some
2159 // number of MOV instructions and then perform the operation.
2160 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2161                                            add_sub_imm_insn insn1,
2162                                            add_sub_reg_insn insn2) {
2163   assert(Rd != zr, "Rd = zr and not setting flags?");
2164   if (operand_valid_for_add_sub_immediate((int)imm)) {
2165     (this->*insn1)(Rd, Rn, imm);
2166   } else {
2167     if (uabs(imm) < (1 << 24)) {
2168        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2169        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2170     } else {
2171        assert_different_registers(Rd, Rn);
2172        mov(Rd, (uint64_t)imm);
2173        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2174     }
2175   }
2176 }
2177 
2178 // Seperate vsn which sets the flags. Optimisations are more restricted
2179 // because we must set the flags correctly.
2180 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2181                                            add_sub_imm_insn insn1,
2182                                            add_sub_reg_insn insn2) {
2183   if (operand_valid_for_add_sub_immediate((int)imm)) {
2184     (this->*insn1)(Rd, Rn, imm);
2185   } else {
2186     assert_different_registers(Rd, Rn);
2187     assert(Rd != zr, "overflow in immediate operand");
2188     mov(Rd, (uint64_t)imm);
2189     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2190   }
2191 }
2192 
2193 
2194 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2195   if (increment.is_register()) {
2196     add(Rd, Rn, increment.as_register());
2197   } else {
2198     add(Rd, Rn, increment.as_constant());
2199   }
2200 }
2201 
2202 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2203   if (increment.is_register()) {
2204     addw(Rd, Rn, increment.as_register());
2205   } else {
2206     addw(Rd, Rn, increment.as_constant());
2207   }
2208 }
2209 
2210 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2211   if (decrement.is_register()) {
2212     sub(Rd, Rn, decrement.as_register());
2213   } else {
2214     sub(Rd, Rn, decrement.as_constant());
2215   }
2216 }
2217 
2218 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2219   if (decrement.is_register()) {
2220     subw(Rd, Rn, decrement.as_register());
2221   } else {
2222     subw(Rd, Rn, decrement.as_constant());
2223   }
2224 }
2225 
2226 void MacroAssembler::reinit_heapbase()
2227 {
2228   if (UseCompressedOops) {
2229     if (Universe::is_fully_initialized()) {
2230       mov(rheapbase, Universe::narrow_ptrs_base());
2231     } else {
2232       lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2233       ldr(rheapbase, Address(rheapbase));
2234     }
2235   }
2236 }
2237 
2238 // this simulates the behaviour of the x86 cmpxchg instruction using a
2239 // load linked/store conditional pair. we use the acquire/release
2240 // versions of these instructions so that we flush pending writes as
2241 // per Java semantics.
2242 
2243 // n.b the x86 version assumes the old value to be compared against is
2244 // in rax and updates rax with the value located in memory if the
2245 // cmpxchg fails. we supply a register for the old value explicitly
2246 
2247 // the aarch64 load linked/store conditional instructions do not
2248 // accept an offset. so, unlike x86, we must provide a plain register
2249 // to identify the memory word to be compared/exchanged rather than a
2250 // register+offset Address.
2251 
2252 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2253                                 Label &succeed, Label *fail) {
2254   // oldv holds comparison value
2255   // newv holds value to write in exchange
2256   // addr identifies memory word to compare against/update
2257   if (UseLSE) {
2258     mov(tmp, oldv);
2259     casal(Assembler::xword, oldv, newv, addr);
2260     cmp(tmp, oldv);
2261     br(Assembler::EQ, succeed);
2262     membar(AnyAny);
2263   } else {
2264     Label retry_load, nope;
2265     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2266       prfm(Address(addr), PSTL1STRM);
2267     bind(retry_load);
2268     // flush and load exclusive from the memory location
2269     // and fail if it is not what we expect
2270     ldaxr(tmp, addr);
2271     cmp(tmp, oldv);
2272     br(Assembler::NE, nope);
2273     // if we store+flush with no intervening write tmp wil be zero
2274     stlxr(tmp, newv, addr);
2275     cbzw(tmp, succeed);
2276     // retry so we only ever return after a load fails to compare
2277     // ensures we don't return a stale value after a failed write.
2278     b(retry_load);
2279     // if the memory word differs we return it in oldv and signal a fail
2280     bind(nope);
2281     membar(AnyAny);
2282     mov(oldv, tmp);
2283   }
2284   if (fail)
2285     b(*fail);
2286 }
2287 
2288 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2289                                         Label &succeed, Label *fail) {
2290   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2291   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2292 }
2293 
2294 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2295                                 Label &succeed, Label *fail) {
2296   // oldv holds comparison value
2297   // newv holds value to write in exchange
2298   // addr identifies memory word to compare against/update
2299   // tmp returns 0/1 for success/failure
2300   if (UseLSE) {
2301     mov(tmp, oldv);
2302     casal(Assembler::word, oldv, newv, addr);
2303     cmp(tmp, oldv);
2304     br(Assembler::EQ, succeed);
2305     membar(AnyAny);
2306   } else {
2307     Label retry_load, nope;
2308     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2309       prfm(Address(addr), PSTL1STRM);
2310     bind(retry_load);
2311     // flush and load exclusive from the memory location
2312     // and fail if it is not what we expect
2313     ldaxrw(tmp, addr);
2314     cmp(tmp, oldv);
2315     br(Assembler::NE, nope);
2316     // if we store+flush with no intervening write tmp wil be zero
2317     stlxrw(tmp, newv, addr);
2318     cbzw(tmp, succeed);
2319     // retry so we only ever return after a load fails to compare
2320     // ensures we don't return a stale value after a failed write.
2321     b(retry_load);
2322     // if the memory word differs we return it in oldv and signal a fail
2323     bind(nope);
2324     membar(AnyAny);
2325     mov(oldv, tmp);
2326   }
2327   if (fail)
2328     b(*fail);
2329 }
2330 
2331 // A generic CAS; success or failure is in the EQ flag.  A weak CAS
2332 // doesn't retry and may fail spuriously.  If the oldval is wanted,
2333 // Pass a register for the result, otherwise pass noreg.
2334 
2335 // Clobbers rscratch1
2336 void MacroAssembler::cmpxchg(Register addr, Register expected,
2337                              Register new_val,
2338                              enum operand_size size,
2339                              bool acquire, bool release,
2340                              bool weak,
2341                              Register result) {
2342   if (result == noreg)  result = rscratch1;
2343   if (UseLSE) {
2344     mov(result, expected);
2345     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2346     cmp(result, expected);
2347   } else {
2348     BLOCK_COMMENT("cmpxchg {");
2349     Label retry_load, done;
2350     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2351       prfm(Address(addr), PSTL1STRM);
2352     bind(retry_load);
2353     load_exclusive(result, addr, size, acquire);
2354     if (size == xword)
2355       cmp(result, expected);
2356     else
2357       cmpw(result, expected);
2358     br(Assembler::NE, done);
2359     store_exclusive(rscratch1, new_val, addr, size, release);
2360     if (weak) {
2361       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2362     } else {
2363       cbnzw(rscratch1, retry_load);
2364     }
2365     bind(done);
2366     BLOCK_COMMENT("} cmpxchg");
2367   }
2368 }
2369 
2370 static bool different(Register a, RegisterOrConstant b, Register c) {
2371   if (b.is_constant())
2372     return a != c;
2373   else
2374     return a != b.as_register() && a != c && b.as_register() != c;
2375 }
2376 
2377 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2378 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2379   if (UseLSE) {                                                         \
2380     prev = prev->is_valid() ? prev : zr;                                \
2381     if (incr.is_register()) {                                           \
2382       AOP(sz, incr.as_register(), prev, addr);                          \
2383     } else {                                                            \
2384       mov(rscratch2, incr.as_constant());                               \
2385       AOP(sz, rscratch2, prev, addr);                                   \
2386     }                                                                   \
2387     return;                                                             \
2388   }                                                                     \
2389   Register result = rscratch2;                                          \
2390   if (prev->is_valid())                                                 \
2391     result = different(prev, incr, addr) ? prev : rscratch2;            \
2392                                                                         \
2393   Label retry_load;                                                     \
2394   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2395     prfm(Address(addr), PSTL1STRM);                                     \
2396   bind(retry_load);                                                     \
2397   LDXR(result, addr);                                                   \
2398   OP(rscratch1, result, incr);                                          \
2399   STXR(rscratch2, rscratch1, addr);                                     \
2400   cbnzw(rscratch2, retry_load);                                         \
2401   if (prev->is_valid() && prev != result) {                             \
2402     IOP(prev, rscratch1, incr);                                         \
2403   }                                                                     \
2404 }
2405 
2406 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2407 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2408 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2409 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2410 
2411 #undef ATOMIC_OP
2412 
2413 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2414 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2415   if (UseLSE) {                                                         \
2416     prev = prev->is_valid() ? prev : zr;                                \
2417     AOP(sz, newv, prev, addr);                                          \
2418     return;                                                             \
2419   }                                                                     \
2420   Register result = rscratch2;                                          \
2421   if (prev->is_valid())                                                 \
2422     result = different(prev, newv, addr) ? prev : rscratch2;            \
2423                                                                         \
2424   Label retry_load;                                                     \
2425   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2426     prfm(Address(addr), PSTL1STRM);                                     \
2427   bind(retry_load);                                                     \
2428   LDXR(result, addr);                                                   \
2429   STXR(rscratch1, newv, addr);                                          \
2430   cbnzw(rscratch1, retry_load);                                         \
2431   if (prev->is_valid() && prev != result)                               \
2432     mov(prev, result);                                                  \
2433 }
2434 
2435 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2436 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2437 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2438 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2439 
2440 #undef ATOMIC_XCHG
2441 
2442 #ifndef PRODUCT
2443 extern "C" void findpc(intptr_t x);
2444 #endif
2445 
2446 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2447 {
2448   // In order to get locks to work, we need to fake a in_VM state
2449   if (ShowMessageBoxOnError ) {
2450     JavaThread* thread = JavaThread::current();
2451     JavaThreadState saved_state = thread->thread_state();
2452     thread->set_thread_state(_thread_in_vm);
2453 #ifndef PRODUCT
2454     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2455       ttyLocker ttyl;
2456       BytecodeCounter::print();
2457     }
2458 #endif
2459     if (os::message_box(msg, "Execution stopped, print registers?")) {
2460       ttyLocker ttyl;
2461       tty->print_cr(" pc = 0x%016lx", pc);
2462 #ifndef PRODUCT
2463       tty->cr();
2464       findpc(pc);
2465       tty->cr();
2466 #endif
2467       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2468       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2469       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2470       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2471       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2472       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2473       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2474       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2475       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2476       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2477       tty->print_cr("r10 = 0x%016lx", regs[10]);
2478       tty->print_cr("r11 = 0x%016lx", regs[11]);
2479       tty->print_cr("r12 = 0x%016lx", regs[12]);
2480       tty->print_cr("r13 = 0x%016lx", regs[13]);
2481       tty->print_cr("r14 = 0x%016lx", regs[14]);
2482       tty->print_cr("r15 = 0x%016lx", regs[15]);
2483       tty->print_cr("r16 = 0x%016lx", regs[16]);
2484       tty->print_cr("r17 = 0x%016lx", regs[17]);
2485       tty->print_cr("r18 = 0x%016lx", regs[18]);
2486       tty->print_cr("r19 = 0x%016lx", regs[19]);
2487       tty->print_cr("r20 = 0x%016lx", regs[20]);
2488       tty->print_cr("r21 = 0x%016lx", regs[21]);
2489       tty->print_cr("r22 = 0x%016lx", regs[22]);
2490       tty->print_cr("r23 = 0x%016lx", regs[23]);
2491       tty->print_cr("r24 = 0x%016lx", regs[24]);
2492       tty->print_cr("r25 = 0x%016lx", regs[25]);
2493       tty->print_cr("r26 = 0x%016lx", regs[26]);
2494       tty->print_cr("r27 = 0x%016lx", regs[27]);
2495       tty->print_cr("r28 = 0x%016lx", regs[28]);
2496       tty->print_cr("r30 = 0x%016lx", regs[30]);
2497       tty->print_cr("r31 = 0x%016lx", regs[31]);
2498       BREAKPOINT;
2499     }
2500     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2501   } else {
2502     ttyLocker ttyl;
2503     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2504                     msg);
2505     assert(false, "DEBUG MESSAGE: %s", msg);
2506   }
2507 }
2508 
2509 #ifdef BUILTIN_SIM
2510 // routine to generate an x86 prolog for a stub function which
2511 // bootstraps into the generated ARM code which directly follows the
2512 // stub
2513 //
2514 // the argument encodes the number of general and fp registers
2515 // passed by the caller and the callng convention (currently just
2516 // the number of general registers and assumes C argument passing)
2517 
2518 extern "C" {
2519 int aarch64_stub_prolog_size();
2520 void aarch64_stub_prolog();
2521 void aarch64_prolog();
2522 }
2523 
2524 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
2525                                    address *prolog_ptr)
2526 {
2527   int calltype = (((ret_type & 0x3) << 8) |
2528                   ((fp_arg_count & 0xf) << 4) |
2529                   (gp_arg_count & 0xf));
2530 
2531   // the addresses for the x86 to ARM entry code we need to use
2532   address start = pc();
2533   // printf("start = %lx\n", start);
2534   int byteCount =  aarch64_stub_prolog_size();
2535   // printf("byteCount = %x\n", byteCount);
2536   int instructionCount = (byteCount + 3)/ 4;
2537   // printf("instructionCount = %x\n", instructionCount);
2538   for (int i = 0; i < instructionCount; i++) {
2539     nop();
2540   }
2541 
2542   memcpy(start, (void*)aarch64_stub_prolog, byteCount);
2543 
2544   // write the address of the setup routine and the call format at the
2545   // end of into the copied code
2546   u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
2547   if (prolog_ptr)
2548     patch_end[-2] = (u_int64_t)prolog_ptr;
2549   patch_end[-1] = calltype;
2550 }
2551 #endif
2552 
2553 void MacroAssembler::push_call_clobbered_registers() {
2554   int step = 4 * wordSize;
2555   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2556   sub(sp, sp, step);
2557   mov(rscratch1, -step);
2558   // Push v0-v7, v16-v31.
2559   for (int i = 31; i>= 4; i -= 4) {
2560     if (i <= v7->encoding() || i >= v16->encoding())
2561       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2562           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2563   }
2564   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2565       as_FloatRegister(3), T1D, Address(sp));
2566 }
2567 
2568 void MacroAssembler::pop_call_clobbered_registers() {
2569   for (int i = 0; i < 32; i += 4) {
2570     if (i <= v7->encoding() || i >= v16->encoding())
2571       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2572           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2573   }
2574 
2575   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2576 }
2577 
2578 void MacroAssembler::push_CPU_state(bool save_vectors) {
2579   int step = (save_vectors ? 8 : 4) * wordSize;
2580   push(0x3fffffff, sp);         // integer registers except lr & sp
2581   mov(rscratch1, -step);
2582   sub(sp, sp, step);
2583   for (int i = 28; i >= 4; i -= 4) {
2584     st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2585         as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2586   }
2587   st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2588 }
2589 
2590 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2591   int step = (restore_vectors ? 8 : 4) * wordSize;
2592   for (int i = 0; i <= 28; i += 4)
2593     ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2594         as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2595   pop(0x3fffffff, sp);         // integer registers except lr & sp
2596 }
2597 
2598 /**
2599  * Helpers for multiply_to_len().
2600  */
2601 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2602                                      Register src1, Register src2) {
2603   adds(dest_lo, dest_lo, src1);
2604   adc(dest_hi, dest_hi, zr);
2605   adds(dest_lo, dest_lo, src2);
2606   adc(final_dest_hi, dest_hi, zr);
2607 }
2608 
2609 // Generate an address from (r + r1 extend offset).  "size" is the
2610 // size of the operand.  The result may be in rscratch2.
2611 Address MacroAssembler::offsetted_address(Register r, Register r1,
2612                                           Address::extend ext, int offset, int size) {
2613   if (offset || (ext.shift() % size != 0)) {
2614     lea(rscratch2, Address(r, r1, ext));
2615     return Address(rscratch2, offset);
2616   } else {
2617     return Address(r, r1, ext);
2618   }
2619 }
2620 
2621 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2622 {
2623   assert(offset >= 0, "spill to negative address?");
2624   // Offset reachable ?
2625   //   Not aligned - 9 bits signed offset
2626   //   Aligned - 12 bits unsigned offset shifted
2627   Register base = sp;
2628   if ((offset & (size-1)) && offset >= (1<<8)) {
2629     add(tmp, base, offset & ((1<<12)-1));
2630     base = tmp;
2631     offset &= -1<<12;
2632   }
2633 
2634   if (offset >= (1<<12) * size) {
2635     add(tmp, base, offset & (((1<<12)-1)<<12));
2636     base = tmp;
2637     offset &= ~(((1<<12)-1)<<12);
2638   }
2639 
2640   return Address(base, offset);
2641 }
2642 
2643 // Checks whether offset is aligned.
2644 // Returns true if it is, else false.
2645 bool MacroAssembler::merge_alignment_check(Register base,
2646                                            size_t size,
2647                                            long cur_offset,
2648                                            long prev_offset) const {
2649   if (AvoidUnalignedAccesses) {
2650     if (base == sp) {
2651       // Checks whether low offset if aligned to pair of registers.
2652       long pair_mask = size * 2 - 1;
2653       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2654       return (offset & pair_mask) == 0;
2655     } else { // If base is not sp, we can't guarantee the access is aligned.
2656       return false;
2657     }
2658   } else {
2659     long mask = size - 1;
2660     // Load/store pair instruction only supports element size aligned offset.
2661     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2662   }
2663 }
2664 
2665 // Checks whether current and previous loads/stores can be merged.
2666 // Returns true if it can be merged, else false.
2667 bool MacroAssembler::ldst_can_merge(Register rt,
2668                                     const Address &adr,
2669                                     size_t cur_size_in_bytes,
2670                                     bool is_store) const {
2671   address prev = pc() - NativeInstruction::instruction_size;
2672   address last = code()->last_insn();
2673 
2674   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2675     return false;
2676   }
2677 
2678   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2679     return false;
2680   }
2681 
2682   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2683   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2684 
2685   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2686   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2687 
2688   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2689     return false;
2690   }
2691 
2692   long max_offset = 63 * prev_size_in_bytes;
2693   long min_offset = -64 * prev_size_in_bytes;
2694 
2695   assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged.");
2696 
2697   // Only same base can be merged.
2698   if (adr.base() != prev_ldst->base()) {
2699     return false;
2700   }
2701 
2702   long cur_offset = adr.offset();
2703   long prev_offset = prev_ldst->offset();
2704   size_t diff = abs(cur_offset - prev_offset);
2705   if (diff != prev_size_in_bytes) {
2706     return false;
2707   }
2708 
2709   // Following cases can not be merged:
2710   // ldr x2, [x2, #8]
2711   // ldr x3, [x2, #16]
2712   // or:
2713   // ldr x2, [x3, #8]
2714   // ldr x2, [x3, #16]
2715   // If t1 and t2 is the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2716   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2717     return false;
2718   }
2719 
2720   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2721   // Offset range must be in ldp/stp instruction's range.
2722   if (low_offset > max_offset || low_offset < min_offset) {
2723     return false;
2724   }
2725 
2726   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2727     return true;
2728   }
2729 
2730   return false;
2731 }
2732 
2733 // Merge current load/store with previous load/store into ldp/stp.
2734 void MacroAssembler::merge_ldst(Register rt,
2735                                 const Address &adr,
2736                                 size_t cur_size_in_bytes,
2737                                 bool is_store) {
2738 
2739   assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged.");
2740 
2741   Register rt_low, rt_high;
2742   address prev = pc() - NativeInstruction::instruction_size;
2743   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2744 
2745   long offset;
2746 
2747   if (adr.offset() < prev_ldst->offset()) {
2748     offset = adr.offset();
2749     rt_low = rt;
2750     rt_high = prev_ldst->target();
2751   } else {
2752     offset = prev_ldst->offset();
2753     rt_low = prev_ldst->target();
2754     rt_high = rt;
2755   }
2756 
2757   Address adr_p = Address(prev_ldst->base(), offset);
2758   // Overwrite previous generated binary.
2759   code_section()->set_end(prev);
2760 
2761   const int sz = prev_ldst->size_in_bytes();
2762   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
2763   if (!is_store) {
2764     BLOCK_COMMENT("merged ldr pair");
2765     if (sz == 8) {
2766       ldp(rt_low, rt_high, adr_p);
2767     } else {
2768       ldpw(rt_low, rt_high, adr_p);
2769     }
2770   } else {
2771     BLOCK_COMMENT("merged str pair");
2772     if (sz == 8) {
2773       stp(rt_low, rt_high, adr_p);
2774     } else {
2775       stpw(rt_low, rt_high, adr_p);
2776     }
2777   }
2778 }
2779 
2780 /**
2781  * Multiply 64 bit by 64 bit first loop.
2782  */
2783 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2784                                            Register y, Register y_idx, Register z,
2785                                            Register carry, Register product,
2786                                            Register idx, Register kdx) {
2787   //
2788   //  jlong carry, x[], y[], z[];
2789   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) {
2790   //    huge_128 product = y[idx] * x[xstart] + carry;
2791   //    z[kdx] = (jlong)product;
2792   //    carry  = (jlong)(product >>> 64);
2793   //  }
2794   //  z[xstart] = carry;
2795   //
2796 
2797   Label L_first_loop, L_first_loop_exit;
2798   Label L_one_x, L_one_y, L_multiply;
2799 
2800   subsw(xstart, xstart, 1);
2801   br(Assembler::MI, L_one_x);
2802 
2803   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2804   ldr(x_xstart, Address(rscratch1));
2805   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2806 
2807   bind(L_first_loop);
2808   subsw(idx, idx, 1);
2809   br(Assembler::MI, L_first_loop_exit);
2810   subsw(idx, idx, 1);
2811   br(Assembler::MI, L_one_y);
2812   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2813   ldr(y_idx, Address(rscratch1));
2814   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2815   bind(L_multiply);
2816 
2817   // AArch64 has a multiply-accumulate instruction that we can't use
2818   // here because it has no way to process carries, so we have to use
2819   // separate add and adc instructions.  Bah.
2820   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2821   mul(product, x_xstart, y_idx);
2822   adds(product, product, carry);
2823   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2824 
2825   subw(kdx, kdx, 2);
2826   ror(product, product, 32); // back to big-endian
2827   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2828 
2829   b(L_first_loop);
2830 
2831   bind(L_one_y);
2832   ldrw(y_idx, Address(y,  0));
2833   b(L_multiply);
2834 
2835   bind(L_one_x);
2836   ldrw(x_xstart, Address(x,  0));
2837   b(L_first_loop);
2838 
2839   bind(L_first_loop_exit);
2840 }
2841 
2842 /**
2843  * Multiply 128 bit by 128. Unrolled inner loop.
2844  *
2845  */
2846 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2847                                              Register carry, Register carry2,
2848                                              Register idx, Register jdx,
2849                                              Register yz_idx1, Register yz_idx2,
2850                                              Register tmp, Register tmp3, Register tmp4,
2851                                              Register tmp6, Register product_hi) {
2852 
2853   //   jlong carry, x[], y[], z[];
2854   //   int kdx = ystart+1;
2855   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2856   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2857   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2858   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2859   //     carry  = (jlong)(tmp4 >>> 64);
2860   //     z[kdx+idx+1] = (jlong)tmp3;
2861   //     z[kdx+idx] = (jlong)tmp4;
2862   //   }
2863   //   idx += 2;
2864   //   if (idx > 0) {
2865   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2866   //     z[kdx+idx] = (jlong)yz_idx1;
2867   //     carry  = (jlong)(yz_idx1 >>> 64);
2868   //   }
2869   //
2870 
2871   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2872 
2873   lsrw(jdx, idx, 2);
2874 
2875   bind(L_third_loop);
2876 
2877   subsw(jdx, jdx, 1);
2878   br(Assembler::MI, L_third_loop_exit);
2879   subw(idx, idx, 4);
2880 
2881   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2882 
2883   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2884 
2885   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2886 
2887   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2888   ror(yz_idx2, yz_idx2, 32);
2889 
2890   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2891 
2892   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2893   umulh(tmp4, product_hi, yz_idx1);
2894 
2895   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2896   ror(rscratch2, rscratch2, 32);
2897 
2898   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2899   umulh(carry2, product_hi, yz_idx2);
2900 
2901   // propagate sum of both multiplications into carry:tmp4:tmp3
2902   adds(tmp3, tmp3, carry);
2903   adc(tmp4, tmp4, zr);
2904   adds(tmp3, tmp3, rscratch1);
2905   adcs(tmp4, tmp4, tmp);
2906   adc(carry, carry2, zr);
2907   adds(tmp4, tmp4, rscratch2);
2908   adc(carry, carry, zr);
2909 
2910   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2911   ror(tmp4, tmp4, 32);
2912   stp(tmp4, tmp3, Address(tmp6, 0));
2913 
2914   b(L_third_loop);
2915   bind (L_third_loop_exit);
2916 
2917   andw (idx, idx, 0x3);
2918   cbz(idx, L_post_third_loop_done);
2919 
2920   Label L_check_1;
2921   subsw(idx, idx, 2);
2922   br(Assembler::MI, L_check_1);
2923 
2924   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2925   ldr(yz_idx1, Address(rscratch1, 0));
2926   ror(yz_idx1, yz_idx1, 32);
2927   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2928   umulh(tmp4, product_hi, yz_idx1);
2929   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2930   ldr(yz_idx2, Address(rscratch1, 0));
2931   ror(yz_idx2, yz_idx2, 32);
2932 
2933   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2934 
2935   ror(tmp3, tmp3, 32);
2936   str(tmp3, Address(rscratch1, 0));
2937 
2938   bind (L_check_1);
2939 
2940   andw (idx, idx, 0x1);
2941   subsw(idx, idx, 1);
2942   br(Assembler::MI, L_post_third_loop_done);
2943   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2944   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
2945   umulh(carry2, tmp4, product_hi);
2946   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2947 
2948   add2_with_carry(carry2, tmp3, tmp4, carry);
2949 
2950   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2951   extr(carry, carry2, tmp3, 32);
2952 
2953   bind(L_post_third_loop_done);
2954 }
2955 
2956 /**
2957  * Code for BigInteger::multiplyToLen() instrinsic.
2958  *
2959  * r0: x
2960  * r1: xlen
2961  * r2: y
2962  * r3: ylen
2963  * r4:  z
2964  * r5: zlen
2965  * r10: tmp1
2966  * r11: tmp2
2967  * r12: tmp3
2968  * r13: tmp4
2969  * r14: tmp5
2970  * r15: tmp6
2971  * r16: tmp7
2972  *
2973  */
2974 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
2975                                      Register z, Register zlen,
2976                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
2977                                      Register tmp5, Register tmp6, Register product_hi) {
2978 
2979   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2980 
2981   const Register idx = tmp1;
2982   const Register kdx = tmp2;
2983   const Register xstart = tmp3;
2984 
2985   const Register y_idx = tmp4;
2986   const Register carry = tmp5;
2987   const Register product  = xlen;
2988   const Register x_xstart = zlen;  // reuse register
2989 
2990   // First Loop.
2991   //
2992   //  final static long LONG_MASK = 0xffffffffL;
2993   //  int xstart = xlen - 1;
2994   //  int ystart = ylen - 1;
2995   //  long carry = 0;
2996   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) {
2997   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
2998   //    z[kdx] = (int)product;
2999   //    carry = product >>> 32;
3000   //  }
3001   //  z[xstart] = (int)carry;
3002   //
3003 
3004   movw(idx, ylen);      // idx = ylen;
3005   movw(kdx, zlen);      // kdx = xlen+ylen;
3006   mov(carry, zr);       // carry = 0;
3007 
3008   Label L_done;
3009 
3010   movw(xstart, xlen);
3011   subsw(xstart, xstart, 1);
3012   br(Assembler::MI, L_done);
3013 
3014   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3015 
3016   Label L_second_loop;
3017   cbzw(kdx, L_second_loop);
3018 
3019   Label L_carry;
3020   subw(kdx, kdx, 1);
3021   cbzw(kdx, L_carry);
3022 
3023   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3024   lsr(carry, carry, 32);
3025   subw(kdx, kdx, 1);
3026 
3027   bind(L_carry);
3028   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3029 
3030   // Second and third (nested) loops.
3031   //
3032   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3033   //   carry = 0;
3034   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3035   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3036   //                    (z[k] & LONG_MASK) + carry;
3037   //     z[k] = (int)product;
3038   //     carry = product >>> 32;
3039   //   }
3040   //   z[i] = (int)carry;
3041   // }
3042   //
3043   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3044 
3045   const Register jdx = tmp1;
3046 
3047   bind(L_second_loop);
3048   mov(carry, zr);                // carry = 0;
3049   movw(jdx, ylen);               // j = ystart+1
3050 
3051   subsw(xstart, xstart, 1);      // i = xstart-1;
3052   br(Assembler::MI, L_done);
3053 
3054   str(z, Address(pre(sp, -4 * wordSize)));
3055 
3056   Label L_last_x;
3057   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3058   subsw(xstart, xstart, 1);       // i = xstart-1;
3059   br(Assembler::MI, L_last_x);
3060 
3061   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3062   ldr(product_hi, Address(rscratch1));
3063   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3064 
3065   Label L_third_loop_prologue;
3066   bind(L_third_loop_prologue);
3067 
3068   str(ylen, Address(sp, wordSize));
3069   stp(x, xstart, Address(sp, 2 * wordSize));
3070   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3071                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3072   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3073   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3074 
3075   addw(tmp3, xlen, 1);
3076   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3077   subsw(tmp3, tmp3, 1);
3078   br(Assembler::MI, L_done);
3079 
3080   lsr(carry, carry, 32);
3081   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3082   b(L_second_loop);
3083 
3084   // Next infrequent code is moved outside loops.
3085   bind(L_last_x);
3086   ldrw(product_hi, Address(x,  0));
3087   b(L_third_loop_prologue);
3088 
3089   bind(L_done);
3090 }
3091 
3092 // Code for BigInteger::mulAdd instrinsic
3093 // out     = r0
3094 // in      = r1
3095 // offset  = r2  (already out.length-offset)
3096 // len     = r3
3097 // k       = r4
3098 //
3099 // pseudo code from java implementation:
3100 // carry = 0;
3101 // offset = out.length-offset - 1;
3102 // for (int j=len-1; j >= 0; j--) {
3103 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3104 //     out[offset--] = (int)product;
3105 //     carry = product >>> 32;
3106 // }
3107 // return (int)carry;
3108 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3109       Register len, Register k) {
3110     Label LOOP, END;
3111     // pre-loop
3112     cmp(len, zr); // cmp, not cbz/cbnz: to use condition twice => less branches
3113     csel(out, zr, out, Assembler::EQ);
3114     br(Assembler::EQ, END);
3115     add(in, in, len, LSL, 2); // in[j+1] address
3116     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3117     mov(out, zr); // used to keep carry now
3118     BIND(LOOP);
3119     ldrw(rscratch1, Address(pre(in, -4)));
3120     madd(rscratch1, rscratch1, k, out);
3121     ldrw(rscratch2, Address(pre(offset, -4)));
3122     add(rscratch1, rscratch1, rscratch2);
3123     strw(rscratch1, Address(offset));
3124     lsr(out, rscratch1, 32);
3125     subs(len, len, 1);
3126     br(Assembler::NE, LOOP);
3127     BIND(END);
3128 }
3129 
3130 /**
3131  * Emits code to update CRC-32 with a byte value according to constants in table
3132  *
3133  * @param [in,out]crc   Register containing the crc.
3134  * @param [in]val       Register containing the byte to fold into the CRC.
3135  * @param [in]table     Register containing the table of crc constants.
3136  *
3137  * uint32_t crc;
3138  * val = crc_table[(val ^ crc) & 0xFF];
3139  * crc = val ^ (crc >> 8);
3140  *
3141  */
3142 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3143   eor(val, val, crc);
3144   andr(val, val, 0xff);
3145   ldrw(val, Address(table, val, Address::lsl(2)));
3146   eor(crc, val, crc, Assembler::LSR, 8);
3147 }
3148 
3149 /**
3150  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3151  *
3152  * @param [in,out]crc   Register containing the crc.
3153  * @param [in]v         Register containing the 32-bit to fold into the CRC.
3154  * @param [in]table0    Register containing table 0 of crc constants.
3155  * @param [in]table1    Register containing table 1 of crc constants.
3156  * @param [in]table2    Register containing table 2 of crc constants.
3157  * @param [in]table3    Register containing table 3 of crc constants.
3158  *
3159  * uint32_t crc;
3160  *   v = crc ^ v
3161  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3162  *
3163  */
3164 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3165         Register table0, Register table1, Register table2, Register table3,
3166         bool upper) {
3167   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3168   uxtb(tmp, v);
3169   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3170   ubfx(tmp, v, 8, 8);
3171   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3172   eor(crc, crc, tmp);
3173   ubfx(tmp, v, 16, 8);
3174   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3175   eor(crc, crc, tmp);
3176   ubfx(tmp, v, 24, 8);
3177   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3178   eor(crc, crc, tmp);
3179 }
3180 
3181 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3182         Register len, Register tmp0, Register tmp1, Register tmp2,
3183         Register tmp3) {
3184     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3185     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3186 
3187     mvnw(crc, crc);
3188 
3189     subs(len, len, 128);
3190     br(Assembler::GE, CRC_by64_pre);
3191   BIND(CRC_less64);
3192     adds(len, len, 128-32);
3193     br(Assembler::GE, CRC_by32_loop);
3194   BIND(CRC_less32);
3195     adds(len, len, 32-4);
3196     br(Assembler::GE, CRC_by4_loop);
3197     adds(len, len, 4);
3198     br(Assembler::GT, CRC_by1_loop);
3199     b(L_exit);
3200 
3201   BIND(CRC_by32_loop);
3202     ldp(tmp0, tmp1, Address(post(buf, 16)));
3203     subs(len, len, 32);
3204     crc32x(crc, crc, tmp0);
3205     ldr(tmp2, Address(post(buf, 8)));
3206     crc32x(crc, crc, tmp1);
3207     ldr(tmp3, Address(post(buf, 8)));
3208     crc32x(crc, crc, tmp2);
3209     crc32x(crc, crc, tmp3);
3210     br(Assembler::GE, CRC_by32_loop);
3211     cmn(len, 32);
3212     br(Assembler::NE, CRC_less32);
3213     b(L_exit);
3214 
3215   BIND(CRC_by4_loop);
3216     ldrw(tmp0, Address(post(buf, 4)));
3217     subs(len, len, 4);
3218     crc32w(crc, crc, tmp0);
3219     br(Assembler::GE, CRC_by4_loop);
3220     adds(len, len, 4);
3221     br(Assembler::LE, L_exit);
3222   BIND(CRC_by1_loop);
3223     ldrb(tmp0, Address(post(buf, 1)));
3224     subs(len, len, 1);
3225     crc32b(crc, crc, tmp0);
3226     br(Assembler::GT, CRC_by1_loop);
3227     b(L_exit);
3228 
3229   BIND(CRC_by64_pre);
3230     sub(buf, buf, 8);
3231     ldp(tmp0, tmp1, Address(buf, 8));
3232     crc32x(crc, crc, tmp0);
3233     ldr(tmp2, Address(buf, 24));
3234     crc32x(crc, crc, tmp1);
3235     ldr(tmp3, Address(buf, 32));
3236     crc32x(crc, crc, tmp2);
3237     ldr(tmp0, Address(buf, 40));
3238     crc32x(crc, crc, tmp3);
3239     ldr(tmp1, Address(buf, 48));
3240     crc32x(crc, crc, tmp0);
3241     ldr(tmp2, Address(buf, 56));
3242     crc32x(crc, crc, tmp1);
3243     ldr(tmp3, Address(pre(buf, 64)));
3244 
3245     b(CRC_by64_loop);
3246 
3247     align(CodeEntryAlignment);
3248   BIND(CRC_by64_loop);
3249     subs(len, len, 64);
3250     crc32x(crc, crc, tmp2);
3251     ldr(tmp0, Address(buf, 8));
3252     crc32x(crc, crc, tmp3);
3253     ldr(tmp1, Address(buf, 16));
3254     crc32x(crc, crc, tmp0);
3255     ldr(tmp2, Address(buf, 24));
3256     crc32x(crc, crc, tmp1);
3257     ldr(tmp3, Address(buf, 32));
3258     crc32x(crc, crc, tmp2);
3259     ldr(tmp0, Address(buf, 40));
3260     crc32x(crc, crc, tmp3);
3261     ldr(tmp1, Address(buf, 48));
3262     crc32x(crc, crc, tmp0);
3263     ldr(tmp2, Address(buf, 56));
3264     crc32x(crc, crc, tmp1);
3265     ldr(tmp3, Address(pre(buf, 64)));
3266     br(Assembler::GE, CRC_by64_loop);
3267 
3268     // post-loop
3269     crc32x(crc, crc, tmp2);
3270     crc32x(crc, crc, tmp3);
3271 
3272     sub(len, len, 64);
3273     add(buf, buf, 8);
3274     cmn(len, 128);
3275     br(Assembler::NE, CRC_less64);
3276   BIND(L_exit);
3277     mvnw(crc, crc);
3278 }
3279 
3280 /**
3281  * @param crc   register containing existing CRC (32-bit)
3282  * @param buf   register pointing to input byte buffer (byte*)
3283  * @param len   register containing number of bytes
3284  * @param table register that will contain address of CRC table
3285  * @param tmp   scratch register
3286  */
3287 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3288         Register table0, Register table1, Register table2, Register table3,
3289         Register tmp, Register tmp2, Register tmp3) {
3290   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3291   unsigned long offset;
3292 
3293   if (UseCRC32) {
3294       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3295       return;
3296   }
3297 
3298     mvnw(crc, crc);
3299 
3300     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3301     if (offset) add(table0, table0, offset);
3302     add(table1, table0, 1*256*sizeof(juint));
3303     add(table2, table0, 2*256*sizeof(juint));
3304     add(table3, table0, 3*256*sizeof(juint));
3305 
3306   if (UseNeon) {
3307       cmp(len, 64);
3308       br(Assembler::LT, L_by16);
3309       eor(v16, T16B, v16, v16);
3310 
3311     Label L_fold;
3312 
3313       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3314 
3315       ld1(v0, v1, T2D, post(buf, 32));
3316       ld1r(v4, T2D, post(tmp, 8));
3317       ld1r(v5, T2D, post(tmp, 8));
3318       ld1r(v6, T2D, post(tmp, 8));
3319       ld1r(v7, T2D, post(tmp, 8));
3320       mov(v16, T4S, 0, crc);
3321 
3322       eor(v0, T16B, v0, v16);
3323       sub(len, len, 64);
3324 
3325     BIND(L_fold);
3326       pmull(v22, T8H, v0, v5, T8B);
3327       pmull(v20, T8H, v0, v7, T8B);
3328       pmull(v23, T8H, v0, v4, T8B);
3329       pmull(v21, T8H, v0, v6, T8B);
3330 
3331       pmull2(v18, T8H, v0, v5, T16B);
3332       pmull2(v16, T8H, v0, v7, T16B);
3333       pmull2(v19, T8H, v0, v4, T16B);
3334       pmull2(v17, T8H, v0, v6, T16B);
3335 
3336       uzp1(v24, T8H, v20, v22);
3337       uzp2(v25, T8H, v20, v22);
3338       eor(v20, T16B, v24, v25);
3339 
3340       uzp1(v26, T8H, v16, v18);
3341       uzp2(v27, T8H, v16, v18);
3342       eor(v16, T16B, v26, v27);
3343 
3344       ushll2(v22, T4S, v20, T8H, 8);
3345       ushll(v20, T4S, v20, T4H, 8);
3346 
3347       ushll2(v18, T4S, v16, T8H, 8);
3348       ushll(v16, T4S, v16, T4H, 8);
3349 
3350       eor(v22, T16B, v23, v22);
3351       eor(v18, T16B, v19, v18);
3352       eor(v20, T16B, v21, v20);
3353       eor(v16, T16B, v17, v16);
3354 
3355       uzp1(v17, T2D, v16, v20);
3356       uzp2(v21, T2D, v16, v20);
3357       eor(v17, T16B, v17, v21);
3358 
3359       ushll2(v20, T2D, v17, T4S, 16);
3360       ushll(v16, T2D, v17, T2S, 16);
3361 
3362       eor(v20, T16B, v20, v22);
3363       eor(v16, T16B, v16, v18);
3364 
3365       uzp1(v17, T2D, v20, v16);
3366       uzp2(v21, T2D, v20, v16);
3367       eor(v28, T16B, v17, v21);
3368 
3369       pmull(v22, T8H, v1, v5, T8B);
3370       pmull(v20, T8H, v1, v7, T8B);
3371       pmull(v23, T8H, v1, v4, T8B);
3372       pmull(v21, T8H, v1, v6, T8B);
3373 
3374       pmull2(v18, T8H, v1, v5, T16B);
3375       pmull2(v16, T8H, v1, v7, T16B);
3376       pmull2(v19, T8H, v1, v4, T16B);
3377       pmull2(v17, T8H, v1, v6, T16B);
3378 
3379       ld1(v0, v1, T2D, post(buf, 32));
3380 
3381       uzp1(v24, T8H, v20, v22);
3382       uzp2(v25, T8H, v20, v22);
3383       eor(v20, T16B, v24, v25);
3384 
3385       uzp1(v26, T8H, v16, v18);
3386       uzp2(v27, T8H, v16, v18);
3387       eor(v16, T16B, v26, v27);
3388 
3389       ushll2(v22, T4S, v20, T8H, 8);
3390       ushll(v20, T4S, v20, T4H, 8);
3391 
3392       ushll2(v18, T4S, v16, T8H, 8);
3393       ushll(v16, T4S, v16, T4H, 8);
3394 
3395       eor(v22, T16B, v23, v22);
3396       eor(v18, T16B, v19, v18);
3397       eor(v20, T16B, v21, v20);
3398       eor(v16, T16B, v17, v16);
3399 
3400       uzp1(v17, T2D, v16, v20);
3401       uzp2(v21, T2D, v16, v20);
3402       eor(v16, T16B, v17, v21);
3403 
3404       ushll2(v20, T2D, v16, T4S, 16);
3405       ushll(v16, T2D, v16, T2S, 16);
3406 
3407       eor(v20, T16B, v22, v20);
3408       eor(v16, T16B, v16, v18);
3409 
3410       uzp1(v17, T2D, v20, v16);
3411       uzp2(v21, T2D, v20, v16);
3412       eor(v20, T16B, v17, v21);
3413 
3414       shl(v16, T2D, v28, 1);
3415       shl(v17, T2D, v20, 1);
3416 
3417       eor(v0, T16B, v0, v16);
3418       eor(v1, T16B, v1, v17);
3419 
3420       subs(len, len, 32);
3421       br(Assembler::GE, L_fold);
3422 
3423       mov(crc, 0);
3424       mov(tmp, v0, T1D, 0);
3425       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3426       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3427       mov(tmp, v0, T1D, 1);
3428       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3429       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3430       mov(tmp, v1, T1D, 0);
3431       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3432       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3433       mov(tmp, v1, T1D, 1);
3434       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3435       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3436 
3437       add(len, len, 32);
3438   }
3439 
3440   BIND(L_by16);
3441     subs(len, len, 16);
3442     br(Assembler::GE, L_by16_loop);
3443     adds(len, len, 16-4);
3444     br(Assembler::GE, L_by4_loop);
3445     adds(len, len, 4);
3446     br(Assembler::GT, L_by1_loop);
3447     b(L_exit);
3448 
3449   BIND(L_by4_loop);
3450     ldrw(tmp, Address(post(buf, 4)));
3451     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3452     subs(len, len, 4);
3453     br(Assembler::GE, L_by4_loop);
3454     adds(len, len, 4);
3455     br(Assembler::LE, L_exit);
3456   BIND(L_by1_loop);
3457     subs(len, len, 1);
3458     ldrb(tmp, Address(post(buf, 1)));
3459     update_byte_crc32(crc, tmp, table0);
3460     br(Assembler::GT, L_by1_loop);
3461     b(L_exit);
3462 
3463     align(CodeEntryAlignment);
3464   BIND(L_by16_loop);
3465     subs(len, len, 16);
3466     ldp(tmp, tmp3, Address(post(buf, 16)));
3467     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3468     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3469     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3470     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3471     br(Assembler::GE, L_by16_loop);
3472     adds(len, len, 16-4);
3473     br(Assembler::GE, L_by4_loop);
3474     adds(len, len, 4);
3475     br(Assembler::GT, L_by1_loop);
3476   BIND(L_exit);
3477     mvnw(crc, crc);
3478 }
3479 
3480 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3481         Register len, Register tmp0, Register tmp1, Register tmp2,
3482         Register tmp3) {
3483     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3484     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3485 
3486     subs(len, len, 128);
3487     br(Assembler::GE, CRC_by64_pre);
3488   BIND(CRC_less64);
3489     adds(len, len, 128-32);
3490     br(Assembler::GE, CRC_by32_loop);
3491   BIND(CRC_less32);
3492     adds(len, len, 32-4);
3493     br(Assembler::GE, CRC_by4_loop);
3494     adds(len, len, 4);
3495     br(Assembler::GT, CRC_by1_loop);
3496     b(L_exit);
3497 
3498   BIND(CRC_by32_loop);
3499     ldp(tmp0, tmp1, Address(post(buf, 16)));
3500     subs(len, len, 32);
3501     crc32cx(crc, crc, tmp0);
3502     ldr(tmp2, Address(post(buf, 8)));
3503     crc32cx(crc, crc, tmp1);
3504     ldr(tmp3, Address(post(buf, 8)));
3505     crc32cx(crc, crc, tmp2);
3506     crc32cx(crc, crc, tmp3);
3507     br(Assembler::GE, CRC_by32_loop);
3508     cmn(len, 32);
3509     br(Assembler::NE, CRC_less32);
3510     b(L_exit);
3511 
3512   BIND(CRC_by4_loop);
3513     ldrw(tmp0, Address(post(buf, 4)));
3514     subs(len, len, 4);
3515     crc32cw(crc, crc, tmp0);
3516     br(Assembler::GE, CRC_by4_loop);
3517     adds(len, len, 4);
3518     br(Assembler::LE, L_exit);
3519   BIND(CRC_by1_loop);
3520     ldrb(tmp0, Address(post(buf, 1)));
3521     subs(len, len, 1);
3522     crc32cb(crc, crc, tmp0);
3523     br(Assembler::GT, CRC_by1_loop);
3524     b(L_exit);
3525 
3526   BIND(CRC_by64_pre);
3527     sub(buf, buf, 8);
3528     ldp(tmp0, tmp1, Address(buf, 8));
3529     crc32cx(crc, crc, tmp0);
3530     ldr(tmp2, Address(buf, 24));
3531     crc32cx(crc, crc, tmp1);
3532     ldr(tmp3, Address(buf, 32));
3533     crc32cx(crc, crc, tmp2);
3534     ldr(tmp0, Address(buf, 40));
3535     crc32cx(crc, crc, tmp3);
3536     ldr(tmp1, Address(buf, 48));
3537     crc32cx(crc, crc, tmp0);
3538     ldr(tmp2, Address(buf, 56));
3539     crc32cx(crc, crc, tmp1);
3540     ldr(tmp3, Address(pre(buf, 64)));
3541 
3542     b(CRC_by64_loop);
3543 
3544     align(CodeEntryAlignment);
3545   BIND(CRC_by64_loop);
3546     subs(len, len, 64);
3547     crc32cx(crc, crc, tmp2);
3548     ldr(tmp0, Address(buf, 8));
3549     crc32cx(crc, crc, tmp3);
3550     ldr(tmp1, Address(buf, 16));
3551     crc32cx(crc, crc, tmp0);
3552     ldr(tmp2, Address(buf, 24));
3553     crc32cx(crc, crc, tmp1);
3554     ldr(tmp3, Address(buf, 32));
3555     crc32cx(crc, crc, tmp2);
3556     ldr(tmp0, Address(buf, 40));
3557     crc32cx(crc, crc, tmp3);
3558     ldr(tmp1, Address(buf, 48));
3559     crc32cx(crc, crc, tmp0);
3560     ldr(tmp2, Address(buf, 56));
3561     crc32cx(crc, crc, tmp1);
3562     ldr(tmp3, Address(pre(buf, 64)));
3563     br(Assembler::GE, CRC_by64_loop);
3564 
3565     // post-loop
3566     crc32cx(crc, crc, tmp2);
3567     crc32cx(crc, crc, tmp3);
3568 
3569     sub(len, len, 64);
3570     add(buf, buf, 8);
3571     cmn(len, 128);
3572     br(Assembler::NE, CRC_less64);
3573   BIND(L_exit);
3574 }
3575 
3576 /**
3577  * @param crc   register containing existing CRC (32-bit)
3578  * @param buf   register pointing to input byte buffer (byte*)
3579  * @param len   register containing number of bytes
3580  * @param table register that will contain address of CRC table
3581  * @param tmp   scratch register
3582  */
3583 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3584         Register table0, Register table1, Register table2, Register table3,
3585         Register tmp, Register tmp2, Register tmp3) {
3586   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3587 }
3588 
3589 
3590 SkipIfEqual::SkipIfEqual(
3591     MacroAssembler* masm, const bool* flag_addr, bool value) {
3592   _masm = masm;
3593   unsigned long offset;
3594   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3595   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3596   _masm->cbzw(rscratch1, _label);
3597 }
3598 
3599 SkipIfEqual::~SkipIfEqual() {
3600   _masm->bind(_label);
3601 }
3602 
3603 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3604   Address adr;
3605   switch(dst.getMode()) {
3606   case Address::base_plus_offset:
3607     // This is the expected mode, although we allow all the other
3608     // forms below.
3609     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3610     break;
3611   default:
3612     lea(rscratch2, dst);
3613     adr = Address(rscratch2);
3614     break;
3615   }
3616   ldr(rscratch1, adr);
3617   add(rscratch1, rscratch1, src);
3618   str(rscratch1, adr);
3619 }
3620 
3621 void MacroAssembler::cmpptr(Register src1, Address src2) {
3622   unsigned long offset;
3623   adrp(rscratch1, src2, offset);
3624   ldr(rscratch1, Address(rscratch1, offset));
3625   cmp(src1, rscratch1);
3626 }
3627 
3628 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3629   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3630   bs->obj_equals(this, obj1, obj2);
3631 }
3632 
3633 void MacroAssembler::load_klass(Register dst, Register src) {
3634   if (UseCompressedClassPointers) {
3635     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3636     decode_klass_not_null(dst);
3637   } else {
3638     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3639   }
3640 }
3641 
3642 // ((OopHandle)result).resolve();
3643 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3644   // OopHandle::resolve is an indirection.
3645   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3646 }
3647 
3648 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3649   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3650   ldr(dst, Address(rmethod, Method::const_offset()));
3651   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3652   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3653   ldr(dst, Address(dst, mirror_offset));
3654   resolve_oop_handle(dst, tmp);
3655 }
3656 
3657 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3658   if (UseCompressedClassPointers) {
3659     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3660     if (Universe::narrow_klass_base() == NULL) {
3661       cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
3662       return;
3663     } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3664                && Universe::narrow_klass_shift() == 0) {
3665       // Only the bottom 32 bits matter
3666       cmpw(trial_klass, tmp);
3667       return;
3668     }
3669     decode_klass_not_null(tmp);
3670   } else {
3671     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3672   }
3673   cmp(trial_klass, tmp);
3674 }
3675 
3676 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3677   load_klass(dst, src);
3678   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3679 }
3680 
3681 void MacroAssembler::store_klass(Register dst, Register src) {
3682   // FIXME: Should this be a store release?  concurrent gcs assumes
3683   // klass length is valid if klass field is not null.
3684   if (UseCompressedClassPointers) {
3685     encode_klass_not_null(src);
3686     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3687   } else {
3688     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3689   }
3690 }
3691 
3692 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3693   if (UseCompressedClassPointers) {
3694     // Store to klass gap in destination
3695     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3696   }
3697 }
3698 
3699 // Algorithm must match CompressedOops::encode.
3700 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3701 #ifdef ASSERT
3702   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3703 #endif
3704   verify_oop(s, "broken oop in encode_heap_oop");
3705   if (Universe::narrow_oop_base() == NULL) {
3706     if (Universe::narrow_oop_shift() != 0) {
3707       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3708       lsr(d, s, LogMinObjAlignmentInBytes);
3709     } else {
3710       mov(d, s);
3711     }
3712   } else {
3713     subs(d, s, rheapbase);
3714     csel(d, d, zr, Assembler::HS);
3715     lsr(d, d, LogMinObjAlignmentInBytes);
3716 
3717     /*  Old algorithm: is this any worse?
3718     Label nonnull;
3719     cbnz(r, nonnull);
3720     sub(r, r, rheapbase);
3721     bind(nonnull);
3722     lsr(r, r, LogMinObjAlignmentInBytes);
3723     */
3724   }
3725 }
3726 
3727 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3728 #ifdef ASSERT
3729   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3730   if (CheckCompressedOops) {
3731     Label ok;
3732     cbnz(r, ok);
3733     stop("null oop passed to encode_heap_oop_not_null");
3734     bind(ok);
3735   }
3736 #endif
3737   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3738   if (Universe::narrow_oop_base() != NULL) {
3739     sub(r, r, rheapbase);
3740   }
3741   if (Universe::narrow_oop_shift() != 0) {
3742     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3743     lsr(r, r, LogMinObjAlignmentInBytes);
3744   }
3745 }
3746 
3747 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3748 #ifdef ASSERT
3749   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3750   if (CheckCompressedOops) {
3751     Label ok;
3752     cbnz(src, ok);
3753     stop("null oop passed to encode_heap_oop_not_null2");
3754     bind(ok);
3755   }
3756 #endif
3757   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3758 
3759   Register data = src;
3760   if (Universe::narrow_oop_base() != NULL) {
3761     sub(dst, src, rheapbase);
3762     data = dst;
3763   }
3764   if (Universe::narrow_oop_shift() != 0) {
3765     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3766     lsr(dst, data, LogMinObjAlignmentInBytes);
3767     data = dst;
3768   }
3769   if (data == src)
3770     mov(dst, src);
3771 }
3772 
3773 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3774 #ifdef ASSERT
3775   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3776 #endif
3777   if (Universe::narrow_oop_base() == NULL) {
3778     if (Universe::narrow_oop_shift() != 0 || d != s) {
3779       lsl(d, s, Universe::narrow_oop_shift());
3780     }
3781   } else {
3782     Label done;
3783     if (d != s)
3784       mov(d, s);
3785     cbz(s, done);
3786     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3787     bind(done);
3788   }
3789   verify_oop(d, "broken oop in decode_heap_oop");
3790 }
3791 
3792 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3793   assert (UseCompressedOops, "should only be used for compressed headers");
3794   assert (Universe::heap() != NULL, "java heap should be initialized");
3795   // Cannot assert, unverified entry point counts instructions (see .ad file)
3796   // vtableStubs also counts instructions in pd_code_size_limit.
3797   // Also do not verify_oop as this is called by verify_oop.
3798   if (Universe::narrow_oop_shift() != 0) {
3799     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3800     if (Universe::narrow_oop_base() != NULL) {
3801       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3802     } else {
3803       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3804     }
3805   } else {
3806     assert (Universe::narrow_oop_base() == NULL, "sanity");
3807   }
3808 }
3809 
3810 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3811   assert (UseCompressedOops, "should only be used for compressed headers");
3812   assert (Universe::heap() != NULL, "java heap should be initialized");
3813   // Cannot assert, unverified entry point counts instructions (see .ad file)
3814   // vtableStubs also counts instructions in pd_code_size_limit.
3815   // Also do not verify_oop as this is called by verify_oop.
3816   if (Universe::narrow_oop_shift() != 0) {
3817     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3818     if (Universe::narrow_oop_base() != NULL) {
3819       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3820     } else {
3821       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3822     }
3823   } else {
3824     assert (Universe::narrow_oop_base() == NULL, "sanity");
3825     if (dst != src) {
3826       mov(dst, src);
3827     }
3828   }
3829 }
3830 
3831 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3832   if (Universe::narrow_klass_base() == NULL) {
3833     if (Universe::narrow_klass_shift() != 0) {
3834       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3835       lsr(dst, src, LogKlassAlignmentInBytes);
3836     } else {
3837       if (dst != src) mov(dst, src);
3838     }
3839     return;
3840   }
3841 
3842   if (use_XOR_for_compressed_class_base) {
3843     if (Universe::narrow_klass_shift() != 0) {
3844       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3845       lsr(dst, dst, LogKlassAlignmentInBytes);
3846     } else {
3847       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3848     }
3849     return;
3850   }
3851 
3852   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3853       && Universe::narrow_klass_shift() == 0) {
3854     movw(dst, src);
3855     return;
3856   }
3857 
3858 #ifdef ASSERT
3859   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3860 #endif
3861 
3862   Register rbase = dst;
3863   if (dst == src) rbase = rheapbase;
3864   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3865   sub(dst, src, rbase);
3866   if (Universe::narrow_klass_shift() != 0) {
3867     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3868     lsr(dst, dst, LogKlassAlignmentInBytes);
3869   }
3870   if (dst == src) reinit_heapbase();
3871 }
3872 
3873 void MacroAssembler::encode_klass_not_null(Register r) {
3874   encode_klass_not_null(r, r);
3875 }
3876 
3877 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3878   Register rbase = dst;
3879   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3880 
3881   if (Universe::narrow_klass_base() == NULL) {
3882     if (Universe::narrow_klass_shift() != 0) {
3883       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3884       lsl(dst, src, LogKlassAlignmentInBytes);
3885     } else {
3886       if (dst != src) mov(dst, src);
3887     }
3888     return;
3889   }
3890 
3891   if (use_XOR_for_compressed_class_base) {
3892     if (Universe::narrow_klass_shift() != 0) {
3893       lsl(dst, src, LogKlassAlignmentInBytes);
3894       eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
3895     } else {
3896       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3897     }
3898     return;
3899   }
3900 
3901   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3902       && Universe::narrow_klass_shift() == 0) {
3903     if (dst != src)
3904       movw(dst, src);
3905     movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
3906     return;
3907   }
3908 
3909   // Cannot assert, unverified entry point counts instructions (see .ad file)
3910   // vtableStubs also counts instructions in pd_code_size_limit.
3911   // Also do not verify_oop as this is called by verify_oop.
3912   if (dst == src) rbase = rheapbase;
3913   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3914   if (Universe::narrow_klass_shift() != 0) {
3915     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3916     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
3917   } else {
3918     add(dst, rbase, src);
3919   }
3920   if (dst == src) reinit_heapbase();
3921 }
3922 
3923 void  MacroAssembler::decode_klass_not_null(Register r) {
3924   decode_klass_not_null(r, r);
3925 }
3926 
3927 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3928 #ifdef ASSERT
3929   {
3930     ThreadInVMfromUnknown tiv;
3931     assert (UseCompressedOops, "should only be used for compressed oops");
3932     assert (Universe::heap() != NULL, "java heap should be initialized");
3933     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3934     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3935   }
3936 #endif
3937   int oop_index = oop_recorder()->find_index(obj);
3938   InstructionMark im(this);
3939   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3940   code_section()->relocate(inst_mark(), rspec);
3941   movz(dst, 0xDEAD, 16);
3942   movk(dst, 0xBEEF);
3943 }
3944 
3945 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3946   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3947   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3948   int index = oop_recorder()->find_index(k);
3949   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
3950 
3951   InstructionMark im(this);
3952   RelocationHolder rspec = metadata_Relocation::spec(index);
3953   code_section()->relocate(inst_mark(), rspec);
3954   narrowKlass nk = Klass::encode_klass(k);
3955   movz(dst, (nk >> 16), 16);
3956   movk(dst, nk & 0xffff);
3957 }
3958 
3959 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
3960                                     Register dst, Address src,
3961                                     Register tmp1, Register thread_tmp) {
3962   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3963   decorators = AccessInternal::decorator_fixup(decorators);
3964   bool as_raw = (decorators & AS_RAW) != 0;
3965   if (as_raw) {
3966     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3967   } else {
3968     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3969   }
3970 }
3971 
3972 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
3973                                      Address dst, Register src,
3974                                      Register tmp1, Register thread_tmp) {
3975   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3976   decorators = AccessInternal::decorator_fixup(decorators);
3977   bool as_raw = (decorators & AS_RAW) != 0;
3978   if (as_raw) {
3979     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3980   } else {
3981     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3982   }
3983 }
3984 
3985 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
3986                                    Register thread_tmp, DecoratorSet decorators) {
3987   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
3988 }
3989 
3990 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
3991                                             Register thread_tmp, DecoratorSet decorators) {
3992   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
3993 }
3994 
3995 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
3996                                     Register thread_tmp, DecoratorSet decorators) {
3997   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
3998 }
3999 
4000 // Used for storing NULLs.
4001 void MacroAssembler::store_heap_oop_null(Address dst) {
4002   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4003 }
4004 
4005 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4006   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4007   int index = oop_recorder()->allocate_metadata_index(obj);
4008   RelocationHolder rspec = metadata_Relocation::spec(index);
4009   return Address((address)obj, rspec);
4010 }
4011 
4012 // Move an oop into a register.  immediate is true if we want
4013 // immediate instrcutions, i.e. we are not going to patch this
4014 // instruction while the code is being executed by another thread.  In
4015 // that case we can use move immediates rather than the constant pool.
4016 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4017   int oop_index;
4018   if (obj == NULL) {
4019     oop_index = oop_recorder()->allocate_oop_index(obj);
4020   } else {
4021 #ifdef ASSERT
4022     {
4023       ThreadInVMfromUnknown tiv;
4024       assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
4025     }
4026 #endif
4027     oop_index = oop_recorder()->find_index(obj);
4028   }
4029   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4030   if (! immediate) {
4031     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4032     ldr_constant(dst, Address(dummy, rspec));
4033   } else
4034     mov(dst, Address((address)obj, rspec));
4035 }
4036 
4037 // Move a metadata address into a register.
4038 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4039   int oop_index;
4040   if (obj == NULL) {
4041     oop_index = oop_recorder()->allocate_metadata_index(obj);
4042   } else {
4043     oop_index = oop_recorder()->find_index(obj);
4044   }
4045   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4046   mov(dst, Address((address)obj, rspec));
4047 }
4048 
4049 Address MacroAssembler::constant_oop_address(jobject obj) {
4050 #ifdef ASSERT
4051   {
4052     ThreadInVMfromUnknown tiv;
4053     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4054     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
4055   }
4056 #endif
4057   int oop_index = oop_recorder()->find_index(obj);
4058   return Address((address)obj, oop_Relocation::spec(oop_index));
4059 }
4060 
4061 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4062 void MacroAssembler::tlab_allocate(Register obj,
4063                                    Register var_size_in_bytes,
4064                                    int con_size_in_bytes,
4065                                    Register t1,
4066                                    Register t2,
4067                                    Label& slow_case) {
4068   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4069   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4070 }
4071 
4072 // Defines obj, preserves var_size_in_bytes
4073 void MacroAssembler::eden_allocate(Register obj,
4074                                    Register var_size_in_bytes,
4075                                    int con_size_in_bytes,
4076                                    Register t1,
4077                                    Label& slow_case) {
4078   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4079   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4080 }
4081 
4082 // Zero words; len is in bytes
4083 // Destroys all registers except addr
4084 // len must be a nonzero multiple of wordSize
4085 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4086   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4087 
4088 #ifdef ASSERT
4089   { Label L;
4090     tst(len, BytesPerWord - 1);
4091     br(Assembler::EQ, L);
4092     stop("len is not a multiple of BytesPerWord");
4093     bind(L);
4094   }
4095 #endif
4096 
4097 #ifndef PRODUCT
4098   block_comment("zero memory");
4099 #endif
4100 
4101   Label loop;
4102   Label entry;
4103 
4104 //  Algorithm:
4105 //
4106 //    scratch1 = cnt & 7;
4107 //    cnt -= scratch1;
4108 //    p += scratch1;
4109 //    switch (scratch1) {
4110 //      do {
4111 //        cnt -= 8;
4112 //          p[-8] = 0;
4113 //        case 7:
4114 //          p[-7] = 0;
4115 //        case 6:
4116 //          p[-6] = 0;
4117 //          // ...
4118 //        case 1:
4119 //          p[-1] = 0;
4120 //        case 0:
4121 //          p += 8;
4122 //      } while (cnt);
4123 //    }
4124 
4125   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4126 
4127   lsr(len, len, LogBytesPerWord);
4128   andr(rscratch1, len, unroll - 1);  // tmp1 = cnt % unroll
4129   sub(len, len, rscratch1);      // cnt -= unroll
4130   // t1 always points to the end of the region we're about to zero
4131   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
4132   adr(rscratch2, entry);
4133   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4134   br(rscratch2);
4135   bind(loop);
4136   sub(len, len, unroll);
4137   for (int i = -unroll; i < 0; i++)
4138     Assembler::str(zr, Address(t1, i * wordSize));
4139   bind(entry);
4140   add(t1, t1, unroll * wordSize);
4141   cbnz(len, loop);
4142 }
4143 
4144 void MacroAssembler::verify_tlab() {
4145 #ifdef ASSERT
4146   if (UseTLAB && VerifyOops) {
4147     Label next, ok;
4148 
4149     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4150 
4151     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4152     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4153     cmp(rscratch2, rscratch1);
4154     br(Assembler::HS, next);
4155     STOP("assert(top >= start)");
4156     should_not_reach_here();
4157 
4158     bind(next);
4159     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4160     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4161     cmp(rscratch2, rscratch1);
4162     br(Assembler::HS, ok);
4163     STOP("assert(top <= end)");
4164     should_not_reach_here();
4165 
4166     bind(ok);
4167     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4168   }
4169 #endif
4170 }
4171 
4172 // Writes to stack successive pages until offset reached to check for
4173 // stack overflow + shadow pages.  This clobbers tmp.
4174 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4175   assert_different_registers(tmp, size, rscratch1);
4176   mov(tmp, sp);
4177   // Bang stack for total size given plus shadow page size.
4178   // Bang one page at a time because large size can bang beyond yellow and
4179   // red zones.
4180   Label loop;
4181   mov(rscratch1, os::vm_page_size());
4182   bind(loop);
4183   lea(tmp, Address(tmp, -os::vm_page_size()));
4184   subsw(size, size, rscratch1);
4185   str(size, Address(tmp));
4186   br(Assembler::GT, loop);
4187 
4188   // Bang down shadow pages too.
4189   // At this point, (tmp-0) is the last address touched, so don't
4190   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4191   // was post-decremented.)  Skip this address by starting at i=1, and
4192   // touch a few more pages below.  N.B.  It is important to touch all
4193   // the way down to and including i=StackShadowPages.
4194   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
4195     // this could be any sized move but this is can be a debugging crumb
4196     // so the bigger the better.
4197     lea(tmp, Address(tmp, -os::vm_page_size()));
4198     str(size, Address(tmp));
4199   }
4200 }
4201 
4202 
4203 // Move the address of the polling page into dest.
4204 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4205   if (SafepointMechanism::uses_thread_local_poll()) {
4206     ldr(dest, Address(rthread, Thread::polling_page_offset()));
4207   } else {
4208     unsigned long off;
4209     adrp(dest, Address(page, rtype), off);
4210     assert(off == 0, "polling page must be page aligned");
4211   }
4212 }
4213 
4214 // Move the address of the polling page into r, then read the polling
4215 // page.
4216 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4217   get_polling_page(r, page, rtype);
4218   return read_polling_page(r, rtype);
4219 }
4220 
4221 // Read the polling page.  The address of the polling page must
4222 // already be in r.
4223 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4224   InstructionMark im(this);
4225   code_section()->relocate(inst_mark(), rtype);
4226   ldrw(zr, Address(r, 0));
4227   return inst_mark();
4228 }
4229 
4230 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4231   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4232   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4233   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4234   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4235   long offset_low = dest_page - low_page;
4236   long offset_high = dest_page - high_page;
4237 
4238   assert(is_valid_AArch64_address(dest.target()), "bad address");
4239   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4240 
4241   InstructionMark im(this);
4242   code_section()->relocate(inst_mark(), dest.rspec());
4243   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4244   // the code cache so that if it is relocated we know it will still reach
4245   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4246     _adrp(reg1, dest.target());
4247   } else {
4248     unsigned long target = (unsigned long)dest.target();
4249     unsigned long adrp_target
4250       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4251 
4252     _adrp(reg1, (address)adrp_target);
4253     movk(reg1, target >> 32, 32);
4254   }
4255   byte_offset = (unsigned long)dest.target() & 0xfff;
4256 }
4257 
4258 void MacroAssembler::load_byte_map_base(Register reg) {
4259   jbyte *byte_map_base =
4260     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4261 
4262   if (is_valid_AArch64_address((address)byte_map_base)) {
4263     // Strictly speaking the byte_map_base isn't an address at all,
4264     // and it might even be negative.
4265     unsigned long offset;
4266     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4267     // We expect offset to be zero with most collectors.
4268     if (offset != 0) {
4269       add(reg, reg, offset);
4270     }
4271   } else {
4272     mov(reg, (uint64_t)byte_map_base);
4273   }
4274 }
4275 
4276 void MacroAssembler::build_frame(int framesize) {
4277   assert(framesize > 0, "framesize must be > 0");
4278   if (framesize < ((1 << 9) + 2 * wordSize)) {
4279     sub(sp, sp, framesize);
4280     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4281     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4282   } else {
4283     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4284     if (PreserveFramePointer) mov(rfp, sp);
4285     if (framesize < ((1 << 12) + 2 * wordSize))
4286       sub(sp, sp, framesize - 2 * wordSize);
4287     else {
4288       mov(rscratch1, framesize - 2 * wordSize);
4289       sub(sp, sp, rscratch1);
4290     }
4291   }
4292 }
4293 
4294 void MacroAssembler::remove_frame(int framesize) {
4295   assert(framesize > 0, "framesize must be > 0");
4296   if (framesize < ((1 << 9) + 2 * wordSize)) {
4297     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4298     add(sp, sp, framesize);
4299   } else {
4300     if (framesize < ((1 << 12) + 2 * wordSize))
4301       add(sp, sp, framesize - 2 * wordSize);
4302     else {
4303       mov(rscratch1, framesize - 2 * wordSize);
4304       add(sp, sp, rscratch1);
4305     }
4306     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4307   }
4308 }
4309 
4310 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4311 
4312 // Search for str1 in str2 and return index or -1
4313 void MacroAssembler::string_indexof(Register str2, Register str1,
4314                                     Register cnt2, Register cnt1,
4315                                     Register tmp1, Register tmp2,
4316                                     Register tmp3, Register tmp4,
4317                                     Register tmp5, Register tmp6,
4318                                     int icnt1, Register result, int ae) {
4319   // NOTE: tmp5, tmp6 can be zr depending on specific method version
4320   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4321 
4322   Register ch1 = rscratch1;
4323   Register ch2 = rscratch2;
4324   Register cnt1tmp = tmp1;
4325   Register cnt2tmp = tmp2;
4326   Register cnt1_neg = cnt1;
4327   Register cnt2_neg = cnt2;
4328   Register result_tmp = tmp4;
4329 
4330   bool isL = ae == StrIntrinsicNode::LL;
4331 
4332   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4333   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4334   int str1_chr_shift = str1_isL ? 0:1;
4335   int str2_chr_shift = str2_isL ? 0:1;
4336   int str1_chr_size = str1_isL ? 1:2;
4337   int str2_chr_size = str2_isL ? 1:2;
4338   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4339                                       (chr_insn)&MacroAssembler::ldrh;
4340   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4341                                       (chr_insn)&MacroAssembler::ldrh;
4342   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4343   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4344 
4345   // Note, inline_string_indexOf() generates checks:
4346   // if (substr.count > string.count) return -1;
4347   // if (substr.count == 0) return 0;
4348 
4349   // We have two strings, a source string in str2, cnt2 and a pattern string
4350   // in str1, cnt1. Find the 1st occurence of pattern in source or return -1.
4351 
4352   // For larger pattern and source we use a simplified Boyer Moore algorithm.
4353   // With a small pattern and source we use linear scan.
4354 
4355   if (icnt1 == -1) {
4356     sub(result_tmp, cnt2, cnt1);
4357     cmp(cnt1, 8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4358     br(LT, LINEARSEARCH);
4359     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
4360     cmp(cnt1, 256);
4361     lsr(tmp1, cnt2, 2);
4362     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
4363     br(GE, LINEARSTUB);
4364   }
4365 
4366 // The Boyer Moore alogorithm is based on the description here:-
4367 //
4368 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4369 //
4370 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule
4371 // and the 'Good Suffix' rule.
4372 //
4373 // These rules are essentially heuristics for how far we can shift the
4374 // pattern along the search string.
4375 //
4376 // The implementation here uses the 'Bad Character' rule only because of the
4377 // complexity of initialisation for the 'Good Suffix' rule.
4378 //
4379 // This is also known as the Boyer-Moore-Horspool algorithm:-
4380 //
4381 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4382 //
4383 // This particular implementation has few java-specific optimizations.
4384 //
4385 // #define ASIZE 256
4386 //
4387 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4388 //       int i, j;
4389 //       unsigned c;
4390 //       unsigned char bc[ASIZE];
4391 //
4392 //       /* Preprocessing */
4393 //       for (i = 0; i < ASIZE; ++i)
4394 //          bc[i] = m;
4395 //       for (i = 0; i < m - 1; ) {
4396 //          c = x[i];
4397 //          ++i;
4398 //          // c < 256 for Latin1 string, so, no need for branch
4399 //          #ifdef PATTERN_STRING_IS_LATIN1
4400 //          bc[c] = m - i;
4401 //          #else
4402 //          if (c < ASIZE) bc[c] = m - i;
4403 //          #endif
4404 //       }
4405 //
4406 //       /* Searching */
4407 //       j = 0;
4408 //       while (j <= n - m) {
4409 //          c = y[i+j];
4410 //          if (x[m-1] == c)
4411 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
4412 //          if (i < 0) return j;
4413 //          // c < 256 for Latin1 string, so, no need for branch
4414 //          #ifdef SOURCE_STRING_IS_LATIN1
4415 //          // LL case: (c< 256) always true. Remove branch
4416 //          j += bc[y[j+m-1]];
4417 //          #endif
4418 //          #ifndef PATTERN_STRING_IS_UTF
4419 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
4420 //          if (c < ASIZE)
4421 //            j += bc[y[j+m-1]];
4422 //          else
4423 //            j += 1
4424 //          #endif
4425 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
4426 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
4427 //          if (c < ASIZE)
4428 //            j += bc[y[j+m-1]];
4429 //          else
4430 //            j += m
4431 //          #endif
4432 //       }
4433 //    }
4434 
4435   if (icnt1 == -1) {
4436     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4437         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4438     Register cnt1end = tmp2;
4439     Register str2end = cnt2;
4440     Register skipch = tmp2;
4441 
4442     // str1 length is >=8, so, we can read at least 1 register for cases when
4443     // UTF->Latin1 conversion is not needed(8 LL or 4UU) and half register for
4444     // UL case. We'll re-read last character in inner pre-loop code to have
4445     // single outer pre-loop load
4446     const int firstStep = isL ? 7 : 3;
4447 
4448     const int ASIZE = 256;
4449     const int STORED_BYTES = 32; // amount of bytes stored per instruction
4450     sub(sp, sp, ASIZE);
4451     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4452     mov(ch1, sp);
4453     BIND(BM_INIT_LOOP);
4454       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4455       subs(tmp5, tmp5, 1);
4456       br(GT, BM_INIT_LOOP);
4457 
4458       sub(cnt1tmp, cnt1, 1);
4459       mov(tmp5, str2);
4460       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4461       sub(ch2, cnt1, 1);
4462       mov(tmp3, str1);
4463     BIND(BCLOOP);
4464       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4465       if (!str1_isL) {
4466         cmp(ch1, ASIZE);
4467         br(HS, BCSKIP);
4468       }
4469       strb(ch2, Address(sp, ch1));
4470     BIND(BCSKIP);
4471       subs(ch2, ch2, 1);
4472       br(GT, BCLOOP);
4473 
4474       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4475       if (str1_isL == str2_isL) {
4476         // load last 8 bytes (8LL/4UU symbols)
4477         ldr(tmp6, Address(tmp6, -wordSize));
4478       } else {
4479         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
4480         // convert Latin1 to UTF. We'll have to wait until load completed, but
4481         // it's still faster than per-character loads+checks
4482         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4483         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4484         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4485         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4486         orr(ch2, ch1, ch2, LSL, 16);
4487         orr(tmp6, tmp6, tmp3, LSL, 48);
4488         orr(tmp6, tmp6, ch2, LSL, 16);
4489       }
4490     BIND(BMLOOPSTR2);
4491       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4492       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4493       if (str1_isL == str2_isL) {
4494         // re-init tmp3. It's for free because it's executed in parallel with
4495         // load above. Alternative is to initialize it before loop, but it'll
4496         // affect performance on in-order systems with 2 or more ld/st pipelines
4497         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4498       }
4499       if (!isL) { // UU/UL case
4500         lsl(ch2, cnt1tmp, 1); // offset in bytes
4501       }
4502       cmp(tmp3, skipch);
4503       br(NE, BMSKIP);
4504       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4505       mov(ch1, tmp6);
4506       if (isL) {
4507         b(BMLOOPSTR1_AFTER_LOAD);
4508       } else {
4509         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4510         b(BMLOOPSTR1_CMP);
4511       }
4512     BIND(BMLOOPSTR1);
4513       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4514       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4515     BIND(BMLOOPSTR1_AFTER_LOAD);
4516       subs(cnt1tmp, cnt1tmp, 1);
4517       br(LT, BMLOOPSTR1_LASTCMP);
4518     BIND(BMLOOPSTR1_CMP);
4519       cmp(ch1, ch2);
4520       br(EQ, BMLOOPSTR1);
4521     BIND(BMSKIP);
4522       if (!isL) {
4523         // if we've met UTF symbol while searching Latin1 pattern, then we can
4524         // skip cnt1 symbols
4525         if (str1_isL != str2_isL) {
4526           mov(result_tmp, cnt1);
4527         } else {
4528           mov(result_tmp, 1);
4529         }
4530         cmp(skipch, ASIZE);
4531         br(HS, BMADV);
4532       }
4533       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4534     BIND(BMADV);
4535       sub(cnt1tmp, cnt1, 1);
4536       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4537       cmp(str2, str2end);
4538       br(LE, BMLOOPSTR2);
4539       add(sp, sp, ASIZE);
4540       b(NOMATCH);
4541     BIND(BMLOOPSTR1_LASTCMP);
4542       cmp(ch1, ch2);
4543       br(NE, BMSKIP);
4544     BIND(BMMATCH);
4545       sub(result, str2, tmp5);
4546       if (!str2_isL) lsr(result, result, 1);
4547       add(sp, sp, ASIZE);
4548       b(DONE);
4549 
4550     BIND(LINEARSTUB);
4551     cmp(cnt1, 16); // small patterns still should be handled by simple algorithm
4552     br(LT, LINEAR_MEDIUM);
4553     mov(result, zr);
4554     RuntimeAddress stub = NULL;
4555     if (isL) {
4556       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4557       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4558     } else if (str1_isL) {
4559       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
4560        assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4561     } else {
4562       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4563       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4564     }
4565     trampoline_call(stub);
4566     b(DONE);
4567   }
4568 
4569   BIND(LINEARSEARCH);
4570   {
4571     Label DO1, DO2, DO3;
4572 
4573     Register str2tmp = tmp2;
4574     Register first = tmp3;
4575 
4576     if (icnt1 == -1)
4577     {
4578         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4579 
4580         cmp(cnt1, str1_isL == str2_isL ? 4 : 2);
4581         br(LT, DOSHORT);
4582       BIND(LINEAR_MEDIUM);
4583         (this->*str1_load_1chr)(first, Address(str1));
4584         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4585         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4586         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4587         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4588 
4589       BIND(FIRST_LOOP);
4590         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4591         cmp(first, ch2);
4592         br(EQ, STR1_LOOP);
4593       BIND(STR2_NEXT);
4594         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4595         br(LE, FIRST_LOOP);
4596         b(NOMATCH);
4597 
4598       BIND(STR1_LOOP);
4599         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4600         add(cnt2tmp, cnt2_neg, str2_chr_size);
4601         br(GE, MATCH);
4602 
4603       BIND(STR1_NEXT);
4604         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4605         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4606         cmp(ch1, ch2);
4607         br(NE, STR2_NEXT);
4608         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4609         add(cnt2tmp, cnt2tmp, str2_chr_size);
4610         br(LT, STR1_NEXT);
4611         b(MATCH);
4612 
4613       BIND(DOSHORT);
4614       if (str1_isL == str2_isL) {
4615         cmp(cnt1, 2);
4616         br(LT, DO1);
4617         br(GT, DO3);
4618       }
4619     }
4620 
4621     if (icnt1 == 4) {
4622       Label CH1_LOOP;
4623 
4624         (this->*load_4chr)(ch1, str1);
4625         sub(result_tmp, cnt2, 4);
4626         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4627         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4628 
4629       BIND(CH1_LOOP);
4630         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4631         cmp(ch1, ch2);
4632         br(EQ, MATCH);
4633         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4634         br(LE, CH1_LOOP);
4635         b(NOMATCH);
4636       }
4637 
4638     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4639       Label CH1_LOOP;
4640 
4641       BIND(DO2);
4642         (this->*load_2chr)(ch1, str1);
4643         if (icnt1 == 2) {
4644           sub(result_tmp, cnt2, 2);
4645         }
4646         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4647         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4648       BIND(CH1_LOOP);
4649         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4650         cmp(ch1, ch2);
4651         br(EQ, MATCH);
4652         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4653         br(LE, CH1_LOOP);
4654         b(NOMATCH);
4655     }
4656 
4657     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4658       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4659 
4660       BIND(DO3);
4661         (this->*load_2chr)(first, str1);
4662         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4663         if (icnt1 == 3) {
4664           sub(result_tmp, cnt2, 3);
4665         }
4666         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4667         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4668       BIND(FIRST_LOOP);
4669         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4670         cmpw(first, ch2);
4671         br(EQ, STR1_LOOP);
4672       BIND(STR2_NEXT);
4673         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4674         br(LE, FIRST_LOOP);
4675         b(NOMATCH);
4676 
4677       BIND(STR1_LOOP);
4678         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4679         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4680         cmp(ch1, ch2);
4681         br(NE, STR2_NEXT);
4682         b(MATCH);
4683     }
4684 
4685     if (icnt1 == -1 || icnt1 == 1) {
4686       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4687 
4688       BIND(DO1);
4689         (this->*str1_load_1chr)(ch1, str1);
4690         cmp(cnt2, 8);
4691         br(LT, DO1_SHORT);
4692 
4693         sub(result_tmp, cnt2, 8/str2_chr_size);
4694         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4695         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4696         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4697 
4698         if (str2_isL) {
4699           orr(ch1, ch1, ch1, LSL, 8);
4700         }
4701         orr(ch1, ch1, ch1, LSL, 16);
4702         orr(ch1, ch1, ch1, LSL, 32);
4703       BIND(CH1_LOOP);
4704         ldr(ch2, Address(str2, cnt2_neg));
4705         eor(ch2, ch1, ch2);
4706         sub(tmp1, ch2, tmp3);
4707         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4708         bics(tmp1, tmp1, tmp2);
4709         br(NE, HAS_ZERO);
4710         adds(cnt2_neg, cnt2_neg, 8);
4711         br(LT, CH1_LOOP);
4712 
4713         cmp(cnt2_neg, 8);
4714         mov(cnt2_neg, 0);
4715         br(LT, CH1_LOOP);
4716         b(NOMATCH);
4717 
4718       BIND(HAS_ZERO);
4719         rev(tmp1, tmp1);
4720         clz(tmp1, tmp1);
4721         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4722         b(MATCH);
4723 
4724       BIND(DO1_SHORT);
4725         mov(result_tmp, cnt2);
4726         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4727         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4728       BIND(DO1_LOOP);
4729         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4730         cmpw(ch1, ch2);
4731         br(EQ, MATCH);
4732         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4733         br(LT, DO1_LOOP);
4734     }
4735   }
4736   BIND(NOMATCH);
4737     mov(result, -1);
4738     b(DONE);
4739   BIND(MATCH);
4740     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4741   BIND(DONE);
4742 }
4743 
4744 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4745 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4746 
4747 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4748                                          Register ch, Register result,
4749                                          Register tmp1, Register tmp2, Register tmp3)
4750 {
4751   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4752   Register cnt1_neg = cnt1;
4753   Register ch1 = rscratch1;
4754   Register result_tmp = rscratch2;
4755 
4756   cmp(cnt1, 4);
4757   br(LT, DO1_SHORT);
4758 
4759   orr(ch, ch, ch, LSL, 16);
4760   orr(ch, ch, ch, LSL, 32);
4761 
4762   sub(cnt1, cnt1, 4);
4763   mov(result_tmp, cnt1);
4764   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4765   sub(cnt1_neg, zr, cnt1, LSL, 1);
4766 
4767   mov(tmp3, 0x0001000100010001);
4768 
4769   BIND(CH1_LOOP);
4770     ldr(ch1, Address(str1, cnt1_neg));
4771     eor(ch1, ch, ch1);
4772     sub(tmp1, ch1, tmp3);
4773     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4774     bics(tmp1, tmp1, tmp2);
4775     br(NE, HAS_ZERO);
4776     adds(cnt1_neg, cnt1_neg, 8);
4777     br(LT, CH1_LOOP);
4778 
4779     cmp(cnt1_neg, 8);
4780     mov(cnt1_neg, 0);
4781     br(LT, CH1_LOOP);
4782     b(NOMATCH);
4783 
4784   BIND(HAS_ZERO);
4785     rev(tmp1, tmp1);
4786     clz(tmp1, tmp1);
4787     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4788     b(MATCH);
4789 
4790   BIND(DO1_SHORT);
4791     mov(result_tmp, cnt1);
4792     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4793     sub(cnt1_neg, zr, cnt1, LSL, 1);
4794   BIND(DO1_LOOP);
4795     ldrh(ch1, Address(str1, cnt1_neg));
4796     cmpw(ch, ch1);
4797     br(EQ, MATCH);
4798     adds(cnt1_neg, cnt1_neg, 2);
4799     br(LT, DO1_LOOP);
4800   BIND(NOMATCH);
4801     mov(result, -1);
4802     b(DONE);
4803   BIND(MATCH);
4804     add(result, result_tmp, cnt1_neg, ASR, 1);
4805   BIND(DONE);
4806 }
4807 
4808 // Compare strings.
4809 void MacroAssembler::string_compare(Register str1, Register str2,
4810     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4811     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4812   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4813       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4814       SHORT_LOOP_START, TAIL_CHECK;
4815 
4816   const int STUB_THRESHOLD = 64 + 8;
4817   bool isLL = ae == StrIntrinsicNode::LL;
4818   bool isLU = ae == StrIntrinsicNode::LU;
4819   bool isUL = ae == StrIntrinsicNode::UL;
4820 
4821   bool str1_isL = isLL || isLU;
4822   bool str2_isL = isLL || isUL;
4823 
4824   int str1_chr_shift = str1_isL ? 0 : 1;
4825   int str2_chr_shift = str2_isL ? 0 : 1;
4826   int str1_chr_size = str1_isL ? 1 : 2;
4827   int str2_chr_size = str2_isL ? 1 : 2;
4828   int minCharsInWord = isLL ? wordSize : wordSize/2;
4829 
4830   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4831   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4832                                       (chr_insn)&MacroAssembler::ldrh;
4833   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4834                                       (chr_insn)&MacroAssembler::ldrh;
4835   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4836                             (uxt_insn)&MacroAssembler::uxthw;
4837 
4838   BLOCK_COMMENT("string_compare {");
4839 
4840   // Bizzarely, the counts are passed in bytes, regardless of whether they
4841   // are L or U strings, however the result is always in characters.
4842   if (!str1_isL) asrw(cnt1, cnt1, 1);
4843   if (!str2_isL) asrw(cnt2, cnt2, 1);
4844 
4845   // Compute the minimum of the string lengths and save the difference.
4846   subsw(result, cnt1, cnt2);
4847   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4848 
4849   // A very short string
4850   cmpw(cnt2, minCharsInWord);
4851   br(Assembler::LT, SHORT_STRING);
4852 
4853   // Compare longwords
4854   // load first parts of strings and finish initialization while loading
4855   {
4856     if (str1_isL == str2_isL) { // LL or UU
4857       ldr(tmp1, Address(str1));
4858       cmp(str1, str2);
4859       br(Assembler::EQ, DONE);
4860       ldr(tmp2, Address(str2));
4861       cmp(cnt2, STUB_THRESHOLD);
4862       br(GE, STUB);
4863       subsw(cnt2, cnt2, minCharsInWord);
4864       br(EQ, TAIL_CHECK);
4865       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4866       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4867       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4868     } else if (isLU) {
4869       ldrs(vtmp, Address(str1));
4870       cmp(str1, str2);
4871       br(Assembler::EQ, DONE);
4872       ldr(tmp2, Address(str2));
4873       cmp(cnt2, STUB_THRESHOLD);
4874       br(GE, STUB);
4875       subsw(cnt2, cnt2, 4);
4876       br(EQ, TAIL_CHECK);
4877       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4878       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4879       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4880       zip1(vtmp, T8B, vtmp, vtmpZ);
4881       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4882       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4883       add(cnt1, cnt1, 4);
4884       fmovd(tmp1, vtmp);
4885     } else { // UL case
4886       ldr(tmp1, Address(str1));
4887       cmp(str1, str2);
4888       br(Assembler::EQ, DONE);
4889       ldrs(vtmp, Address(str2));
4890       cmp(cnt2, STUB_THRESHOLD);
4891       br(GE, STUB);
4892       subsw(cnt2, cnt2, 4);
4893       br(EQ, TAIL_CHECK);
4894       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4895       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4896       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4897       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4898       zip1(vtmp, T8B, vtmp, vtmpZ);
4899       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4900       add(cnt1, cnt1, 8);
4901       fmovd(tmp2, vtmp);
4902     }
4903     adds(cnt2, cnt2, isUL ? 4 : 8);
4904     br(GE, TAIL);
4905     eor(rscratch2, tmp1, tmp2);
4906     cbnz(rscratch2, DIFFERENCE);
4907     // main loop
4908     bind(NEXT_WORD);
4909     if (str1_isL == str2_isL) {
4910       ldr(tmp1, Address(str1, cnt2));
4911       ldr(tmp2, Address(str2, cnt2));
4912       adds(cnt2, cnt2, 8);
4913     } else if (isLU) {
4914       ldrs(vtmp, Address(str1, cnt1));
4915       ldr(tmp2, Address(str2, cnt2));
4916       add(cnt1, cnt1, 4);
4917       zip1(vtmp, T8B, vtmp, vtmpZ);
4918       fmovd(tmp1, vtmp);
4919       adds(cnt2, cnt2, 8);
4920     } else { // UL
4921       ldrs(vtmp, Address(str2, cnt2));
4922       ldr(tmp1, Address(str1, cnt1));
4923       zip1(vtmp, T8B, vtmp, vtmpZ);
4924       add(cnt1, cnt1, 8);
4925       fmovd(tmp2, vtmp);
4926       adds(cnt2, cnt2, 4);
4927     }
4928     br(GE, TAIL);
4929 
4930     eor(rscratch2, tmp1, tmp2);
4931     cbz(rscratch2, NEXT_WORD);
4932     b(DIFFERENCE);
4933     bind(TAIL);
4934     eor(rscratch2, tmp1, tmp2);
4935     cbnz(rscratch2, DIFFERENCE);
4936     // Last longword.  In the case where length == 4 we compare the
4937     // same longword twice, but that's still faster than another
4938     // conditional branch.
4939     if (str1_isL == str2_isL) {
4940       ldr(tmp1, Address(str1));
4941       ldr(tmp2, Address(str2));
4942     } else if (isLU) {
4943       ldrs(vtmp, Address(str1));
4944       ldr(tmp2, Address(str2));
4945       zip1(vtmp, T8B, vtmp, vtmpZ);
4946       fmovd(tmp1, vtmp);
4947     } else { // UL
4948       ldrs(vtmp, Address(str2));
4949       ldr(tmp1, Address(str1));
4950       zip1(vtmp, T8B, vtmp, vtmpZ);
4951       fmovd(tmp2, vtmp);
4952     }
4953     bind(TAIL_CHECK);
4954     eor(rscratch2, tmp1, tmp2);
4955     cbz(rscratch2, DONE);
4956 
4957     // Find the first different characters in the longwords and
4958     // compute their difference.
4959     bind(DIFFERENCE);
4960     rev(rscratch2, rscratch2);
4961     clz(rscratch2, rscratch2);
4962     andr(rscratch2, rscratch2, isLL ? -8 : -16);
4963     lsrv(tmp1, tmp1, rscratch2);
4964     (this->*ext_chr)(tmp1, tmp1);
4965     lsrv(tmp2, tmp2, rscratch2);
4966     (this->*ext_chr)(tmp2, tmp2);
4967     subw(result, tmp1, tmp2);
4968     b(DONE);
4969   }
4970 
4971   bind(STUB);
4972     RuntimeAddress stub = NULL;
4973     switch(ae) {
4974       case StrIntrinsicNode::LL:
4975         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
4976         break;
4977       case StrIntrinsicNode::UU:
4978         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
4979         break;
4980       case StrIntrinsicNode::LU:
4981         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
4982         break;
4983       case StrIntrinsicNode::UL:
4984         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
4985         break;
4986       default:
4987         ShouldNotReachHere();
4988      }
4989     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
4990     trampoline_call(stub);
4991     b(DONE);
4992 
4993   bind(SHORT_STRING);
4994   // Is the minimum length zero?
4995   cbz(cnt2, DONE);
4996   // arrange code to do most branches while loading and loading next characters
4997   // while comparing previous
4998   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
4999   subs(cnt2, cnt2, 1);
5000   br(EQ, SHORT_LAST_INIT);
5001   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5002   b(SHORT_LOOP_START);
5003   bind(SHORT_LOOP);
5004   subs(cnt2, cnt2, 1);
5005   br(EQ, SHORT_LAST);
5006   bind(SHORT_LOOP_START);
5007   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
5008   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
5009   cmp(tmp1, cnt1);
5010   br(NE, SHORT_LOOP_TAIL);
5011   subs(cnt2, cnt2, 1);
5012   br(EQ, SHORT_LAST2);
5013   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5014   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5015   cmp(tmp2, rscratch1);
5016   br(EQ, SHORT_LOOP);
5017   sub(result, tmp2, rscratch1);
5018   b(DONE);
5019   bind(SHORT_LOOP_TAIL);
5020   sub(result, tmp1, cnt1);
5021   b(DONE);
5022   bind(SHORT_LAST2);
5023   cmp(tmp2, rscratch1);
5024   br(EQ, DONE);
5025   sub(result, tmp2, rscratch1);
5026 
5027   b(DONE);
5028   bind(SHORT_LAST_INIT);
5029   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5030   bind(SHORT_LAST);
5031   cmp(tmp1, cnt1);
5032   br(EQ, DONE);
5033   sub(result, tmp1, cnt1);
5034 
5035   bind(DONE);
5036 
5037   BLOCK_COMMENT("} string_compare");
5038 }
5039 
5040 // This method checks if provided byte array contains byte with highest bit set.
5041 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
5042     // Simple and most common case of aligned small array which is not at the
5043     // end of memory page is placed here. All other cases are in stub.
5044     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
5045     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5046     assert_different_registers(ary1, len, result);
5047 
5048     cmpw(len, 0);
5049     br(LE, SET_RESULT);
5050     cmpw(len, 4 * wordSize);
5051     br(GE, STUB_LONG); // size > 32 then go to stub
5052 
5053     int shift = 64 - exact_log2(os::vm_page_size());
5054     lsl(rscratch1, ary1, shift);
5055     mov(rscratch2, (size_t)(4 * wordSize) << shift);
5056     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
5057     br(CS, STUB); // at the end of page then go to stub
5058     subs(len, len, wordSize);
5059     br(LT, END);
5060 
5061   BIND(LOOP);
5062     ldr(rscratch1, Address(post(ary1, wordSize)));
5063     tst(rscratch1, UPPER_BIT_MASK);
5064     br(NE, SET_RESULT);
5065     subs(len, len, wordSize);
5066     br(GE, LOOP);
5067     cmpw(len, -wordSize);
5068     br(EQ, SET_RESULT);
5069 
5070   BIND(END);
5071     ldr(result, Address(ary1));
5072     sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5073     lslv(result, result, len);
5074     tst(result, UPPER_BIT_MASK);
5075     b(SET_RESULT);
5076 
5077   BIND(STUB);
5078     RuntimeAddress has_neg =  RuntimeAddress(StubRoutines::aarch64::has_negatives());
5079     assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5080     trampoline_call(has_neg);
5081     b(DONE);
5082 
5083   BIND(STUB_LONG);
5084     RuntimeAddress has_neg_long =  RuntimeAddress(
5085             StubRoutines::aarch64::has_negatives_long());
5086     assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
5087     trampoline_call(has_neg_long);
5088     b(DONE);
5089 
5090   BIND(SET_RESULT);
5091     cset(result, NE); // set true or false
5092 
5093   BIND(DONE);
5094 }
5095 
5096 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5097                                    Register tmp4, Register tmp5, Register result,
5098                                    Register cnt1, int elem_size) {
5099   Label DONE, SAME;
5100   Register tmp1 = rscratch1;
5101   Register tmp2 = rscratch2;
5102   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5103   int elem_per_word = wordSize/elem_size;
5104   int log_elem_size = exact_log2(elem_size);
5105   int length_offset = arrayOopDesc::length_offset_in_bytes();
5106   int base_offset
5107     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5108   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5109 
5110   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5111   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5112 
5113 #ifndef PRODUCT
5114   {
5115     const char kind = (elem_size == 2) ? 'U' : 'L';
5116     char comment[64];
5117     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5118     BLOCK_COMMENT(comment);
5119   }
5120 #endif
5121 
5122   // if (a1 == a2)
5123   //     return true;
5124   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5125   br(EQ, SAME);
5126 
5127   if (UseSimpleArrayEquals) {
5128     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5129     // if (a1 == null || a2 == null)
5130     //     return false;
5131     // a1 & a2 == 0 means (some-pointer is null) or
5132     // (very-rare-or-even-probably-impossible-pointer-values)
5133     // so, we can save one branch in most cases
5134     tst(a1, a2);
5135     mov(result, false);
5136     br(EQ, A_MIGHT_BE_NULL);
5137     // if (a1.length != a2.length)
5138     //      return false;
5139     bind(A_IS_NOT_NULL);
5140     ldrw(cnt1, Address(a1, length_offset));
5141     ldrw(cnt2, Address(a2, length_offset));
5142     eorw(tmp5, cnt1, cnt2);
5143     cbnzw(tmp5, DONE);
5144     lea(a1, Address(a1, base_offset));
5145     lea(a2, Address(a2, base_offset));
5146     // Check for short strings, i.e. smaller than wordSize.
5147     subs(cnt1, cnt1, elem_per_word);
5148     br(Assembler::LT, SHORT);
5149     // Main 8 byte comparison loop.
5150     bind(NEXT_WORD); {
5151       ldr(tmp1, Address(post(a1, wordSize)));
5152       ldr(tmp2, Address(post(a2, wordSize)));
5153       subs(cnt1, cnt1, elem_per_word);
5154       eor(tmp5, tmp1, tmp2);
5155       cbnz(tmp5, DONE);
5156     } br(GT, NEXT_WORD);
5157     // Last longword.  In the case where length == 4 we compare the
5158     // same longword twice, but that's still faster than another
5159     // conditional branch.
5160     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5161     // length == 4.
5162     if (log_elem_size > 0)
5163       lsl(cnt1, cnt1, log_elem_size);
5164     ldr(tmp3, Address(a1, cnt1));
5165     ldr(tmp4, Address(a2, cnt1));
5166     eor(tmp5, tmp3, tmp4);
5167     cbnz(tmp5, DONE);
5168     b(SAME);
5169     bind(A_MIGHT_BE_NULL);
5170     // in case both a1 and a2 are not-null, proceed with loads
5171     cbz(a1, DONE);
5172     cbz(a2, DONE);
5173     b(A_IS_NOT_NULL);
5174     bind(SHORT);
5175 
5176     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5177     {
5178       ldrw(tmp1, Address(post(a1, 4)));
5179       ldrw(tmp2, Address(post(a2, 4)));
5180       eorw(tmp5, tmp1, tmp2);
5181       cbnzw(tmp5, DONE);
5182     }
5183     bind(TAIL03);
5184     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5185     {
5186       ldrh(tmp3, Address(post(a1, 2)));
5187       ldrh(tmp4, Address(post(a2, 2)));
5188       eorw(tmp5, tmp3, tmp4);
5189       cbnzw(tmp5, DONE);
5190     }
5191     bind(TAIL01);
5192     if (elem_size == 1) { // Only needed when comparing byte arrays.
5193       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5194       {
5195         ldrb(tmp1, a1);
5196         ldrb(tmp2, a2);
5197         eorw(tmp5, tmp1, tmp2);
5198         cbnzw(tmp5, DONE);
5199       }
5200     }
5201   } else {
5202     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
5203         CSET_EQ, LAST_CHECK;
5204     mov(result, false);
5205     cbz(a1, DONE);
5206     ldrw(cnt1, Address(a1, length_offset));
5207     cbz(a2, DONE);
5208     ldrw(cnt2, Address(a2, length_offset));
5209     // on most CPUs a2 is still "locked"(surprisingly) in ldrw and it's
5210     // faster to perform another branch before comparing a1 and a2
5211     cmp(cnt1, elem_per_word);
5212     br(LE, SHORT); // short or same
5213     ldr(tmp3, Address(pre(a1, base_offset)));
5214     cmp(cnt1, stubBytesThreshold);
5215     br(GE, STUB);
5216     ldr(tmp4, Address(pre(a2, base_offset)));
5217     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5218     cmp(cnt2, cnt1);
5219     br(NE, DONE);
5220 
5221     // Main 16 byte comparison loop with 2 exits
5222     bind(NEXT_DWORD); {
5223       ldr(tmp1, Address(pre(a1, wordSize)));
5224       ldr(tmp2, Address(pre(a2, wordSize)));
5225       subs(cnt1, cnt1, 2 * elem_per_word);
5226       br(LE, TAIL);
5227       eor(tmp4, tmp3, tmp4);
5228       cbnz(tmp4, DONE);
5229       ldr(tmp3, Address(pre(a1, wordSize)));
5230       ldr(tmp4, Address(pre(a2, wordSize)));
5231       cmp(cnt1, elem_per_word);
5232       br(LE, TAIL2);
5233       cmp(tmp1, tmp2);
5234     } br(EQ, NEXT_DWORD);
5235     b(DONE);
5236 
5237     bind(TAIL);
5238     eor(tmp4, tmp3, tmp4);
5239     eor(tmp2, tmp1, tmp2);
5240     lslv(tmp2, tmp2, tmp5);
5241     orr(tmp5, tmp4, tmp2);
5242     cmp(tmp5, zr);
5243     b(CSET_EQ);
5244 
5245     bind(TAIL2);
5246     eor(tmp2, tmp1, tmp2);
5247     cbnz(tmp2, DONE);
5248     b(LAST_CHECK);
5249 
5250     bind(STUB);
5251     ldr(tmp4, Address(pre(a2, base_offset)));
5252     cmp(cnt2, cnt1);
5253     br(NE, DONE);
5254     if (elem_size == 2) { // convert to byte counter
5255       lsl(cnt1, cnt1, 1);
5256     }
5257     eor(tmp5, tmp3, tmp4);
5258     cbnz(tmp5, DONE);
5259     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5260     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5261     trampoline_call(stub);
5262     b(DONE);
5263 
5264     bind(EARLY_OUT);
5265     // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
5266     // so, if a2 == null => return false(0), else return true, so we can return a2
5267     mov(result, a2);
5268     b(DONE);
5269     bind(SHORT);
5270     cmp(cnt2, cnt1);
5271     br(NE, DONE);
5272     cbz(cnt1, SAME);
5273     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5274     ldr(tmp3, Address(a1, base_offset));
5275     ldr(tmp4, Address(a2, base_offset));
5276     bind(LAST_CHECK);
5277     eor(tmp4, tmp3, tmp4);
5278     lslv(tmp5, tmp4, tmp5);
5279     cmp(tmp5, zr);
5280     bind(CSET_EQ);
5281     cset(result, EQ);
5282     b(DONE);
5283   }
5284 
5285   bind(SAME);
5286   mov(result, true);
5287   // That's it.
5288   bind(DONE);
5289 
5290   BLOCK_COMMENT("} array_equals");
5291 }
5292 
5293 // Compare Strings
5294 
5295 // For Strings we're passed the address of the first characters in a1
5296 // and a2 and the length in cnt1.
5297 // elem_size is the element size in bytes: either 1 or 2.
5298 // There are two implementations.  For arrays >= 8 bytes, all
5299 // comparisons (including the final one, which may overlap) are
5300 // performed 8 bytes at a time.  For strings < 8 bytes, we compare a
5301 // halfword, then a short, and then a byte.
5302 
5303 void MacroAssembler::string_equals(Register a1, Register a2,
5304                                    Register result, Register cnt1, int elem_size)
5305 {
5306   Label SAME, DONE, SHORT, NEXT_WORD;
5307   Register tmp1 = rscratch1;
5308   Register tmp2 = rscratch2;
5309   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5310 
5311   assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
5312   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5313 
5314 #ifndef PRODUCT
5315   {
5316     const char kind = (elem_size == 2) ? 'U' : 'L';
5317     char comment[64];
5318     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5319     BLOCK_COMMENT(comment);
5320   }
5321 #endif
5322 
5323   mov(result, false);
5324 
5325   // Check for short strings, i.e. smaller than wordSize.
5326   subs(cnt1, cnt1, wordSize);
5327   br(Assembler::LT, SHORT);
5328   // Main 8 byte comparison loop.
5329   bind(NEXT_WORD); {
5330     ldr(tmp1, Address(post(a1, wordSize)));
5331     ldr(tmp2, Address(post(a2, wordSize)));
5332     subs(cnt1, cnt1, wordSize);
5333     eor(tmp1, tmp1, tmp2);
5334     cbnz(tmp1, DONE);
5335   } br(GT, NEXT_WORD);
5336   // Last longword.  In the case where length == 4 we compare the
5337   // same longword twice, but that's still faster than another
5338   // conditional branch.
5339   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5340   // length == 4.
5341   ldr(tmp1, Address(a1, cnt1));
5342   ldr(tmp2, Address(a2, cnt1));
5343   eor(tmp2, tmp1, tmp2);
5344   cbnz(tmp2, DONE);
5345   b(SAME);
5346 
5347   bind(SHORT);
5348   Label TAIL03, TAIL01;
5349 
5350   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5351   {
5352     ldrw(tmp1, Address(post(a1, 4)));
5353     ldrw(tmp2, Address(post(a2, 4)));
5354     eorw(tmp1, tmp1, tmp2);
5355     cbnzw(tmp1, DONE);
5356   }
5357   bind(TAIL03);
5358   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5359   {
5360     ldrh(tmp1, Address(post(a1, 2)));
5361     ldrh(tmp2, Address(post(a2, 2)));
5362     eorw(tmp1, tmp1, tmp2);
5363     cbnzw(tmp1, DONE);
5364   }
5365   bind(TAIL01);
5366   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5367     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5368     {
5369       ldrb(tmp1, a1);
5370       ldrb(tmp2, a2);
5371       eorw(tmp1, tmp1, tmp2);
5372       cbnzw(tmp1, DONE);
5373     }
5374   }
5375   // Arrays are equal.
5376   bind(SAME);
5377   mov(result, true);
5378 
5379   // That's it.
5380   bind(DONE);
5381   BLOCK_COMMENT("} string_equals");
5382 }
5383 
5384 
5385 // The size of the blocks erased by the zero_blocks stub.  We must
5386 // handle anything smaller than this ourselves in zero_words().
5387 const int MacroAssembler::zero_words_block_size = 8;
5388 
5389 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5390 // possible, handling small word counts locally and delegating
5391 // anything larger to the zero_blocks stub.  It is expanded many times
5392 // in compiled code, so it is important to keep it short.
5393 
5394 // ptr:   Address of a buffer to be zeroed.
5395 // cnt:   Count in HeapWords.
5396 //
5397 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5398 void MacroAssembler::zero_words(Register ptr, Register cnt)
5399 {
5400   assert(is_power_of_2(zero_words_block_size), "adjust this");
5401   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5402 
5403   BLOCK_COMMENT("zero_words {");
5404   cmp(cnt, zero_words_block_size);
5405   Label around, done, done16;
5406   br(LO, around);
5407   {
5408     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5409     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5410     if (StubRoutines::aarch64::complete()) {
5411       trampoline_call(zero_blocks);
5412     } else {
5413       bl(zero_blocks);
5414     }
5415   }
5416   bind(around);
5417   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5418     Label l;
5419     tbz(cnt, exact_log2(i), l);
5420     for (int j = 0; j < i; j += 2) {
5421       stp(zr, zr, post(ptr, 16));
5422     }
5423     bind(l);
5424   }
5425   {
5426     Label l;
5427     tbz(cnt, 0, l);
5428     str(zr, Address(ptr));
5429     bind(l);
5430   }
5431   BLOCK_COMMENT("} zero_words");
5432 }
5433 
5434 // base:         Address of a buffer to be zeroed, 8 bytes aligned.
5435 // cnt:          Immediate count in HeapWords.
5436 #define SmallArraySize (18 * BytesPerLong)
5437 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
5438 {
5439   BLOCK_COMMENT("zero_words {");
5440   int i = cnt & 1;  // store any odd word to start
5441   if (i) str(zr, Address(base));
5442 
5443   if (cnt <= SmallArraySize / BytesPerLong) {
5444     for (; i < (int)cnt; i += 2)
5445       stp(zr, zr, Address(base, i * wordSize));
5446   } else {
5447     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
5448     int remainder = cnt % (2 * unroll);
5449     for (; i < remainder; i += 2)
5450       stp(zr, zr, Address(base, i * wordSize));
5451 
5452     Label loop;
5453     Register cnt_reg = rscratch1;
5454     Register loop_base = rscratch2;
5455     cnt = cnt - remainder;
5456     mov(cnt_reg, cnt);
5457     // adjust base and prebias by -2 * wordSize so we can pre-increment
5458     add(loop_base, base, (remainder - 2) * wordSize);
5459     bind(loop);
5460     sub(cnt_reg, cnt_reg, 2 * unroll);
5461     for (i = 1; i < unroll; i++)
5462       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
5463     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
5464     cbnz(cnt_reg, loop);
5465   }
5466   BLOCK_COMMENT("} zero_words");
5467 }
5468 
5469 // Zero blocks of memory by using DC ZVA.
5470 //
5471 // Aligns the base address first sufficently for DC ZVA, then uses
5472 // DC ZVA repeatedly for every full block.  cnt is the size to be
5473 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5474 // in cnt.
5475 //
5476 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5477 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
5478 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5479   Register tmp = rscratch1;
5480   Register tmp2 = rscratch2;
5481   int zva_length = VM_Version::zva_length();
5482   Label initial_table_end, loop_zva;
5483   Label fini;
5484 
5485   // Base must be 16 byte aligned. If not just return and let caller handle it
5486   tst(base, 0x0f);
5487   br(Assembler::NE, fini);
5488   // Align base with ZVA length.
5489   neg(tmp, base);
5490   andr(tmp, tmp, zva_length - 1);
5491 
5492   // tmp: the number of bytes to be filled to align the base with ZVA length.
5493   add(base, base, tmp);
5494   sub(cnt, cnt, tmp, Assembler::ASR, 3);
5495   adr(tmp2, initial_table_end);
5496   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5497   br(tmp2);
5498 
5499   for (int i = -zva_length + 16; i < 0; i += 16)
5500     stp(zr, zr, Address(base, i));
5501   bind(initial_table_end);
5502 
5503   sub(cnt, cnt, zva_length >> 3);
5504   bind(loop_zva);
5505   dc(Assembler::ZVA, base);
5506   subs(cnt, cnt, zva_length >> 3);
5507   add(base, base, zva_length);
5508   br(Assembler::GE, loop_zva);
5509   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5510   bind(fini);
5511 }
5512 
5513 // base:   Address of a buffer to be filled, 8 bytes aligned.
5514 // cnt:    Count in 8-byte unit.
5515 // value:  Value to be filled with.
5516 // base will point to the end of the buffer after filling.
5517 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5518 {
5519 //  Algorithm:
5520 //
5521 //    scratch1 = cnt & 7;
5522 //    cnt -= scratch1;
5523 //    p += scratch1;
5524 //    switch (scratch1) {
5525 //      do {
5526 //        cnt -= 8;
5527 //          p[-8] = v;
5528 //        case 7:
5529 //          p[-7] = v;
5530 //        case 6:
5531 //          p[-6] = v;
5532 //          // ...
5533 //        case 1:
5534 //          p[-1] = v;
5535 //        case 0:
5536 //          p += 8;
5537 //      } while (cnt);
5538 //    }
5539 
5540   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5541 
5542   Label fini, skip, entry, loop;
5543   const int unroll = 8; // Number of stp instructions we'll unroll
5544 
5545   cbz(cnt, fini);
5546   tbz(base, 3, skip);
5547   str(value, Address(post(base, 8)));
5548   sub(cnt, cnt, 1);
5549   bind(skip);
5550 
5551   andr(rscratch1, cnt, (unroll-1) * 2);
5552   sub(cnt, cnt, rscratch1);
5553   add(base, base, rscratch1, Assembler::LSL, 3);
5554   adr(rscratch2, entry);
5555   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5556   br(rscratch2);
5557 
5558   bind(loop);
5559   add(base, base, unroll * 16);
5560   for (int i = -unroll; i < 0; i++)
5561     stp(value, value, Address(base, i * 16));
5562   bind(entry);
5563   subs(cnt, cnt, unroll * 2);
5564   br(Assembler::GE, loop);
5565 
5566   tbz(cnt, 0, fini);
5567   str(value, Address(post(base, 8)));
5568   bind(fini);
5569 }
5570 
5571 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5572 // java/lang/StringUTF16.compress.
5573 void MacroAssembler::encode_iso_array(Register src, Register dst,
5574                       Register len, Register result,
5575                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5576                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5577 {
5578     Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
5579         NEXT_32_START, NEXT_32_PRFM_START;
5580     Register tmp1 = rscratch1, tmp2 = rscratch2;
5581 
5582       mov(result, len); // Save initial len
5583 
5584 #ifndef BUILTIN_SIM
5585       cmp(len, 8); // handle shortest strings first
5586       br(LT, LOOP_1);
5587       cmp(len, 32);
5588       br(LT, NEXT_8);
5589       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
5590       // to convert chars to bytes
5591       if (SoftwarePrefetchHintDistance >= 0) {
5592         ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5593         cmp(len, SoftwarePrefetchHintDistance/2 + 16);
5594         br(LE, NEXT_32_START);
5595         b(NEXT_32_PRFM_START);
5596         BIND(NEXT_32_PRFM);
5597           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5598         BIND(NEXT_32_PRFM_START);
5599           prfm(Address(src, SoftwarePrefetchHintDistance));
5600           orr(v4, T16B, Vtmp1, Vtmp2);
5601           orr(v5, T16B, Vtmp3, Vtmp4);
5602           uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
5603           uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
5604           stpq(Vtmp1, Vtmp3, dst);
5605           uzp2(v5, T16B, v4, v5); // high bytes
5606           umov(tmp2, v5, D, 1);
5607           fmovd(tmp1, v5);
5608           orr(tmp1, tmp1, tmp2);
5609           cbnz(tmp1, LOOP_8);
5610           sub(len, len, 32);
5611           add(dst, dst, 32);
5612           add(src, src, 64);
5613           cmp(len, SoftwarePrefetchHintDistance/2 + 16);
5614           br(GE, NEXT_32_PRFM);
5615           cmp(len, 32);
5616           br(LT, LOOP_8);
5617         BIND(NEXT_32);
5618           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5619         BIND(NEXT_32_START);
5620       } else {
5621         BIND(NEXT_32);
5622           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5623       }
5624       prfm(Address(src, SoftwarePrefetchHintDistance));
5625       uzp1(v4, T16B, Vtmp1, Vtmp2);
5626       uzp1(v5, T16B, Vtmp3, Vtmp4);
5627       stpq(v4, v5, dst);
5628       orr(Vtmp1, T16B, Vtmp1, Vtmp2);
5629       orr(Vtmp3, T16B, Vtmp3, Vtmp4);
5630       uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
5631       umov(tmp2, Vtmp1, D, 1);
5632       fmovd(tmp1, Vtmp1);
5633       orr(tmp1, tmp1, tmp2);
5634       cbnz(tmp1, LOOP_8);
5635       sub(len, len, 32);
5636       add(dst, dst, 32);
5637       add(src, src, 64);
5638       cmp(len, 32);
5639       br(GE, NEXT_32);
5640       cbz(len, DONE);
5641 
5642     BIND(LOOP_8);
5643       cmp(len, 8);
5644       br(LT, LOOP_1);
5645     BIND(NEXT_8);
5646       ld1(Vtmp1, T8H, src);
5647       uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
5648       uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
5649       strd(Vtmp2, dst);
5650       fmovd(tmp1, Vtmp3);
5651       cbnz(tmp1, NEXT_1);
5652 
5653       sub(len, len, 8);
5654       add(dst, dst, 8);
5655       add(src, src, 16);
5656       cmp(len, 8);
5657       br(GE, NEXT_8);
5658 
5659     BIND(LOOP_1);
5660 #endif
5661     cbz(len, DONE);
5662     BIND(NEXT_1);
5663       ldrh(tmp1, Address(post(src, 2)));
5664       strb(tmp1, Address(post(dst, 1)));
5665       tst(tmp1, 0xff00);
5666       br(NE, SET_RESULT);
5667       subs(len, len, 1);
5668       br(GT, NEXT_1);
5669 
5670     BIND(SET_RESULT);
5671       sub(result, result, len); // Return index where we stopped
5672                                 // Return len == 0 if we processed all
5673                                 // characters
5674     BIND(DONE);
5675 }
5676 
5677 
5678 // Inflate byte[] array to char[].
5679 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5680                                         FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
5681                                         Register tmp4) {
5682   Label big, done, after_init, to_stub;
5683 
5684   assert_different_registers(src, dst, len, tmp4, rscratch1);
5685 
5686   fmovd(vtmp1, zr);
5687   lsrw(tmp4, len, 3);
5688   bind(after_init);
5689   cbnzw(tmp4, big);
5690   // Short string: less than 8 bytes.
5691   {
5692     Label loop, tiny;
5693 
5694     cmpw(len, 4);
5695     br(LT, tiny);
5696     // Use SIMD to do 4 bytes.
5697     ldrs(vtmp2, post(src, 4));
5698     zip1(vtmp3, T8B, vtmp2, vtmp1);
5699     subw(len, len, 4);
5700     strd(vtmp3, post(dst, 8));
5701 
5702     cbzw(len, done);
5703 
5704     // Do the remaining bytes by steam.
5705     bind(loop);
5706     ldrb(tmp4, post(src, 1));
5707     strh(tmp4, post(dst, 2));
5708     subw(len, len, 1);
5709 
5710     bind(tiny);
5711     cbnz(len, loop);
5712 
5713     b(done);
5714   }
5715 
5716   if (SoftwarePrefetchHintDistance >= 0) {
5717     bind(to_stub);
5718       RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5719       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5720       trampoline_call(stub);
5721       b(after_init);
5722   }
5723 
5724   // Unpack the bytes 8 at a time.
5725   bind(big);
5726   {
5727     Label loop, around, loop_last, loop_start;
5728 
5729     if (SoftwarePrefetchHintDistance >= 0) {
5730       const int large_loop_threshold = (64 + 16)/8;
5731       ldrd(vtmp2, post(src, 8));
5732       andw(len, len, 7);
5733       cmp(tmp4, large_loop_threshold);
5734       br(GE, to_stub);
5735       b(loop_start);
5736 
5737       bind(loop);
5738       ldrd(vtmp2, post(src, 8));
5739       bind(loop_start);
5740       subs(tmp4, tmp4, 1);
5741       br(EQ, loop_last);
5742       zip1(vtmp2, T16B, vtmp2, vtmp1);
5743       ldrd(vtmp3, post(src, 8));
5744       st1(vtmp2, T8H, post(dst, 16));
5745       subs(tmp4, tmp4, 1);
5746       zip1(vtmp3, T16B, vtmp3, vtmp1);
5747       st1(vtmp3, T8H, post(dst, 16));
5748       br(NE, loop);
5749       b(around);
5750       bind(loop_last);
5751       zip1(vtmp2, T16B, vtmp2, vtmp1);
5752       st1(vtmp2, T8H, post(dst, 16));
5753       bind(around);
5754       cbz(len, done);
5755     } else {
5756       andw(len, len, 7);
5757       bind(loop);
5758       ldrd(vtmp2, post(src, 8));
5759       sub(tmp4, tmp4, 1);
5760       zip1(vtmp3, T16B, vtmp2, vtmp1);
5761       st1(vtmp3, T8H, post(dst, 16));
5762       cbnz(tmp4, loop);
5763     }
5764   }
5765 
5766   // Do the tail of up to 8 bytes.
5767   add(src, src, len);
5768   ldrd(vtmp3, Address(src, -8));
5769   add(dst, dst, len, ext::uxtw, 1);
5770   zip1(vtmp3, T16B, vtmp3, vtmp1);
5771   strq(vtmp3, Address(dst, -16));
5772 
5773   bind(done);
5774 }
5775 
5776 // Compress char[] array to byte[].
5777 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5778                                          FloatRegister tmp1Reg, FloatRegister tmp2Reg,
5779                                          FloatRegister tmp3Reg, FloatRegister tmp4Reg,
5780                                          Register result) {
5781   encode_iso_array(src, dst, len, result,
5782                    tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
5783   cmp(len, zr);
5784   csel(result, result, zr, EQ);
5785 }
5786 
5787 // get_thread() can be called anywhere inside generated code so we
5788 // need to save whatever non-callee save context might get clobbered
5789 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5790 // the call setup code.
5791 //
5792 // aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5793 //
5794 void MacroAssembler::get_thread(Register dst) {
5795   RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
5796   push(saved_regs, sp);
5797 
5798   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5799   blrt(lr, 1, 0, 1);
5800   if (dst != c_rarg0) {
5801     mov(dst, c_rarg0);
5802   }
5803 
5804   pop(saved_regs, sp);
5805 }