/*
 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target-branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                    21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                   Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}
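
// Illustrative sketch (not part of the build): how the patching
// arithmetic above plays out for an unconditional branch.  Suppose a
// hypothetical B instruction at 0x1000 must be retargeted to 0x2000:
//
//   offset = (0x2000 - 0x1000) >> 2 = 0x400    // word-scaled delta
//   spatch(branch, 25, 0, 0x400)               // rewrite the imm26 field
//
// The >> 2 is safe because every AArch64 instruction is 4 bytes, so the
// low two bits of an instruction-to-instruction delta are always zero.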

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}
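
// Worked example (illustrative values only): a wide OOP such as
// 0x00007f1234567890 is split into three 16-bit chunks, least
// significant first, matching the movz/movk/movk sequence of movptr():
//
//   movz x0, #0x7890             // bits 15..0
//   movk x0, #0x3456, lsl #16    // bits 31..16
//   movk x0, #0x7f12, lsl #32    // bits 47..32
//
// which is also why wide OOPs are limited to 48 bits here.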

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
               Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}
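
// Illustrative decode (assumed values, not a real trace): for an
// adrp/add pair at insn_addr = 0x10000 whose adrp immediate encodes a
// page delta of 3 and whose add immediate is 0x40:
//
//   offset      = 3 << 12                        = 0x3000
//   target_page = (0x10000 + 0x3000) & ~0xfffUL  = 0x13000
//   result      = target_page + 0x40             = 0x13040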

void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  dsb(Assembler::SY);
}

void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}
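
// Emitted sequence for the thread-local case, as a sketch (in this port
// rthread is r28 and rscratch1 is r8):
//
//   ldr  x8, [x28, #polling_page_offset]
//   tbnz x8, #exact_log2(poll_bit), slow_path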

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp and sp of the last Java frame have to
// be recorded in the (thread-local) JavaThread object. When leaving C
// land, the last Java fp has to be reset to 0. This is required to
// allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  if (last_java_pc != NULL) {
    adr(scratch, last_java_pc);
  } else {
    // FIXME: This is almost never correct.  We should delete all
    // cases of set_last_Java_frame with last_java_pc=NULL and use the
    // correct return address instead.
    adr(scratch, pc());
  }

  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}
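
// Far-branch sketch (illustrative): when far_branches() is true, a jump
// to a code-cache address `dest` is emitted as
//
//   adrp tmp, dest_page            // +/- 4GB reach, 4KB page granularity
//   add  tmp, tmp, dest & 0xfff    // restore the low 12 bits
//   br   tmp
//
// instead of the single `b dest`, whose immediate form only reaches
// +/- 128MB.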

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}
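
// For reference, the biased mark word layout assumed above (see
// markOop.hpp), most significant bits first:
//
//   [ JavaThread* (54) | epoch (2) | unused (1) | age (4) | biased_lock (1) | lock (2) ]
//
// biased_lock_pattern (0b101) in the low three bits marks a biasable
// header, and the eor against prototype_header | rthread leaves set bits
// only where owner, epoch or pattern disagree, which are exactly the
// fields the branches above then test.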

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    bool in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  address stub = start_a_stub(Compile::MAX_stubs_size/2);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}
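
// Resulting stub layout, as a sketch (rscratch1 is r8 in this port):
//
//   <stub start>:   ldr  x8, <pool>    // PC-relative literal load
//                   br   x8
//   <pool>:         .quad <dest>       // at NativeCallTrampolineStub::data_offset
//
// The caller's bl is resolved to this stub whenever <dest> lies outside
// the +/- 128MB range of a direct branch-and-link.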

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}
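
// E.g. align(16) at offset 12 emits a single nop so that the next
// instruction starts on a 16-byte boundary; at offset 16 it emits
// nothing.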

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


void MacroAssembler::notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler::notify(type);
    // reset_last_Java_frame(true);
  } else {
    Assembler::notify(type);
  }
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}
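
// Itable shape being scanned above, as a sketch (one row per
// itableOffsetEntry, laid out after the vtable inside the Klass):
//
//   recv_klass + vtable_start_offset + vtable_length * 8 ->
//       { interface_0, offset_0 }
//       { interface_1, offset_1 }
//       ...
//       { NULL, ... }            // terminator checked by cbz above
//
// On a hit, offset_i locates the itableMethodEntry array within the
// same klass; since recv_klass was pre-adjusted by the scaled
// itable_index, a single load then yields the Method*.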

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    cmp(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// scans count 4 byte words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                 Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}
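
// Typical use (see check_klass_subtype_slow_path below): the caller
// pre-clears the Z flag, e.g. with cmp(sp, zr), and branches on the
// flags afterwards, since both loops leave EQ set exactly when value
// was found before count ran out.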

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  mov(r0, reg);
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    lea(r0, addr);
    ldr(r0, Address(r0, 4 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop_addr");
}

Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1358   int stackElementSize = Interpreter::stackElementSize;
1359   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1360 #ifdef ASSERT
1361   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1362   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1363 #endif
1364   if (arg_slot.is_constant()) {
1365     return Address(esp, arg_slot.as_constant() * stackElementSize
1366                    + offset);
1367   } else {
1368     add(rscratch1, esp, arg_slot.as_register(),
1369         ext::uxtx, exact_log2(stackElementSize));
1370     return Address(rscratch1, offset);
1371   }
1372 }
1373 
1374 void MacroAssembler::call_VM_leaf_base(address entry_point,
1375                                        int number_of_arguments,
1376                                        Label *retaddr) {
1377   call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
1378 }
1379 
1380 void MacroAssembler::call_VM_leaf_base1(address entry_point,
1381                                         int number_of_gp_arguments,
1382                                         int number_of_fp_arguments,
1383                                         ret_type type,
1384                                         Label *retaddr) {
1385   Label E, L;
1386 
1387   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1388 
1389   // We add 1 to number_of_gp_arguments because the thread in arg0 is
1390   // not counted
1391   mov(rscratch1, entry_point);
1392   blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1393   if (retaddr)
1394     bind(*retaddr);
1395 
1396   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1397   maybe_isb();
1398 }
1399 
1400 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1401   call_VM_leaf_base(entry_point, number_of_arguments);
1402 }
1403 
1404 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1405   pass_arg0(this, arg_0);
1406   call_VM_leaf_base(entry_point, 1);
1407 }
1408 
1409 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1410   pass_arg0(this, arg_0);
1411   pass_arg1(this, arg_1);
1412   call_VM_leaf_base(entry_point, 2);
1413 }
1414 
1415 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1416                                   Register arg_1, Register arg_2) {
1417   pass_arg0(this, arg_0);
1418   pass_arg1(this, arg_1);
1419   pass_arg2(this, arg_2);
1420   call_VM_leaf_base(entry_point, 3);
1421 }
1422 
1423 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1424   pass_arg0(this, arg_0);
1425   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1426 }
1427 
1428 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1429 
1430   assert(arg_0 != c_rarg1, "smashed arg");
1431   pass_arg1(this, arg_1);
1432   pass_arg0(this, arg_0);
1433   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1434 }
1435 
1436 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1437   assert(arg_0 != c_rarg2, "smashed arg");
1438   assert(arg_1 != c_rarg2, "smashed arg");
1439   pass_arg2(this, arg_2);
1440   assert(arg_0 != c_rarg1, "smashed arg");
1441   pass_arg1(this, arg_1);
1442   pass_arg0(this, arg_0);
1443   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1444 }
1445 
1446 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1447   assert(arg_0 != c_rarg3, "smashed arg");
1448   assert(arg_1 != c_rarg3, "smashed arg");
1449   assert(arg_2 != c_rarg3, "smashed arg");
1450   pass_arg3(this, arg_3);
1451   assert(arg_0 != c_rarg2, "smashed arg");
1452   assert(arg_1 != c_rarg2, "smashed arg");
1453   pass_arg2(this, arg_2);
1454   assert(arg_0 != c_rarg1, "smashed arg");
1455   pass_arg1(this, arg_1);
1456   pass_arg0(this, arg_0);
1457   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1458 }
1459 
1460 void MacroAssembler::null_check(Register reg, int offset) {
1461   if (needs_explicit_null_check(offset)) {
1462     // provoke OS NULL exception if reg = NULL by
1463     // accessing M[reg] w/o changing any registers
1464     // NOTE: this is plenty to provoke a segv
1465     ldr(zr, Address(reg));
1466   } else {
1467     // nothing to do, (later) access of M[reg + offset]
1468     // will provoke OS NULL exception if reg = NULL
1469   }
1470 }
1471 
1472 // MacroAssembler protected routines needed to implement
1473 // public methods
1474 
1475 void MacroAssembler::mov(Register r, Address dest) {
1476   code_section()->relocate(pc(), dest.rspec());
1477   u_int64_t imm64 = (u_int64_t)dest.target();
1478   movptr(r, imm64);
1479 }
1480 
1481 // Move a constant pointer into r.  In AArch64 mode the virtual
1482 // address space is 48 bits in size, so we only need three
1483 // instructions to create a patchable instruction sequence that can
1484 // reach anywhere.
1485 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1486 #ifndef PRODUCT
1487   {
1488     char buffer[64];
1489     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1490     block_comment(buffer);
1491   }
1492 #endif
1493   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1494   movz(r, imm64 & 0xffff);
1495   imm64 >>= 16;
1496   movk(r, imm64 & 0xffff, 16);
1497   imm64 >>= 16;
1498   movk(r, imm64 & 0xffff, 32);
1499 }
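
     // For example (illustrative), movptr(r0, 0x123456789abcUL) emits:
     //   movz x0, #0x9abc
     //   movk x0, #0x5678, lsl #16
     //   movk x0, #0x1234, lsl #32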
1500 
1501 // Macro to mov replicated immediate to vector register.
1502 //  Vd will get the following values for different arrangements in T
1503 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1504 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1505 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1506 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1507 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1508 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1509 //   T1D/T2D: invalid
1510 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1511   assert(T != T1D && T != T2D, "invalid arrangement");
1512   if (T == T8B || T == T16B) {
1513     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1514     movi(Vd, T, imm32 & 0xff, 0);
1515     return;
1516   }
1517   u_int32_t nimm32 = ~imm32;
1518   if (T == T4H || T == T8H) {
1519     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1520     imm32 &= 0xffff;
1521     nimm32 &= 0xffff;
1522   }
1523   u_int32_t x = imm32;
1524   int movi_cnt = 0;
1525   int movn_cnt = 0;
1526   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1527   x = nimm32;
1528   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1529   if (movn_cnt < movi_cnt) imm32 = nimm32;
1530   unsigned lsl = 0;
1531   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1532   if (movn_cnt < movi_cnt)
1533     mvni(Vd, T, imm32 & 0xff, lsl);
1534   else
1535     movi(Vd, T, imm32 & 0xff, lsl);
1536   imm32 >>= 8; lsl += 8;
1537   while (imm32) {
1538     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1539     if (movn_cnt < movi_cnt)
1540       bici(Vd, T, imm32 & 0xff, lsl);
1541     else
1542       orri(Vd, T, imm32 & 0xff, lsl);
1543     lsl += 8; imm32 >>= 8;
1544   }
1545 }
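
     // For example (illustrative):
     //   mov(v0, T4S, 0x00ff0000) emits a single "movi v0.4s, #0xff, lsl #16";
     //   mov(v0, T4S, 0xffff00ff) emits a single "mvni v0.4s, #0xff, lsl #8",
     // because the complement 0x0000ff00 has fewer non-zero bytes.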
1546 
1547 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1548 {
1549 #ifndef PRODUCT
1550   {
1551     char buffer[64];
1552     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1553     block_comment(buffer);
1554   }
1555 #endif
1556   if (operand_valid_for_logical_immediate(false, imm64)) {
1557     orr(dst, zr, imm64);
1558   } else {
1559     // we can use a combination of MOVZ or MOVN with
1560     // MOVK to build up the constant
1561     u_int64_t imm_h[4];
1562     int zero_count = 0;
1563     int neg_count = 0;
1564     int i;
1565     for (i = 0; i < 4; i++) {
1566       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1567       if (imm_h[i] == 0) {
1568         zero_count++;
1569       } else if (imm_h[i] == 0xffffL) {
1570         neg_count++;
1571       }
1572     }
1573     if (zero_count == 4) {
1574       // one MOVZ will do
1575       movz(dst, 0);
1576     } else if (neg_count == 4) {
1577       // one MOVN will do
1578       movn(dst, 0);
1579     } else if (zero_count == 3) {
1580       for (i = 0; i < 4; i++) {
1581         if (imm_h[i] != 0L) {
1582           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1583           break;
1584         }
1585       }
1586     } else if (neg_count == 3) {
1587       // one MOVN will do
1588       for (int i = 0; i < 4; i++) {
1589         if (imm_h[i] != 0xffffL) {
1590           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1591           break;
1592         }
1593       }
1594     } else if (zero_count == 2) {
1595       // one MOVZ and one MOVK will do
1596       for (i = 0; i < 3; i++) {
1597         if (imm_h[i] != 0L) {
1598           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1599           i++;
1600           break;
1601         }
1602       }
1603       for (;i < 4; i++) {
1604         if (imm_h[i] != 0L) {
1605           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1606         }
1607       }
1608     } else if (neg_count == 2) {
1609       // one MOVN and one MOVK will do
1610       for (i = 0; i < 4; i++) {
1611         if (imm_h[i] != 0xffffL) {
1612           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1613           i++;
1614           break;
1615         }
1616       }
1617       for (;i < 4; i++) {
1618         if (imm_h[i] != 0xffffL) {
1619           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1620         }
1621       }
1622     } else if (zero_count == 1) {
1623       // one MOVZ and two MOVKs will do
1624       for (i = 0; i < 4; i++) {
1625         if (imm_h[i] != 0L) {
1626           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1627           i++;
1628           break;
1629         }
1630       }
1631       for (;i < 4; i++) {
1632         if (imm_h[i] != 0x0L) {
1633           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1634         }
1635       }
1636     } else if (neg_count == 1) {
1637       // one MOVN and two MOVKs will do
1638       for (i = 0; i < 4; i++) {
1639         if (imm_h[i] != 0xffffL) {
1640           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1641           i++;
1642           break;
1643         }
1644       }
1645       for (;i < 4; i++) {
1646         if (imm_h[i] != 0xffffL) {
1647           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1648         }
1649       }
1650     } else {
1651       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1652       movz(dst, (u_int32_t)imm_h[0], 0);
1653       for (i = 1; i < 4; i++) {
1654         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1655       }
1656     }
1657   }
1658 }
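
     // For example (illustrative):
     //   mov_immediate64(r0, 0x00000000dead0000UL) has three zero halfwords, so
     //   a single "movz x0, #0xdead, lsl #16" suffices;
     //   mov_immediate64(r0, 0xffffffff0000ffffUL) has three 0xffff halfwords,
     //   so a single "movn x0, #0xffff, lsl #16" suffices.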
1659 
1660 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1661 {
1662 #ifndef PRODUCT
1663     {
1664       char buffer[64];
1665       snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1666       block_comment(buffer);
1667     }
1668 #endif
1669   if (operand_valid_for_logical_immediate(true, imm32)) {
1670     orrw(dst, zr, imm32);
1671   } else {
1672     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1673     // constant
1674     u_int32_t imm_h[2];
1675     imm_h[0] = imm32 & 0xffff;
1676     imm_h[1] = ((imm32 >> 16) & 0xffff);
1677     if (imm_h[0] == 0) {
1678       movzw(dst, imm_h[1], 16);
1679     } else if (imm_h[0] == 0xffff) {
1680       movnw(dst, imm_h[1] ^ 0xffff, 16);
1681     } else if (imm_h[1] == 0) {
1682       movzw(dst, imm_h[0], 0);
1683     } else if (imm_h[1] == 0xffff) {
1684       movnw(dst, imm_h[0] ^ 0xffff, 0);
1685     } else {
1686       // use a MOVZ and MOVK (makes it easier to debug)
1687       movzw(dst, imm_h[0], 0);
1688       movkw(dst, imm_h[1], 16);
1689     }
1690   }
1691 }
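
     // For example (illustrative), mov_immediate32(r0, 0xffff1234) has a high
     // halfword of 0xffff, so a single "movn w0, #0xedcb" (0x1234 ^ 0xffff)
     // suffices.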
1692 
1693 // Form an address from base + offset in Rd.  Rd may or may
1694 // not actually be used: you must use the Address that is returned.
1695 // It is up to you to ensure that the shift provided matches the size
1696 // of your data.
1697 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1698   if (Address::offset_ok_for_immed(byte_offset, shift))
1699     // It fits; no need for any heroics
1700     return Address(base, byte_offset);
1701 
1702   // Don't do anything clever with negative or misaligned offsets
1703   unsigned mask = (1 << shift) - 1;
1704   if (byte_offset < 0 || byte_offset & mask) {
1705     mov(Rd, byte_offset);
1706     add(Rd, base, Rd);
1707     return Address(Rd);
1708   }
1709 
1710   // See if we can do this with two 12-bit offsets
1711   {
1712     unsigned long word_offset = byte_offset >> shift;
1713     unsigned long masked_offset = word_offset & 0xfff000;
1714     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1715         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1716       add(Rd, base, masked_offset << shift);
1717       word_offset -= masked_offset;
1718       return Address(Rd, word_offset << shift);
1719     }
1720   }
1721 
1722   // Do it the hard way
1723   mov(Rd, byte_offset);
1724   add(Rd, base, Rd);
1725   return Address(Rd);
1726 }
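
     // For example (illustrative), with 8-byte data (shift == 3) a byte_offset
     // of 0x20008 does not fit a scaled 12-bit immediate, so form_address emits
     //   add Rd, base, #0x20000
     // and returns Address(Rd, 8).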
1727 
1728 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1729   if (UseLSE) {
1730     mov(tmp, 1);
1731     ldadd(Assembler::word, tmp, zr, counter_addr);
1732     return;
1733   }
1734   Label retry_load;
1735   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1736     prfm(Address(counter_addr), PSTL1STRM);
1737   bind(retry_load);
1738   // flush and load exclusive from the memory location
1739   ldxrw(tmp, counter_addr);
1740   addw(tmp, tmp, 1);
1741   // if we store+flush with no intervening write tmp will be zero
1742   stxrw(tmp2, tmp, counter_addr);
1743   cbnzw(tmp2, retry_load);
1744 }
1745 
1746 
1747 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1748                                     bool want_remainder, Register scratch)
1749 {
1750   // Full implementation of Java idiv and irem.  The function
1751   // returns the (pc) offset of the div instruction - may be needed
1752   // for implicit exceptions.
1753   //
1754   // constraint : ra/rb =/= scratch
1755   //         normal case
1756   //
1757   // input : ra: dividend
1758   //         rb: divisor
1759   //
1760   // result: either
1761   //         quotient  (= ra idiv rb)
1762   //         remainder (= ra irem rb)
1763 
1764   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1765 
1766   int idivl_offset = offset();
1767   if (! want_remainder) {
1768     sdivw(result, ra, rb);
1769   } else {
1770     sdivw(scratch, ra, rb);
1771     Assembler::msubw(result, scratch, rb, ra);
1772   }
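       // msubw above computes result = ra - scratch * rb, i.e. the Java
       // remainder.  For example (illustrative): ra = 7, rb = -3 gives quotient
       // -2 (truncated toward zero) and remainder 7 - (-2 * -3) = 1, as irem
       // requires.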
1773 
1774   return idivl_offset;
1775 }
1776 
1777 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1778                                     bool want_remainder, Register scratch)
1779 {
1780   // Full implementation of Java ldiv and lrem.  The function
1781   // returns the (pc) offset of the div instruction - may be needed
1782   // for implicit exceptions.
1783   //
1784   // constraint : ra/rb =/= scratch
1785   //         normal case
1786   //
1787   // input : ra: dividend
1788   //         rb: divisor
1789   //
1790   // result: either
1791   //         quotient  (= ra idiv rb)
1792   //         remainder (= ra irem rb)
1793 
1794   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1795 
1796   int idivq_offset = offset();
1797   if (! want_remainder) {
1798     sdiv(result, ra, rb);
1799   } else {
1800     sdiv(scratch, ra, rb);
1801     Assembler::msub(result, scratch, rb, ra);
1802   }
1803 
1804   return idivq_offset;
1805 }
1806 
1807 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1808   address prev = pc() - NativeMembar::instruction_size;
1809   address last = code()->last_insn();
1810   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1811     NativeMembar *bar = NativeMembar_at(prev);
1812     // We are merging two memory barrier instructions.  On AArch64 we
1813     // can do this simply by ORing them together.
1814     bar->set_kind(bar->get_kind() | order_constraint);
1815     BLOCK_COMMENT("merged membar");
1816   } else {
1817     code()->set_last_insn(pc());
1818     dmb(Assembler::barrier(order_constraint));
1819   }
1820 }
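
     // For example (illustrative), membar(LoadLoad) immediately followed by
     // membar(StoreStore) is emitted as a single dmb whose kind is the OR of
     // the two constraints, rather than two back-to-back barriers.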
1821 
1822 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1823   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1824     merge_ldst(rt, adr, size_in_bytes, is_store);
1825     code()->clear_last_insn();
1826     return true;
1827   } else {
1828     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
1829     const unsigned mask = size_in_bytes - 1;
1830     if (adr.getMode() == Address::base_plus_offset &&
1831         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1832       code()->set_last_insn(pc());
1833     }
1834     return false;
1835   }
1836 }
1837 
1838 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1839   // We always try to merge two adjacent loads into one ldp.
1840   if (!try_merge_ldst(Rx, adr, 8, false)) {
1841     Assembler::ldr(Rx, adr);
1842   }
1843 }
1844 
1845 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1846   // We always try to merge two adjacent loads into one ldp.
1847   if (!try_merge_ldst(Rw, adr, 4, false)) {
1848     Assembler::ldrw(Rw, adr);
1849   }
1850 }
1851 
1852 void MacroAssembler::str(Register Rx, const Address &adr) {
1853   // We always try to merge two adjacent stores into one stp.
1854   if (!try_merge_ldst(Rx, adr, 8, true)) {
1855     Assembler::str(Rx, adr);
1856   }
1857 }
1858 
1859 void MacroAssembler::strw(Register Rw, const Address &adr) {
1860   // We always try to merge two adjacent stores into one stp.
1861   if (!try_merge_ldst(Rw, adr, 4, true)) {
1862     Assembler::strw(Rw, adr);
1863   }
1864 }
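
     // For example (illustrative), under the merging scheme above the sequence
     //   str(r1, Address(sp, 16));
     //   str(r2, Address(sp, 24));
     // is emitted as a single "stp x1, x2, [sp, #16]", provided the offset and
     // alignment checks in ldst_can_merge() pass.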
1865 
1866 // MacroAssembler routines found actually to be needed
1867 
1868 void MacroAssembler::push(Register src)
1869 {
1870   str(src, Address(pre(esp, -1 * wordSize)));
1871 }
1872 
1873 void MacroAssembler::pop(Register dst)
1874 {
1875   ldr(dst, Address(post(esp, 1 * wordSize)));
1876 }
1877 
1878 // Note: load_unsigned_short used to be called load_unsigned_word.
1879 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1880   int off = offset();
1881   ldrh(dst, src);
1882   return off;
1883 }
1884 
1885 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1886   int off = offset();
1887   ldrb(dst, src);
1888   return off;
1889 }
1890 
1891 int MacroAssembler::load_signed_short(Register dst, Address src) {
1892   int off = offset();
1893   ldrsh(dst, src);
1894   return off;
1895 }
1896 
1897 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1898   int off = offset();
1899   ldrsb(dst, src);
1900   return off;
1901 }
1902 
1903 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1904   int off = offset();
1905   ldrshw(dst, src);
1906   return off;
1907 }
1908 
1909 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1910   int off = offset();
1911   ldrsbw(dst, src);
1912   return off;
1913 }
1914 
1915 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1916   switch (size_in_bytes) {
1917   case  8:  ldr(dst, src); break;
1918   case  4:  ldrw(dst, src); break;
1919   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1920   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1921   default:  ShouldNotReachHere();
1922   }
1923 }
1924 
1925 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1926   switch (size_in_bytes) {
1927   case  8:  str(src, dst); break;
1928   case  4:  strw(src, dst); break;
1929   case  2:  strh(src, dst); break;
1930   case  1:  strb(src, dst); break;
1931   default:  ShouldNotReachHere();
1932   }
1933 }
1934 
1935 void MacroAssembler::decrementw(Register reg, int value)
1936 {
1937   if (value < 0)  { incrementw(reg, -value);      return; }
1938   if (value == 0) {                               return; }
1939   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1940   /* else */ {
1941     guarantee(reg != rscratch2, "invalid dst for register decrement");
1942     movw(rscratch2, (unsigned)value);
1943     subw(reg, reg, rscratch2);
1944   }
1945 }
1946 
1947 void MacroAssembler::decrement(Register reg, int value)
1948 {
1949   if (value < 0)  { increment(reg, -value);      return; }
1950   if (value == 0) {                              return; }
1951   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1952   /* else */ {
1953     assert(reg != rscratch2, "invalid dst for register decrement");
1954     mov(rscratch2, (unsigned long)value);
1955     sub(reg, reg, rscratch2);
1956   }
1957 }
1958 
1959 void MacroAssembler::decrementw(Address dst, int value)
1960 {
1961   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1962   if (dst.getMode() == Address::literal) {
1963     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1964     lea(rscratch2, dst);
1965     dst = Address(rscratch2);
1966   }
1967   ldrw(rscratch1, dst);
1968   decrementw(rscratch1, value);
1969   strw(rscratch1, dst);
1970 }
1971 
1972 void MacroAssembler::decrement(Address dst, int value)
1973 {
1974   assert(!dst.uses(rscratch1), "invalid address for decrement");
1975   if (dst.getMode() == Address::literal) {
1976     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1977     lea(rscratch2, dst);
1978     dst = Address(rscratch2);
1979   }
1980   ldr(rscratch1, dst);
1981   decrement(rscratch1, value);
1982   str(rscratch1, dst);
1983 }
1984 
1985 void MacroAssembler::incrementw(Register reg, int value)
1986 {
1987   if (value < 0)  { decrementw(reg, -value);      return; }
1988   if (value == 0) {                               return; }
1989   if (value < (1 << 12)) { addw(reg, reg, value); return; }
1990   /* else */ {
1991     assert(reg != rscratch2, "invalid dst for register increment");
1992     movw(rscratch2, (unsigned)value);
1993     addw(reg, reg, rscratch2);
1994   }
1995 }
1996 
1997 void MacroAssembler::increment(Register reg, int value)
1998 {
1999   if (value < 0)  { decrement(reg, -value);      return; }
2000   if (value == 0) {                              return; }
2001   if (value < (1 << 12)) { add(reg, reg, value); return; }
2002   /* else */ {
2003     assert(reg != rscratch2, "invalid dst for register increment");
2004     movw(rscratch2, (unsigned)value);
2005     add(reg, reg, rscratch2);
2006   }
2007 }
2008 
2009 void MacroAssembler::incrementw(Address dst, int value)
2010 {
2011   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2012   if (dst.getMode() == Address::literal) {
2013     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2014     lea(rscratch2, dst);
2015     dst = Address(rscratch2);
2016   }
2017   ldrw(rscratch1, dst);
2018   incrementw(rscratch1, value);
2019   strw(rscratch1, dst);
2020 }
2021 
2022 void MacroAssembler::increment(Address dst, int value)
2023 {
2024   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2025   if (dst.getMode() == Address::literal) {
2026     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2027     lea(rscratch2, dst);
2028     dst = Address(rscratch2);
2029   }
2030   ldr(rscratch1, dst);
2031   increment(rscratch1, value);
2032   str(rscratch1, dst);
2033 }
2034 
2035 
2036 void MacroAssembler::pusha() {
2037   push(0x7fffffff, sp);
2038 }
2039 
2040 void MacroAssembler::popa() {
2041   pop(0x7fffffff, sp);
2042 }
2043 
2044 // Push lots of registers in the bit set supplied.  Don't push sp.
2045 // Return the number of words pushed
2046 int MacroAssembler::push(unsigned int bitset, Register stack) {
2047   int words_pushed = 0;
2048 
2049   // Scan bitset to accumulate register pairs
2050   unsigned char regs[32];
2051   int count = 0;
2052   for (int reg = 0; reg <= 30; reg++) {
2053     if (1 & bitset)
2054       regs[count++] = reg;
2055     bitset >>= 1;
2056   }
2057   regs[count++] = zr->encoding_nocheck();
2058   count &= ~1;  // Only push an even number of regs
2059 
2060   if (count) {
2061     stp(as_Register(regs[0]), as_Register(regs[1]),
2062        Address(pre(stack, -count * wordSize)));
2063     words_pushed += 2;
2064   }
2065   for (int i = 2; i < count; i += 2) {
2066     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2067        Address(stack, i * wordSize));
2068     words_pushed += 2;
2069   }
2070 
2071   assert(words_pushed == count, "oops, pushed != count");
2072 
2073   return count;
2074 }
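
     // For example (illustrative), push(0b111, sp) saves {r0, r1, r2}:
     //   stp x0, x1, [sp, #-32]!   // pre-decrement by four words
     //   stp x2, xzr, [sp, #16]    // the odd count is padded with zr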
2075 
2076 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2077   int words_pushed = 0;
2078 
2079   // Scan bitset to accumulate register pairs
2080   unsigned char regs[32];
2081   int count = 0;
2082   for (int reg = 0; reg <= 30; reg++) {
2083     if (1 & bitset)
2084       regs[count++] = reg;
2085     bitset >>= 1;
2086   }
2087   regs[count++] = zr->encoding_nocheck();
2088   count &= ~1;
2089 
2090   for (int i = 2; i < count; i += 2) {
2091     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2092        Address(stack, i * wordSize));
2093     words_pushed += 2;
2094   }
2095   if (count) {
2096     ldp(as_Register(regs[0]), as_Register(regs[1]),
2097        Address(post(stack, count * wordSize)));
2098     words_pushed += 2;
2099   }
2100 
2101   assert(words_pushed == count, "oops, pushed != count");
2102 
2103   return count;
2104 }
2105 #ifdef ASSERT
2106 void MacroAssembler::verify_heapbase(const char* msg) {
2107 #if 0
2108   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2109   assert (Universe::heap() != NULL, "java heap should be initialized");
2110   if (CheckCompressedOops) {
2111     Label ok;
2112     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2113     cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2114     br(Assembler::EQ, ok);
2115     stop(msg);
2116     bind(ok);
2117     pop(1 << rscratch1->encoding(), sp);
2118   }
2119 #endif
2120 }
2121 #endif
2122 
2123 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2124   Label done, not_weak;
2125   cbz(value, done);           // Use NULL as-is.
2126 
2127   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
2128   tbz(value, 0, not_weak); // Test for jweak tag.
2129 
2130   // Resolve jweak.
2131   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2132                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2133   verify_oop(value);
2134   b(done);
2135 
2136   bind(not_weak);
2137   // Resolve (untagged) jobject.
2138   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2139   verify_oop(value);
2140   bind(done);
2141 }
2142 
2143 void MacroAssembler::stop(const char* msg) {
2144   address ip = pc();
2145   pusha();
2146   mov(c_rarg0, (address)msg);
2147   mov(c_rarg1, (address)ip);
2148   mov(c_rarg2, sp);
2149   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2150   // call(c_rarg3);
2151   blrt(c_rarg3, 3, 0, 1);
2152   hlt(0);
2153 }
2154 
2155 void MacroAssembler::unimplemented(const char* what) {
2156   const char* buf = NULL;
2157   {
2158     ResourceMark rm;
2159     stringStream ss;
2160     ss.print("unimplemented: %s", what);
2161     buf = code_string(ss.as_string());
2162   }
2163   stop(buf);
2164 }
2165 
2166 // If a constant does not fit in an immediate field, generate some
2167 // number of MOV instructions and then perform the operation.
2168 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2169                                            add_sub_imm_insn insn1,
2170                                            add_sub_reg_insn insn2) {
2171   assert(Rd != zr, "Rd = zr and not setting flags?");
2172   if (operand_valid_for_add_sub_immediate((int)imm)) {
2173     (this->*insn1)(Rd, Rn, imm);
2174   } else {
2175     if (uabs(imm) < (1 << 24)) {
2176        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2177        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2178     } else {
2179        assert_different_registers(Rd, Rn);
2180        mov(Rd, (uint64_t)imm);
2181        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2182     }
2183   }
2184 }
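
     // For example (illustrative), an add with imm == 0x123456 cannot use a
     // single immediate, but since 0x123456 < (1 << 24) it is split into
     //   add Rd, Rn, #0x123000
     //   add Rd, Rd, #0x456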
2185 
2186 // Separate version which sets the flags. Optimisations are more restricted
2187 // because we must set the flags correctly.
2188 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2189                                            add_sub_imm_insn insn1,
2190                                            add_sub_reg_insn insn2) {
2191   if (operand_valid_for_add_sub_immediate((int)imm)) {
2192     (this->*insn1)(Rd, Rn, imm);
2193   } else {
2194     assert_different_registers(Rd, Rn);
2195     assert(Rd != zr, "overflow in immediate operand");
2196     mov(Rd, (uint64_t)imm);
2197     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2198   }
2199 }
2200 
2201 
2202 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2203   if (increment.is_register()) {
2204     add(Rd, Rn, increment.as_register());
2205   } else {
2206     add(Rd, Rn, increment.as_constant());
2207   }
2208 }
2209 
2210 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2211   if (increment.is_register()) {
2212     addw(Rd, Rn, increment.as_register());
2213   } else {
2214     addw(Rd, Rn, increment.as_constant());
2215   }
2216 }
2217 
2218 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2219   if (decrement.is_register()) {
2220     sub(Rd, Rn, decrement.as_register());
2221   } else {
2222     sub(Rd, Rn, decrement.as_constant());
2223   }
2224 }
2225 
2226 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2227   if (decrement.is_register()) {
2228     subw(Rd, Rn, decrement.as_register());
2229   } else {
2230     subw(Rd, Rn, decrement.as_constant());
2231   }
2232 }
2233 
2234 void MacroAssembler::reinit_heapbase()
2235 {
2236   if (UseCompressedOops) {
2237     if (Universe::is_fully_initialized()) {
2238       mov(rheapbase, Universe::narrow_ptrs_base());
2239     } else {
2240       lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2241       ldr(rheapbase, Address(rheapbase));
2242     }
2243   }
2244 }
2245 
2246 // this simulates the behaviour of the x86 cmpxchg instruction using a
2247 // load linked/store conditional pair. we use the acquire/release
2248 // versions of these instructions so that we flush pending writes as
2249 // per Java semantics.
2250 
2251 // n.b. the x86 version assumes the old value to be compared against is
2252 // in rax and updates rax with the value located in memory if the
2253 // cmpxchg fails. we supply a register for the old value explicitly
2254 
2255 // the aarch64 load linked/store conditional instructions do not
2256 // accept an offset. so, unlike x86, we must provide a plain register
2257 // to identify the memory word to be compared/exchanged rather than a
2258 // register+offset Address.
2259 
2260 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2261                                 Label &succeed, Label *fail) {
2262   // oldv holds comparison value
2263   // newv holds value to write in exchange
2264   // addr identifies memory word to compare against/update
2265   if (UseLSE) {
2266     mov(tmp, oldv);
2267     casal(Assembler::xword, oldv, newv, addr);
2268     cmp(tmp, oldv);
2269     br(Assembler::EQ, succeed);
2270     membar(AnyAny);
2271   } else {
2272     Label retry_load, nope;
2273     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2274       prfm(Address(addr), PSTL1STRM);
2275     bind(retry_load);
2276     // flush and load exclusive from the memory location
2277     // and fail if it is not what we expect
2278     ldaxr(tmp, addr);
2279     cmp(tmp, oldv);
2280     br(Assembler::NE, nope);
2281     // if we store+flush with no intervening write tmp will be zero
2282     stlxr(tmp, newv, addr);
2283     cbzw(tmp, succeed);
2284     // retry so we only ever return after a load fails to compare
2285     // ensures we don't return a stale value after a failed write.
2286     b(retry_load);
2287     // if the memory word differs we return it in oldv and signal a fail
2288     bind(nope);
2289     membar(AnyAny);
2290     mov(oldv, tmp);
2291   }
2292   if (fail)
2293     b(*fail);
2294 }
2295 
2296 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2297                                         Label &succeed, Label *fail) {
2298   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2299   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2300 }
2301 
2302 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2303                                 Label &succeed, Label *fail) {
2304   // oldv holds comparison value
2305   // newv holds value to write in exchange
2306   // addr identifies memory word to compare against/update
2307   // tmp returns 0/1 for success/failure
2308   if (UseLSE) {
2309     mov(tmp, oldv);
2310     casal(Assembler::word, oldv, newv, addr);
2311     cmp(tmp, oldv);
2312     br(Assembler::EQ, succeed);
2313     membar(AnyAny);
2314   } else {
2315     Label retry_load, nope;
2316     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2317       prfm(Address(addr), PSTL1STRM);
2318     bind(retry_load);
2319     // flush and load exclusive from the memory location
2320     // and fail if it is not what we expect
2321     ldaxrw(tmp, addr);
2322     cmp(tmp, oldv);
2323     br(Assembler::NE, nope);
2324     // if we store+flush with no intervening write tmp will be zero
2325     stlxrw(tmp, newv, addr);
2326     cbzw(tmp, succeed);
2327     // retry so we only ever return after a load fails to compare
2328     // ensures we don't return a stale value after a failed write.
2329     b(retry_load);
2330     // if the memory word differs we return it in oldv and signal a fail
2331     bind(nope);
2332     membar(AnyAny);
2333     mov(oldv, tmp);
2334   }
2335   if (fail)
2336     b(*fail);
2337 }
2338 
2339 // A generic CAS; success or failure is in the EQ flag.  A weak CAS
2340 // doesn't retry and may fail spuriously.  If the old value is wanted,
2341 // pass a register for the result; otherwise pass noreg.
2342 
2343 // Clobbers rscratch1
2344 void MacroAssembler::cmpxchg(Register addr, Register expected,
2345                              Register new_val,
2346                              enum operand_size size,
2347                              bool acquire, bool release,
2348                              bool weak,
2349                              Register result) {
2350   if (result == noreg)  result = rscratch1;
2351   if (UseLSE) {
2352     mov(result, expected);
2353     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2354     cmp(result, expected);
2355   } else {
2356     BLOCK_COMMENT("cmpxchg {");
2357     Label retry_load, done;
2358     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2359       prfm(Address(addr), PSTL1STRM);
2360     bind(retry_load);
2361     load_exclusive(result, addr, size, acquire);
2362     if (size == xword)
2363       cmp(result, expected);
2364     else
2365       cmpw(result, expected);
2366     br(Assembler::NE, done);
2367     store_exclusive(rscratch1, new_val, addr, size, release);
2368     if (weak) {
2369       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2370     } else {
2371       cbnzw(rscratch1, retry_load);
2372     }
2373     bind(done);
2374     BLOCK_COMMENT("} cmpxchg");
2375   }
2376 }
2377 
2378 static bool different(Register a, RegisterOrConstant b, Register c) {
2379   if (b.is_constant())
2380     return a != c;
2381   else
2382     return a != b.as_register() && a != c && b.as_register() != c;
2383 }
2384 
2385 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2386 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2387   if (UseLSE) {                                                         \
2388     prev = prev->is_valid() ? prev : zr;                                \
2389     if (incr.is_register()) {                                           \
2390       AOP(sz, incr.as_register(), prev, addr);                          \
2391     } else {                                                            \
2392       mov(rscratch2, incr.as_constant());                               \
2393       AOP(sz, rscratch2, prev, addr);                                   \
2394     }                                                                   \
2395     return;                                                             \
2396   }                                                                     \
2397   Register result = rscratch2;                                          \
2398   if (prev->is_valid())                                                 \
2399     result = different(prev, incr, addr) ? prev : rscratch2;            \
2400                                                                         \
2401   Label retry_load;                                                     \
2402   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2403     prfm(Address(addr), PSTL1STRM);                                     \
2404   bind(retry_load);                                                     \
2405   LDXR(result, addr);                                                   \
2406   OP(rscratch1, result, incr);                                          \
2407   STXR(rscratch2, rscratch1, addr);                                     \
2408   cbnzw(rscratch2, retry_load);                                         \
2409   if (prev->is_valid() && prev != result) {                             \
2410     IOP(prev, rscratch1, incr);                                         \
2411   }                                                                     \
2412 }
2413 
2414 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2415 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2416 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2417 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2418 
2419 #undef ATOMIC_OP
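
     // For example (illustrative), ATOMIC_OP(add, ...) above defines
     // MacroAssembler::atomic_add(prev, incr, addr), returning the old value
     // in prev.  Without LSE it expands to a classic LL/SC loop:
     //   retry: LDXR(result, addr)
     //          OP(rscratch1, result, incr)
     //          STXR(rscratch2, rscratch1, addr)
     //          cbnzw(rscratch2, retry)
     // With LSE the whole operation is a single ldadd.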
2420 
2421 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2422 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2423   if (UseLSE) {                                                         \
2424     prev = prev->is_valid() ? prev : zr;                                \
2425     AOP(sz, newv, prev, addr);                                          \
2426     return;                                                             \
2427   }                                                                     \
2428   Register result = rscratch2;                                          \
2429   if (prev->is_valid())                                                 \
2430     result = different(prev, newv, addr) ? prev : rscratch2;            \
2431                                                                         \
2432   Label retry_load;                                                     \
2433   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2434     prfm(Address(addr), PSTL1STRM);                                     \
2435   bind(retry_load);                                                     \
2436   LDXR(result, addr);                                                   \
2437   STXR(rscratch1, newv, addr);                                          \
2438   cbnzw(rscratch1, retry_load);                                         \
2439   if (prev->is_valid() && prev != result)                               \
2440     mov(prev, result);                                                  \
2441 }
2442 
2443 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2444 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2445 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2446 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2447 
2448 #undef ATOMIC_XCHG
2449 
2450 #ifndef PRODUCT
2451 extern "C" void findpc(intptr_t x);
2452 #endif
2453 
2454 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2455 {
2456   // In order to get locks to work, we need to fake an in_VM state
2457   if (ShowMessageBoxOnError) {
2458     JavaThread* thread = JavaThread::current();
2459     JavaThreadState saved_state = thread->thread_state();
2460     thread->set_thread_state(_thread_in_vm);
2461 #ifndef PRODUCT
2462     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2463       ttyLocker ttyl;
2464       BytecodeCounter::print();
2465     }
2466 #endif
2467     if (os::message_box(msg, "Execution stopped, print registers?")) {
2468       ttyLocker ttyl;
2469       tty->print_cr(" pc = 0x%016lx", pc);
2470 #ifndef PRODUCT
2471       tty->cr();
2472       findpc(pc);
2473       tty->cr();
2474 #endif
2475       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2476       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2477       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2478       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2479       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2480       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2481       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2482       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2483       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2484       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2485       tty->print_cr("r10 = 0x%016lx", regs[10]);
2486       tty->print_cr("r11 = 0x%016lx", regs[11]);
2487       tty->print_cr("r12 = 0x%016lx", regs[12]);
2488       tty->print_cr("r13 = 0x%016lx", regs[13]);
2489       tty->print_cr("r14 = 0x%016lx", regs[14]);
2490       tty->print_cr("r15 = 0x%016lx", regs[15]);
2491       tty->print_cr("r16 = 0x%016lx", regs[16]);
2492       tty->print_cr("r17 = 0x%016lx", regs[17]);
2493       tty->print_cr("r18 = 0x%016lx", regs[18]);
2494       tty->print_cr("r19 = 0x%016lx", regs[19]);
2495       tty->print_cr("r20 = 0x%016lx", regs[20]);
2496       tty->print_cr("r21 = 0x%016lx", regs[21]);
2497       tty->print_cr("r22 = 0x%016lx", regs[22]);
2498       tty->print_cr("r23 = 0x%016lx", regs[23]);
2499       tty->print_cr("r24 = 0x%016lx", regs[24]);
2500       tty->print_cr("r25 = 0x%016lx", regs[25]);
2501       tty->print_cr("r26 = 0x%016lx", regs[26]);
2502       tty->print_cr("r27 = 0x%016lx", regs[27]);
2503       tty->print_cr("r28 = 0x%016lx", regs[28]);
2504       tty->print_cr("r30 = 0x%016lx", regs[30]);
2505       tty->print_cr("r31 = 0x%016lx", regs[31]);
2506       BREAKPOINT;
2507     }
2508     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2509   } else {
2510     ttyLocker ttyl;
2511     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2512                     msg);
2513     assert(false, "DEBUG MESSAGE: %s", msg);
2514   }
2515 }
2516 
2517 #ifdef BUILTIN_SIM
2518 // routine to generate an x86 prolog for a stub function which
2519 // bootstraps into the generated ARM code which directly follows the
2520 // stub
2521 //
2522 // the argument encodes the number of general and fp registers
2523 // passed by the caller and the calling convention (currently just
2524 // the number of general registers and assumes C argument passing)
2525 
2526 extern "C" {
2527 int aarch64_stub_prolog_size();
2528 void aarch64_stub_prolog();
2529 void aarch64_prolog();
2530 }
2531 
2532 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
2533                                    address *prolog_ptr)
2534 {
2535   int calltype = (((ret_type & 0x3) << 8) |
2536                   ((fp_arg_count & 0xf) << 4) |
2537                   (gp_arg_count & 0xf));
2538 
2539   // the addresses for the x86 to ARM entry code we need to use
2540   address start = pc();
2541   // printf("start = %lx\n", start);
2542   int byteCount =  aarch64_stub_prolog_size();
2543   // printf("byteCount = %x\n", byteCount);
2544   int instructionCount = (byteCount + 3)/ 4;
2545   // printf("instructionCount = %x\n", instructionCount);
2546   for (int i = 0; i < instructionCount; i++) {
2547     nop();
2548   }
2549 
2550   memcpy(start, (void*)aarch64_stub_prolog, byteCount);
2551 
2552   // write the address of the setup routine and the call format at the
2553   // end of the copied code
2554   u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
2555   if (prolog_ptr)
2556     patch_end[-2] = (u_int64_t)prolog_ptr;
2557   patch_end[-1] = calltype;
2558 }
2559 #endif
2560 
2561 void MacroAssembler::push_call_clobbered_registers() {
2562   int step = 4 * wordSize;
2563   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2564   sub(sp, sp, step);
2565   mov(rscratch1, -step);
2566   // Push v0-v7, v16-v31.
2567   for (int i = 31; i>= 4; i -= 4) {
2568     if (i <= v7->encoding() || i >= v16->encoding())
2569       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2570           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2571   }
2572   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2573       as_FloatRegister(3), T1D, Address(sp));
2574 }
2575 
2576 void MacroAssembler::pop_call_clobbered_registers() {
2577   for (int i = 0; i < 32; i += 4) {
2578     if (i <= v7->encoding() || i >= v16->encoding())
2579       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2580           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2581   }
2582 
2583   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2584 }
2585 
2586 void MacroAssembler::push_CPU_state(bool save_vectors) {
2587   int step = (save_vectors ? 8 : 4) * wordSize;
2588   push(0x3fffffff, sp);         // integer registers except lr & sp
2589   mov(rscratch1, -step);
2590   sub(sp, sp, step);
2591   for (int i = 28; i >= 4; i -= 4) {
2592     st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2593         as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2594   }
2595   st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2596 }
2597 
2598 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2599   int step = (restore_vectors ? 8 : 4) * wordSize;
2600   for (int i = 0; i <= 28; i += 4)
2601     ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2602         as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2603   pop(0x3fffffff, sp);         // integer registers except lr & sp
2604 }
2605 
2606 /**
2607  * Helpers for multiply_to_len().
2608  */
2609 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2610                                      Register src1, Register src2) {
2611   adds(dest_lo, dest_lo, src1);
2612   adc(dest_hi, dest_hi, zr);
2613   adds(dest_lo, dest_lo, src2);
2614   adc(final_dest_hi, dest_hi, zr);
2615 }
2616 
2617 // Generate an address from (r + r1 extend offset).  "size" is the
2618 // size of the operand.  The result may be in rscratch2.
2619 Address MacroAssembler::offsetted_address(Register r, Register r1,
2620                                           Address::extend ext, int offset, int size) {
2621   if (offset || (ext.shift() % size != 0)) {
2622     lea(rscratch2, Address(r, r1, ext));
2623     return Address(rscratch2, offset);
2624   } else {
2625     return Address(r, r1, ext);
2626   }
2627 }
2628 
2629 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2630 {
2631   assert(offset >= 0, "spill to negative address?");
2632   // Offset reachable ?
2633   //   Not aligned - 9 bits signed offset
2634   //   Aligned - 12 bits unsigned offset shifted
2635   Register base = sp;
2636   if ((offset & (size-1)) && offset >= (1<<8)) {
2637     add(tmp, base, offset & ((1<<12)-1));
2638     base = tmp;
2639     offset &= -1<<12;
2640   }
2641 
2642   if (offset >= (1<<12) * size) {
2643     add(tmp, base, offset & (((1<<12)-1)<<12));
2644     base = tmp;
2645     offset &= ~(((1<<12)-1)<<12);
2646   }
2647 
2648   return Address(base, offset);
2649 }
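
     // For example (illustrative), spill_address(8, 0x8010, tmp) peels off the
     // high part with "add tmp, sp, #0x8000" and returns Address(tmp, 0x10),
     // which fits the scaled 12-bit unsigned offset form.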
2650 
2651 // Checks whether offset is aligned.
2652 // Returns true if it is, else false.
2653 bool MacroAssembler::merge_alignment_check(Register base,
2654                                            size_t size,
2655                                            long cur_offset,
2656                                            long prev_offset) const {
2657   if (AvoidUnalignedAccesses) {
2658     if (base == sp) {
2659       // Check whether the low offset is aligned to a pair of registers.
2660       long pair_mask = size * 2 - 1;
2661       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2662       return (offset & pair_mask) == 0;
2663     } else { // If base is not sp, we can't guarantee the access is aligned.
2664       return false;
2665     }
2666   } else {
2667     long mask = size - 1;
2668     // Load/store pair instructions only support element-size aligned offsets.
2669     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2670   }
2671 }
2672 
2673 // Checks whether current and previous loads/stores can be merged.
2674 // Returns true if it can be merged, else false.
2675 bool MacroAssembler::ldst_can_merge(Register rt,
2676                                     const Address &adr,
2677                                     size_t cur_size_in_bytes,
2678                                     bool is_store) const {
2679   address prev = pc() - NativeInstruction::instruction_size;
2680   address last = code()->last_insn();
2681 
2682   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2683     return false;
2684   }
2685 
2686   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2687     return false;
2688   }
2689 
2690   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2691   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2692 
2693   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2694   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2695 
2696   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2697     return false;
2698   }
2699 
2700   long max_offset = 63 * prev_size_in_bytes;
2701   long min_offset = -64 * prev_size_in_bytes;
2702 
2703   assert(prev_ldst->is_not_pre_post_index(), "pre-index and post-index modes are not supported for merging.");
2704 
2705   // Only same base can be merged.
2706   if (adr.base() != prev_ldst->base()) {
2707     return false;
2708   }
2709 
2710   long cur_offset = adr.offset();
2711   long prev_offset = prev_ldst->offset();
2712   size_t diff = abs(cur_offset - prev_offset);
2713   if (diff != prev_size_in_bytes) {
2714     return false;
2715   }
2716 
2717   // The following cases cannot be merged:
2718   // ldr x2, [x2, #8]
2719   // ldr x3, [x2, #16]
2720   // or:
2721   // ldr x2, [x3, #8]
2722   // ldr x2, [x3, #16]
2723   // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2724   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2725     return false;
2726   }
2727 
2728   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2729   // Offset range must be in ldp/stp instruction's range.
2730   if (low_offset > max_offset || low_offset < min_offset) {
2731     return false;
2732   }
2733 
2734   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2735     return true;
2736   }
2737 
2738   return false;
2739 }
2740 
2741 // Merge current load/store with previous load/store into ldp/stp.
2742 void MacroAssembler::merge_ldst(Register rt,
2743                                 const Address &adr,
2744                                 size_t cur_size_in_bytes,
2745                                 bool is_store) {
2746 
2747   assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged.");
2748 
2749   Register rt_low, rt_high;
2750   address prev = pc() - NativeInstruction::instruction_size;
2751   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2752 
2753   long offset;
2754 
2755   if (adr.offset() < prev_ldst->offset()) {
2756     offset = adr.offset();
2757     rt_low = rt;
2758     rt_high = prev_ldst->target();
2759   } else {
2760     offset = prev_ldst->offset();
2761     rt_low = prev_ldst->target();
2762     rt_high = rt;
2763   }
2764 
2765   Address adr_p = Address(prev_ldst->base(), offset);
2766   // Overwrite previous generated binary.
2767   code_section()->set_end(prev);
2768 
2769   const int sz = prev_ldst->size_in_bytes();
2770   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
2771   if (!is_store) {
2772     BLOCK_COMMENT("merged ldr pair");
2773     if (sz == 8) {
2774       ldp(rt_low, rt_high, adr_p);
2775     } else {
2776       ldpw(rt_low, rt_high, adr_p);
2777     }
2778   } else {
2779     BLOCK_COMMENT("merged str pair");
2780     if (sz == 8) {
2781       stp(rt_low, rt_high, adr_p);
2782     } else {
2783       stpw(rt_low, rt_high, adr_p);
2784     }
2785   }
2786 }
2787 
2788 /**
2789  * Multiply 64-bit by 64-bit: first loop.
2790  */
2791 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2792                                            Register y, Register y_idx, Register z,
2793                                            Register carry, Register product,
2794                                            Register idx, Register kdx) {
2795   //
2796   //  jlong carry, x[], y[], z[];
2797   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2798   //    huge_128 product = y[idx] * x[xstart] + carry;
2799   //    z[kdx] = (jlong)product;
2800   //    carry  = (jlong)(product >>> 64);
2801   //  }
2802   //  z[xstart] = carry;
2803   //
2804 
2805   Label L_first_loop, L_first_loop_exit;
2806   Label L_one_x, L_one_y, L_multiply;
2807 
2808   subsw(xstart, xstart, 1);
2809   br(Assembler::MI, L_one_x);
2810 
2811   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2812   ldr(x_xstart, Address(rscratch1));
2813   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2814 
2815   bind(L_first_loop);
2816   subsw(idx, idx, 1);
2817   br(Assembler::MI, L_first_loop_exit);
2818   subsw(idx, idx, 1);
2819   br(Assembler::MI, L_one_y);
2820   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2821   ldr(y_idx, Address(rscratch1));
2822   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2823   bind(L_multiply);
2824 
2825   // AArch64 has a multiply-accumulate instruction that we can't use
2826   // here because it has no way to process carries, so we have to use
2827   // separate add and adc instructions.  Bah.
2828   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2829   mul(product, x_xstart, y_idx);
2830   adds(product, product, carry);
2831   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2832 
2833   subw(kdx, kdx, 2);
2834   ror(product, product, 32); // back to big-endian
2835   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2836 
2837   b(L_first_loop);
2838 
2839   bind(L_one_y);
2840   ldrw(y_idx, Address(y,  0));
2841   b(L_multiply);
2842 
2843   bind(L_one_x);
2844   ldrw(x_xstart, Address(x,  0));
2845   b(L_first_loop);
2846 
2847   bind(L_first_loop_exit);
2848 }
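
// A note on the ror-by-32 "endian" trick used above (informal): BigInteger
// magnitudes are int[] digits with the most significant digit first, so an
// 8-byte load of two adjacent digits on little-endian AArch64 yields
//   (jlong(x[i+1]) << 32) | x[i]
// Rotating by 32 swaps the halves to
//   (jlong(x[i]) << 32) | x[i+1]
// which is the arithmetic 64-bit value of the two digits.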
2849 
2850 /**
2851  * Multiply 128-bit by 128-bit. Unrolled inner loop.
2852  *
2853  */
2854 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2855                                              Register carry, Register carry2,
2856                                              Register idx, Register jdx,
2857                                              Register yz_idx1, Register yz_idx2,
2858                                              Register tmp, Register tmp3, Register tmp4,
2859                                              Register tmp6, Register product_hi) {
2860 
2861   //   jlong carry, x[], y[], z[];
2862   //   int kdx = ystart+1;
2863   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2864   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2865   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2866   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2867   //     carry  = (jlong)(tmp4 >>> 64);
2868   //     z[kdx+idx+1] = (jlong)tmp3;
2869   //     z[kdx+idx] = (jlong)tmp4;
2870   //   }
2871   //   idx += 2;
2872   //   if (idx > 0) {
2873   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2874   //     z[kdx+idx] = (jlong)yz_idx1;
2875   //     carry  = (jlong)(yz_idx1 >>> 64);
2876   //   }
2877   //
2878 
2879   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2880 
2881   lsrw(jdx, idx, 2);
2882 
2883   bind(L_third_loop);
2884 
2885   subsw(jdx, jdx, 1);
2886   br(Assembler::MI, L_third_loop_exit);
2887   subw(idx, idx, 4);
2888 
2889   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2890 
2891   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2892 
2893   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2894 
2895   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2896   ror(yz_idx2, yz_idx2, 32);
2897 
2898   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2899 
2900   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2901   umulh(tmp4, product_hi, yz_idx1);
2902 
2903   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2904   ror(rscratch2, rscratch2, 32);
2905 
2906   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2907   umulh(carry2, product_hi, yz_idx2);
2908 
2909   // propagate sum of both multiplications into carry:tmp4:tmp3
2910   adds(tmp3, tmp3, carry);
2911   adc(tmp4, tmp4, zr);
2912   adds(tmp3, tmp3, rscratch1);
2913   adcs(tmp4, tmp4, tmp);
2914   adc(carry, carry2, zr);
2915   adds(tmp4, tmp4, rscratch2);
2916   adc(carry, carry, zr);
2917 
2918   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2919   ror(tmp4, tmp4, 32);
2920   stp(tmp4, tmp3, Address(tmp6, 0));
2921 
2922   b(L_third_loop);
2923   bind (L_third_loop_exit);
2924 
2925   andw (idx, idx, 0x3);
2926   cbz(idx, L_post_third_loop_done);
2927 
2928   Label L_check_1;
2929   subsw(idx, idx, 2);
2930   br(Assembler::MI, L_check_1);
2931 
2932   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2933   ldr(yz_idx1, Address(rscratch1, 0));
2934   ror(yz_idx1, yz_idx1, 32);
2935   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2936   umulh(tmp4, product_hi, yz_idx1);
2937   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2938   ldr(yz_idx2, Address(rscratch1, 0));
2939   ror(yz_idx2, yz_idx2, 32);
2940 
2941   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2942 
2943   ror(tmp3, tmp3, 32);
2944   str(tmp3, Address(rscratch1, 0));
2945 
2946   bind (L_check_1);
2947 
2948   andw (idx, idx, 0x1);
2949   subsw(idx, idx, 1);
2950   br(Assembler::MI, L_post_third_loop_done);
2951   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2952   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
2953   umulh(carry2, tmp4, product_hi);
2954   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2955 
2956   add2_with_carry(carry2, tmp3, tmp4, carry);
2957 
2958   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2959   extr(carry, carry2, tmp3, 32);
2960 
2961   bind(L_post_third_loop_done);
2962 }
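
// Why the carry chain above cannot overflow (a sketch): each step computes
//   huge_128 t = y[i] * product_hi + z[k] + carry
// with every 64-bit operand at most 2^64 - 1, so
//   t <= (2^64 - 1)^2 + 2 * (2^64 - 1) = 2^128 - 1
// and the result always fits in a 128-bit register pair; the final adc into
// carry therefore never carries out itself.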
2963 
2964 /**
2965  * Code for BigInteger::multiplyToLen() intrinsic.
2966  *
2967  * r0: x
2968  * r1: xlen
2969  * r2: y
2970  * r3: ylen
2971  * r4: z
2972  * r5: zlen
2973  * r10: tmp1
2974  * r11: tmp2
2975  * r12: tmp3
2976  * r13: tmp4
2977  * r14: tmp5
2978  * r15: tmp6
2979  * r16: tmp7
2980  *
2981  */
2982 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
2983                                      Register z, Register zlen,
2984                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
2985                                      Register tmp5, Register tmp6, Register product_hi) {
2986 
2987   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2988 
2989   const Register idx = tmp1;
2990   const Register kdx = tmp2;
2991   const Register xstart = tmp3;
2992 
2993   const Register y_idx = tmp4;
2994   const Register carry = tmp5;
2995   const Register product  = xlen;
2996   const Register x_xstart = zlen;  // reuse register
2997 
2998   // First Loop.
2999   //
3000   //  final static long LONG_MASK = 0xffffffffL;
3001   //  int xstart = xlen - 1;
3002   //  int ystart = ylen - 1;
3003   //  long carry = 0;
3004   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3005   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3006   //    z[kdx] = (int)product;
3007   //    carry = product >>> 32;
3008   //  }
3009   //  z[xstart] = (int)carry;
3010   //
3011 
3012   movw(idx, ylen);      // idx = ylen;
3013   movw(kdx, zlen);      // kdx = xlen+ylen;
3014   mov(carry, zr);       // carry = 0;
3015 
3016   Label L_done;
3017 
3018   movw(xstart, xlen);
3019   subsw(xstart, xstart, 1);
3020   br(Assembler::MI, L_done);
3021 
3022   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3023 
3024   Label L_second_loop;
3025   cbzw(kdx, L_second_loop);
3026 
3027   Label L_carry;
3028   subw(kdx, kdx, 1);
3029   cbzw(kdx, L_carry);
3030 
3031   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3032   lsr(carry, carry, 32);
3033   subw(kdx, kdx, 1);
3034 
3035   bind(L_carry);
3036   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3037 
3038   // Second and third (nested) loops.
3039   //
3040   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3041   //   carry = 0;
3042   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3043   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3044   //                    (z[k] & LONG_MASK) + carry;
3045   //     z[k] = (int)product;
3046   //     carry = product >>> 32;
3047   //   }
3048   //   z[i] = (int)carry;
3049   // }
3050   //
3051   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3052 
3053   const Register jdx = tmp1;
3054 
3055   bind(L_second_loop);
3056   mov(carry, zr);                // carry = 0;
3057   movw(jdx, ylen);               // j = ystart+1
3058 
3059   subsw(xstart, xstart, 1);      // i = xstart-1;
3060   br(Assembler::MI, L_done);
3061 
3062   str(z, Address(pre(sp, -4 * wordSize)));
3063 
3064   Label L_last_x;
3065   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3066   subsw(xstart, xstart, 1);       // i = xstart-1;
3067   br(Assembler::MI, L_last_x);
3068 
3069   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3070   ldr(product_hi, Address(rscratch1));
3071   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3072 
3073   Label L_third_loop_prologue;
3074   bind(L_third_loop_prologue);
3075 
3076   str(ylen, Address(sp, wordSize));
3077   stp(x, xstart, Address(sp, 2 * wordSize));
3078   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3079                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3080   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3081   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3082 
3083   addw(tmp3, xlen, 1);
3084   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3085   subsw(tmp3, tmp3, 1);
3086   br(Assembler::MI, L_done);
3087 
3088   lsr(carry, carry, 32);
3089   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3090   b(L_second_loop);
3091 
3092   // The following infrequently executed code is placed outside the loops.
3093   bind(L_last_x);
3094   ldrw(product_hi, Address(x,  0));
3095   b(L_third_loop_prologue);
3096 
3097   bind(L_done);
3098 }
3099 
3100 // Code for BigInteger::mulAdd intrinsic
3101 // out     = r0
3102 // in      = r1
3103 // offset  = r2  (already out.length-offset)
3104 // len     = r3
3105 // k       = r4
3106 //
3107 // Pseudocode from the Java implementation:
3108 // carry = 0;
3109 // offset = out.length-offset - 1;
3110 // for (int j=len-1; j >= 0; j--) {
3111 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3112 //     out[offset--] = (int)product;
3113 //     carry = product >>> 32;
3114 // }
3115 // return (int)carry;
3116 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3117       Register len, Register k) {
3118     Label LOOP, END;
3119     // pre-loop
3120     cmp(len, zr); // cmp, not cbz/cbnz: using the condition twice means fewer branches
3121     csel(out, zr, out, Assembler::EQ);
3122     br(Assembler::EQ, END);
3123     add(in, in, len, LSL, 2); // in[j+1] address
3124     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3125     mov(out, zr); // used to keep carry now
3126     BIND(LOOP);
3127     ldrw(rscratch1, Address(pre(in, -4)));
3128     madd(rscratch1, rscratch1, k, out);
3129     ldrw(rscratch2, Address(pre(offset, -4)));
3130     add(rscratch1, rscratch1, rscratch2);
3131     strw(rscratch1, Address(offset));
3132     lsr(out, rscratch1, 32);
3133     subs(len, len, 1);
3134     br(Assembler::NE, LOOP);
3135     BIND(END);
3136 }
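
// Overflow note for the loop above (informal): the 64-bit accumulation is
// safe because, with in[j], k, out[offset] and the running carry each at
// most 2^32 - 1,
//   (2^32 - 1) * (2^32 - 1) + (2^32 - 1) + (2^32 - 1) = 2^64 - 1
// so the madd plus the following 32-bit add can never wrap a 64-bit register.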
3137 
3138 /**
3139  * Emits code to update CRC-32 with a byte value according to constants in table
3140  *
3141  * @param [in,out]crc   Register containing the crc.
3142  * @param [in]val       Register containing the byte to fold into the CRC.
3143  * @param [in]table     Register containing the table of crc constants.
3144  *
3145  * uint32_t crc;
3146  * val = crc_table[(val ^ crc) & 0xFF];
3147  * crc = val ^ (crc >> 8);
3148  *
3149  */
3150 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3151   eor(val, val, crc);
3152   andr(val, val, 0xff);
3153   ldrw(val, Address(table, val, Address::lsl(2)));
3154   eor(crc, val, crc, Assembler::LSR, 8);
3155 }
3156 
3157 /**
3158  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3159  *
3160  * @param [in,out]crc   Register containing the crc.
3161  * @param [in]v         Register containing the 32-bit value to fold into the CRC.
3162  * @param [in]table0    Register containing table 0 of crc constants.
3163  * @param [in]table1    Register containing table 1 of crc constants.
3164  * @param [in]table2    Register containing table 2 of crc constants.
3165  * @param [in]table3    Register containing table 3 of crc constants.
3166  *
3167  * uint32_t crc;
3168  *   v = crc ^ v
3169  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3170  *
3171  */
3172 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3173         Register table0, Register table1, Register table2, Register table3,
3174         bool upper) {
3175   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3176   uxtb(tmp, v);
3177   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3178   ubfx(tmp, v, 8, 8);
3179   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3180   eor(crc, crc, tmp);
3181   ubfx(tmp, v, 16, 8);
3182   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3183   eor(crc, crc, tmp);
3184   ubfx(tmp, v, 24, 8);
3185   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3186   eor(crc, crc, tmp);
3187 }
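
// How the four tables relate (a sketch; this assumes the usual slicing-by-4
// table construction, with table0 the classic byte-at-a-time CRC-32 table):
//   table_{n+1}[b] = (table_n[b] >> 8) ^ table0[table_n[b] & 0xff]
// i.e. table_n[b] is the CRC contribution of byte b positioned n bytes
// further from the end of the word, which is what lets the four lookups
// above be XORed together.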
3188 
3189 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3190         Register len, Register tmp0, Register tmp1, Register tmp2,
3191         Register tmp3) {
3192     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3193     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3194 
3195     mvnw(crc, crc);
3196 
3197     subs(len, len, 128);
3198     br(Assembler::GE, CRC_by64_pre);
3199   BIND(CRC_less64);
3200     adds(len, len, 128-32);
3201     br(Assembler::GE, CRC_by32_loop);
3202   BIND(CRC_less32);
3203     adds(len, len, 32-4);
3204     br(Assembler::GE, CRC_by4_loop);
3205     adds(len, len, 4);
3206     br(Assembler::GT, CRC_by1_loop);
3207     b(L_exit);
3208 
3209   BIND(CRC_by32_loop);
3210     ldp(tmp0, tmp1, Address(post(buf, 16)));
3211     subs(len, len, 32);
3212     crc32x(crc, crc, tmp0);
3213     ldr(tmp2, Address(post(buf, 8)));
3214     crc32x(crc, crc, tmp1);
3215     ldr(tmp3, Address(post(buf, 8)));
3216     crc32x(crc, crc, tmp2);
3217     crc32x(crc, crc, tmp3);
3218     br(Assembler::GE, CRC_by32_loop);
3219     cmn(len, 32);
3220     br(Assembler::NE, CRC_less32);
3221     b(L_exit);
3222 
3223   BIND(CRC_by4_loop);
3224     ldrw(tmp0, Address(post(buf, 4)));
3225     subs(len, len, 4);
3226     crc32w(crc, crc, tmp0);
3227     br(Assembler::GE, CRC_by4_loop);
3228     adds(len, len, 4);
3229     br(Assembler::LE, L_exit);
3230   BIND(CRC_by1_loop);
3231     ldrb(tmp0, Address(post(buf, 1)));
3232     subs(len, len, 1);
3233     crc32b(crc, crc, tmp0);
3234     br(Assembler::GT, CRC_by1_loop);
3235     b(L_exit);
3236 
3237   BIND(CRC_by64_pre);
3238     sub(buf, buf, 8);
3239     ldp(tmp0, tmp1, Address(buf, 8));
3240     crc32x(crc, crc, tmp0);
3241     ldr(tmp2, Address(buf, 24));
3242     crc32x(crc, crc, tmp1);
3243     ldr(tmp3, Address(buf, 32));
3244     crc32x(crc, crc, tmp2);
3245     ldr(tmp0, Address(buf, 40));
3246     crc32x(crc, crc, tmp3);
3247     ldr(tmp1, Address(buf, 48));
3248     crc32x(crc, crc, tmp0);
3249     ldr(tmp2, Address(buf, 56));
3250     crc32x(crc, crc, tmp1);
3251     ldr(tmp3, Address(pre(buf, 64)));
3252 
3253     b(CRC_by64_loop);
3254 
3255     align(CodeEntryAlignment);
3256   BIND(CRC_by64_loop);
3257     subs(len, len, 64);
3258     crc32x(crc, crc, tmp2);
3259     ldr(tmp0, Address(buf, 8));
3260     crc32x(crc, crc, tmp3);
3261     ldr(tmp1, Address(buf, 16));
3262     crc32x(crc, crc, tmp0);
3263     ldr(tmp2, Address(buf, 24));
3264     crc32x(crc, crc, tmp1);
3265     ldr(tmp3, Address(buf, 32));
3266     crc32x(crc, crc, tmp2);
3267     ldr(tmp0, Address(buf, 40));
3268     crc32x(crc, crc, tmp3);
3269     ldr(tmp1, Address(buf, 48));
3270     crc32x(crc, crc, tmp0);
3271     ldr(tmp2, Address(buf, 56));
3272     crc32x(crc, crc, tmp1);
3273     ldr(tmp3, Address(pre(buf, 64)));
3274     br(Assembler::GE, CRC_by64_loop);
3275 
3276     // post-loop
3277     crc32x(crc, crc, tmp2);
3278     crc32x(crc, crc, tmp3);
3279 
3280     sub(len, len, 64);
3281     add(buf, buf, 8);
3282     cmn(len, 128);
3283     br(Assembler::NE, CRC_less64);
3284   BIND(L_exit);
3285     mvnw(crc, crc);
3286 }
3287 
3288 /**
3289  * @param crc   register containing existing CRC (32-bit)
3290  * @param buf   register pointing to input byte buffer (byte*)
3291  * @param len   register containing number of bytes
3292  * @param table0..table3 registers that will hold the addresses of the CRC tables
3293  * @param tmp, tmp2, tmp3 scratch registers
3294  */
3295 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3296         Register table0, Register table1, Register table2, Register table3,
3297         Register tmp, Register tmp2, Register tmp3) {
3298   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3299   unsigned long offset;
3300 
3301   if (UseCRC32) {
3302       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3303       return;
3304   }
3305 
3306     mvnw(crc, crc);
3307 
3308     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3309     if (offset) add(table0, table0, offset);
3310     add(table1, table0, 1*256*sizeof(juint));
3311     add(table2, table0, 2*256*sizeof(juint));
3312     add(table3, table0, 3*256*sizeof(juint));
3313 
3314   if (UseNeon) {
3315       cmp(len, 64);
3316       br(Assembler::LT, L_by16);
3317       eor(v16, T16B, v16, v16);
3318 
3319     Label L_fold;
3320 
3321       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3322 
3323       ld1(v0, v1, T2D, post(buf, 32));
3324       ld1r(v4, T2D, post(tmp, 8));
3325       ld1r(v5, T2D, post(tmp, 8));
3326       ld1r(v6, T2D, post(tmp, 8));
3327       ld1r(v7, T2D, post(tmp, 8));
3328       mov(v16, T4S, 0, crc);
3329 
3330       eor(v0, T16B, v0, v16);
3331       sub(len, len, 64);
3332 
3333     BIND(L_fold);
3334       pmull(v22, T8H, v0, v5, T8B);
3335       pmull(v20, T8H, v0, v7, T8B);
3336       pmull(v23, T8H, v0, v4, T8B);
3337       pmull(v21, T8H, v0, v6, T8B);
3338 
3339       pmull2(v18, T8H, v0, v5, T16B);
3340       pmull2(v16, T8H, v0, v7, T16B);
3341       pmull2(v19, T8H, v0, v4, T16B);
3342       pmull2(v17, T8H, v0, v6, T16B);
3343 
3344       uzp1(v24, T8H, v20, v22);
3345       uzp2(v25, T8H, v20, v22);
3346       eor(v20, T16B, v24, v25);
3347 
3348       uzp1(v26, T8H, v16, v18);
3349       uzp2(v27, T8H, v16, v18);
3350       eor(v16, T16B, v26, v27);
3351 
3352       ushll2(v22, T4S, v20, T8H, 8);
3353       ushll(v20, T4S, v20, T4H, 8);
3354 
3355       ushll2(v18, T4S, v16, T8H, 8);
3356       ushll(v16, T4S, v16, T4H, 8);
3357 
3358       eor(v22, T16B, v23, v22);
3359       eor(v18, T16B, v19, v18);
3360       eor(v20, T16B, v21, v20);
3361       eor(v16, T16B, v17, v16);
3362 
3363       uzp1(v17, T2D, v16, v20);
3364       uzp2(v21, T2D, v16, v20);
3365       eor(v17, T16B, v17, v21);
3366 
3367       ushll2(v20, T2D, v17, T4S, 16);
3368       ushll(v16, T2D, v17, T2S, 16);
3369 
3370       eor(v20, T16B, v20, v22);
3371       eor(v16, T16B, v16, v18);
3372 
3373       uzp1(v17, T2D, v20, v16);
3374       uzp2(v21, T2D, v20, v16);
3375       eor(v28, T16B, v17, v21);
3376 
3377       pmull(v22, T8H, v1, v5, T8B);
3378       pmull(v20, T8H, v1, v7, T8B);
3379       pmull(v23, T8H, v1, v4, T8B);
3380       pmull(v21, T8H, v1, v6, T8B);
3381 
3382       pmull2(v18, T8H, v1, v5, T16B);
3383       pmull2(v16, T8H, v1, v7, T16B);
3384       pmull2(v19, T8H, v1, v4, T16B);
3385       pmull2(v17, T8H, v1, v6, T16B);
3386 
3387       ld1(v0, v1, T2D, post(buf, 32));
3388 
3389       uzp1(v24, T8H, v20, v22);
3390       uzp2(v25, T8H, v20, v22);
3391       eor(v20, T16B, v24, v25);
3392 
3393       uzp1(v26, T8H, v16, v18);
3394       uzp2(v27, T8H, v16, v18);
3395       eor(v16, T16B, v26, v27);
3396 
3397       ushll2(v22, T4S, v20, T8H, 8);
3398       ushll(v20, T4S, v20, T4H, 8);
3399 
3400       ushll2(v18, T4S, v16, T8H, 8);
3401       ushll(v16, T4S, v16, T4H, 8);
3402 
3403       eor(v22, T16B, v23, v22);
3404       eor(v18, T16B, v19, v18);
3405       eor(v20, T16B, v21, v20);
3406       eor(v16, T16B, v17, v16);
3407 
3408       uzp1(v17, T2D, v16, v20);
3409       uzp2(v21, T2D, v16, v20);
3410       eor(v16, T16B, v17, v21);
3411 
3412       ushll2(v20, T2D, v16, T4S, 16);
3413       ushll(v16, T2D, v16, T2S, 16);
3414 
3415       eor(v20, T16B, v22, v20);
3416       eor(v16, T16B, v16, v18);
3417 
3418       uzp1(v17, T2D, v20, v16);
3419       uzp2(v21, T2D, v20, v16);
3420       eor(v20, T16B, v17, v21);
3421 
3422       shl(v16, T2D, v28, 1);
3423       shl(v17, T2D, v20, 1);
3424 
3425       eor(v0, T16B, v0, v16);
3426       eor(v1, T16B, v1, v17);
3427 
3428       subs(len, len, 32);
3429       br(Assembler::GE, L_fold);
3430 
3431       mov(crc, 0);
3432       mov(tmp, v0, T1D, 0);
3433       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3434       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3435       mov(tmp, v0, T1D, 1);
3436       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3437       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3438       mov(tmp, v1, T1D, 0);
3439       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3440       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3441       mov(tmp, v1, T1D, 1);
3442       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3443       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3444 
3445       add(len, len, 32);
3446   }
3447 
3448   BIND(L_by16);
3449     subs(len, len, 16);
3450     br(Assembler::GE, L_by16_loop);
3451     adds(len, len, 16-4);
3452     br(Assembler::GE, L_by4_loop);
3453     adds(len, len, 4);
3454     br(Assembler::GT, L_by1_loop);
3455     b(L_exit);
3456 
3457   BIND(L_by4_loop);
3458     ldrw(tmp, Address(post(buf, 4)));
3459     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3460     subs(len, len, 4);
3461     br(Assembler::GE, L_by4_loop);
3462     adds(len, len, 4);
3463     br(Assembler::LE, L_exit);
3464   BIND(L_by1_loop);
3465     subs(len, len, 1);
3466     ldrb(tmp, Address(post(buf, 1)));
3467     update_byte_crc32(crc, tmp, table0);
3468     br(Assembler::GT, L_by1_loop);
3469     b(L_exit);
3470 
3471     align(CodeEntryAlignment);
3472   BIND(L_by16_loop);
3473     subs(len, len, 16);
3474     ldp(tmp, tmp3, Address(post(buf, 16)));
3475     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3476     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3477     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3478     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3479     br(Assembler::GE, L_by16_loop);
3480     adds(len, len, 16-4);
3481     br(Assembler::GE, L_by4_loop);
3482     adds(len, len, 4);
3483     br(Assembler::GT, L_by1_loop);
3484   BIND(L_exit);
3485     mvnw(crc, crc);
3486 }
3487 
3488 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3489         Register len, Register tmp0, Register tmp1, Register tmp2,
3490         Register tmp3) {
3491     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3492     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3493 
3494     subs(len, len, 128);
3495     br(Assembler::GE, CRC_by64_pre);
3496   BIND(CRC_less64);
3497     adds(len, len, 128-32);
3498     br(Assembler::GE, CRC_by32_loop);
3499   BIND(CRC_less32);
3500     adds(len, len, 32-4);
3501     br(Assembler::GE, CRC_by4_loop);
3502     adds(len, len, 4);
3503     br(Assembler::GT, CRC_by1_loop);
3504     b(L_exit);
3505 
3506   BIND(CRC_by32_loop);
3507     ldp(tmp0, tmp1, Address(post(buf, 16)));
3508     subs(len, len, 32);
3509     crc32cx(crc, crc, tmp0);
3510     ldr(tmp2, Address(post(buf, 8)));
3511     crc32cx(crc, crc, tmp1);
3512     ldr(tmp3, Address(post(buf, 8)));
3513     crc32cx(crc, crc, tmp2);
3514     crc32cx(crc, crc, tmp3);
3515     br(Assembler::GE, CRC_by32_loop);
3516     cmn(len, 32);
3517     br(Assembler::NE, CRC_less32);
3518     b(L_exit);
3519 
3520   BIND(CRC_by4_loop);
3521     ldrw(tmp0, Address(post(buf, 4)));
3522     subs(len, len, 4);
3523     crc32cw(crc, crc, tmp0);
3524     br(Assembler::GE, CRC_by4_loop);
3525     adds(len, len, 4);
3526     br(Assembler::LE, L_exit);
3527   BIND(CRC_by1_loop);
3528     ldrb(tmp0, Address(post(buf, 1)));
3529     subs(len, len, 1);
3530     crc32cb(crc, crc, tmp0);
3531     br(Assembler::GT, CRC_by1_loop);
3532     b(L_exit);
3533 
3534   BIND(CRC_by64_pre);
3535     sub(buf, buf, 8);
3536     ldp(tmp0, tmp1, Address(buf, 8));
3537     crc32cx(crc, crc, tmp0);
3538     ldr(tmp2, Address(buf, 24));
3539     crc32cx(crc, crc, tmp1);
3540     ldr(tmp3, Address(buf, 32));
3541     crc32cx(crc, crc, tmp2);
3542     ldr(tmp0, Address(buf, 40));
3543     crc32cx(crc, crc, tmp3);
3544     ldr(tmp1, Address(buf, 48));
3545     crc32cx(crc, crc, tmp0);
3546     ldr(tmp2, Address(buf, 56));
3547     crc32cx(crc, crc, tmp1);
3548     ldr(tmp3, Address(pre(buf, 64)));
3549 
3550     b(CRC_by64_loop);
3551 
3552     align(CodeEntryAlignment);
3553   BIND(CRC_by64_loop);
3554     subs(len, len, 64);
3555     crc32cx(crc, crc, tmp2);
3556     ldr(tmp0, Address(buf, 8));
3557     crc32cx(crc, crc, tmp3);
3558     ldr(tmp1, Address(buf, 16));
3559     crc32cx(crc, crc, tmp0);
3560     ldr(tmp2, Address(buf, 24));
3561     crc32cx(crc, crc, tmp1);
3562     ldr(tmp3, Address(buf, 32));
3563     crc32cx(crc, crc, tmp2);
3564     ldr(tmp0, Address(buf, 40));
3565     crc32cx(crc, crc, tmp3);
3566     ldr(tmp1, Address(buf, 48));
3567     crc32cx(crc, crc, tmp0);
3568     ldr(tmp2, Address(buf, 56));
3569     crc32cx(crc, crc, tmp1);
3570     ldr(tmp3, Address(pre(buf, 64)));
3571     br(Assembler::GE, CRC_by64_loop);
3572 
3573     // post-loop
3574     crc32cx(crc, crc, tmp2);
3575     crc32cx(crc, crc, tmp3);
3576 
3577     sub(len, len, 64);
3578     add(buf, buf, 8);
3579     cmn(len, 128);
3580     br(Assembler::NE, CRC_less64);
3581   BIND(L_exit);
3582 }
3583 
3584 /**
3585  * @param crc   register containing existing CRC (32-bit)
3586  * @param buf   register pointing to input byte buffer (byte*)
3587  * @param len   register containing number of bytes
3588  * @param table0..table3 registers that will hold the addresses of the CRC tables
3589  * @param tmp, tmp2, tmp3 scratch registers
3590  */
3591 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3592         Register table0, Register table1, Register table2, Register table3,
3593         Register tmp, Register tmp2, Register tmp3) {
3594   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3595 }
3596 
3597 
3598 SkipIfEqual::SkipIfEqual(
3599     MacroAssembler* masm, const bool* flag_addr, bool value) {
3600   _masm = masm;
3601   unsigned long offset;
3602   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3603   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3604   _masm->cbzw(rscratch1, _label);
3605 }
3606 
3607 SkipIfEqual::~SkipIfEqual() {
3608   _masm->bind(_label);
3609 }
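
// Typical use of SkipIfEqual (a sketch; DTraceMethodProbes is one real flag
// guarded this way elsewhere in HotSpot):
//   {
//     SkipIfEqual skip(masm, &DTraceMethodProbes, false);
//     // code emitted here is skipped at runtime while the flag is false
//   }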
3610 
3611 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3612   Address adr;
3613   switch(dst.getMode()) {
3614   case Address::base_plus_offset:
3615     // This is the expected mode, although we allow all the other
3616     // forms below.
3617     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3618     break;
3619   default:
3620     lea(rscratch2, dst);
3621     adr = Address(rscratch2);
3622     break;
3623   }
3624   ldr(rscratch1, adr);
3625   add(rscratch1, rscratch1, src);
3626   str(rscratch1, adr);
3627 }
3628 
3629 void MacroAssembler::cmpptr(Register src1, Address src2) {
3630   unsigned long offset;
3631   adrp(rscratch1, src2, offset);
3632   ldr(rscratch1, Address(rscratch1, offset));
3633   cmp(src1, rscratch1);
3634 }
3635 
3636 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3637   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3638   bs->obj_equals(this, obj1, obj2);
3639 }
3640 
3641 void MacroAssembler::load_klass(Register dst, Register src) {
3642   if (UseCompressedClassPointers) {
3643     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3644     decode_klass_not_null(dst);
3645   } else {
3646     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3647   }
3648 }
3649 
3650 // ((OopHandle)result).resolve();
3651 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3652   // OopHandle::resolve is an indirection.
3653   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3654 }
3655 
3656 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3657   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3658   ldr(dst, Address(rmethod, Method::const_offset()));
3659   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3660   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3661   ldr(dst, Address(dst, mirror_offset));
3662   resolve_oop_handle(dst, tmp);
3663 }
3664 
3665 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3666   if (UseCompressedClassPointers) {
3667     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3668     if (Universe::narrow_klass_base() == NULL) {
3669       cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
3670       return;
3671     } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3672                && Universe::narrow_klass_shift() == 0) {
3673       // Only the bottom 32 bits matter
3674       cmpw(trial_klass, tmp);
3675       return;
3676     }
3677     decode_klass_not_null(tmp);
3678   } else {
3679     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3680   }
3681   cmp(trial_klass, tmp);
3682 }
3683 
3684 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3685   load_klass(dst, src);
3686   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3687 }
3688 
3689 void MacroAssembler::store_klass(Register dst, Register src) {
3690   // FIXME: Should this be a store release?  Concurrent GCs assume
3691   // klass length is valid if the klass field is not null.
3692   if (UseCompressedClassPointers) {
3693     encode_klass_not_null(src);
3694     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3695   } else {
3696     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3697   }
3698 }
3699 
3700 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3701   if (UseCompressedClassPointers) {
3702     // Store to klass gap in destination
3703     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3704   }
3705 }
3706 
3707 // Algorithm must match CompressedOops::encode.
3708 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3709 #ifdef ASSERT
3710   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3711 #endif
3712   verify_oop(s, "broken oop in encode_heap_oop");
3713   if (Universe::narrow_oop_base() == NULL) {
3714     if (Universe::narrow_oop_shift() != 0) {
3715       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3716       lsr(d, s, LogMinObjAlignmentInBytes);
3717     } else {
3718       mov(d, s);
3719     }
3720   } else {
3721     subs(d, s, rheapbase);
3722     csel(d, d, zr, Assembler::HS);
3723     lsr(d, d, LogMinObjAlignmentInBytes);
3724 
3725     /*  Old algorithm: is this any worse?
3726     Label nonnull;
3727     cbnz(r, nonnull);
3728     sub(r, r, rheapbase);
3729     bind(nonnull);
3730     lsr(r, r, LogMinObjAlignmentInBytes);
3731     */
3732   }
3733 }
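
// Worked example with hypothetical values: narrow_oop_base == 0x800000000,
// shift == 3.  The oop 0x800001000 encodes as
//   (0x800001000 - 0x800000000) >> 3 == 0x200
// and a NULL oop, being below the heap base, fails the HS test, so the csel
// clamps it to zr and it encodes as 0.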
3734 
3735 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3736 #ifdef ASSERT
3737   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3738   if (CheckCompressedOops) {
3739     Label ok;
3740     cbnz(r, ok);
3741     stop("null oop passed to encode_heap_oop_not_null");
3742     bind(ok);
3743   }
3744 #endif
3745   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3746   if (Universe::narrow_oop_base() != NULL) {
3747     sub(r, r, rheapbase);
3748   }
3749   if (Universe::narrow_oop_shift() != 0) {
3750     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3751     lsr(r, r, LogMinObjAlignmentInBytes);
3752   }
3753 }
3754 
3755 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3756 #ifdef ASSERT
3757   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3758   if (CheckCompressedOops) {
3759     Label ok;
3760     cbnz(src, ok);
3761     stop("null oop passed to encode_heap_oop_not_null2");
3762     bind(ok);
3763   }
3764 #endif
3765   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3766 
3767   Register data = src;
3768   if (Universe::narrow_oop_base() != NULL) {
3769     sub(dst, src, rheapbase);
3770     data = dst;
3771   }
3772   if (Universe::narrow_oop_shift() != 0) {
3773     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3774     lsr(dst, data, LogMinObjAlignmentInBytes);
3775     data = dst;
3776   }
3777   if (data == src)
3778     mov(dst, src);
3779 }
3780 
3781 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3782 #ifdef ASSERT
3783   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3784 #endif
3785   if (Universe::narrow_oop_base() == NULL) {
3786     if (Universe::narrow_oop_shift() != 0 || d != s) {
3787       lsl(d, s, Universe::narrow_oop_shift());
3788     }
3789   } else {
3790     Label done;
3791     if (d != s)
3792       mov(d, s);
3793     cbz(s, done);
3794     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3795     bind(done);
3796   }
3797   verify_oop(d, "broken oop in decode_heap_oop");
3798 }
3799 
3800 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3801   assert (UseCompressedOops, "should only be used for compressed headers");
3802   assert (Universe::heap() != NULL, "java heap should be initialized");
3803   // Cannot assert, unverified entry point counts instructions (see .ad file)
3804   // vtableStubs also counts instructions in pd_code_size_limit.
3805   // Also do not verify_oop as this is called by verify_oop.
3806   if (Universe::narrow_oop_shift() != 0) {
3807     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3808     if (Universe::narrow_oop_base() != NULL) {
3809       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3810     } else {
3811       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3812     }
3813   } else {
3814     assert (Universe::narrow_oop_base() == NULL, "sanity");
3815   }
3816 }
3817 
3818 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3819   assert (UseCompressedOops, "should only be used for compressed headers");
3820   assert (Universe::heap() != NULL, "java heap should be initialized");
3821   // Cannot assert, unverified entry point counts instructions (see .ad file)
3822   // vtableStubs also counts instructions in pd_code_size_limit.
3823   // Also do not verify_oop as this is called by verify_oop.
3824   if (Universe::narrow_oop_shift() != 0) {
3825     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3826     if (Universe::narrow_oop_base() != NULL) {
3827       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3828     } else {
3829       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3830     }
3831   } else {
3832     assert (Universe::narrow_oop_base() == NULL, "sanity");
3833     if (dst != src) {
3834       mov(dst, src);
3835     }
3836   }
3837 }
3838 
3839 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3840   if (Universe::narrow_klass_base() == NULL) {
3841     if (Universe::narrow_klass_shift() != 0) {
3842       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3843       lsr(dst, src, LogKlassAlignmentInBytes);
3844     } else {
3845       if (dst != src) mov(dst, src);
3846     }
3847     return;
3848   }
3849 
3850   if (use_XOR_for_compressed_class_base) {
3851     if (Universe::narrow_klass_shift() != 0) {
3852       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3853       lsr(dst, dst, LogKlassAlignmentInBytes);
3854     } else {
3855       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3856     }
3857     return;
3858   }
3859 
3860   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3861       && Universe::narrow_klass_shift() == 0) {
3862     movw(dst, src);
3863     return;
3864   }
3865 
3866 #ifdef ASSERT
3867   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3868 #endif
3869 
3870   Register rbase = dst;
3871   if (dst == src) rbase = rheapbase;
3872   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3873   sub(dst, src, rbase);
3874   if (Universe::narrow_klass_shift() != 0) {
3875     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3876     lsr(dst, dst, LogKlassAlignmentInBytes);
3877   }
3878   if (dst == src) reinit_heapbase();
3879 }
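
// Why the XOR form above is valid (informal): the flag is only set when the
// base's set bits are disjoint from the bits used by the shifted klass
// offset, and for disjoint bit patterns
//   base + x == base | x == base ^ x
// so a single eor both strips the base on encode and restores it on decode.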
3880 
3881 void MacroAssembler::encode_klass_not_null(Register r) {
3882   encode_klass_not_null(r, r);
3883 }
3884 
3885 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3886   Register rbase = dst;
3887   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3888 
3889   if (Universe::narrow_klass_base() == NULL) {
3890     if (Universe::narrow_klass_shift() != 0) {
3891       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3892       lsl(dst, src, LogKlassAlignmentInBytes);
3893     } else {
3894       if (dst != src) mov(dst, src);
3895     }
3896     return;
3897   }
3898 
3899   if (use_XOR_for_compressed_class_base) {
3900     if (Universe::narrow_klass_shift() != 0) {
3901       lsl(dst, src, LogKlassAlignmentInBytes);
3902       eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
3903     } else {
3904       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3905     }
3906     return;
3907   }
3908 
3909   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3910       && Universe::narrow_klass_shift() == 0) {
3911     if (dst != src)
3912       movw(dst, src);
3913     movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
3914     return;
3915   }
3916 
3917   // Cannot assert, unverified entry point counts instructions (see .ad file)
3918   // vtableStubs also counts instructions in pd_code_size_limit.
3919   // Also do not verify_oop as this is called by verify_oop.
3920   if (dst == src) rbase = rheapbase;
3921   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3922   if (Universe::narrow_klass_shift() != 0) {
3923     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3924     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
3925   } else {
3926     add(dst, rbase, src);
3927   }
3928   if (dst == src) reinit_heapbase();
3929 }
3930 
3931 void  MacroAssembler::decode_klass_not_null(Register r) {
3932   decode_klass_not_null(r, r);
3933 }
3934 
3935 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3936 #ifdef ASSERT
3937   {
3938     ThreadInVMfromUnknown tiv;
3939     assert (UseCompressedOops, "should only be used for compressed oops");
3940     assert (Universe::heap() != NULL, "java heap should be initialized");
3941     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3942     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3943   }
3944 #endif
3945   int oop_index = oop_recorder()->find_index(obj);
3946   InstructionMark im(this);
3947   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3948   code_section()->relocate(inst_mark(), rspec);
3949   movz(dst, 0xDEAD, 16);
3950   movk(dst, 0xBEEF);
3951 }
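
// The 0xDEAD/0xBEEF halves above are placeholders: together with the oop
// relocation recorded just before them, they let the runtime later patch
// this movz/movk pair with the real narrow oop bits once it is known.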
3952 
3953 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3954   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3955   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3956   int index = oop_recorder()->find_index(k);
3957   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
3958 
3959   InstructionMark im(this);
3960   RelocationHolder rspec = metadata_Relocation::spec(index);
3961   code_section()->relocate(inst_mark(), rspec);
3962   narrowKlass nk = Klass::encode_klass(k);
3963   movz(dst, (nk >> 16), 16);
3964   movk(dst, nk & 0xffff);
3965 }
3966 
3967 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
3968                                     Register dst, Address src,
3969                                     Register tmp1, Register thread_tmp) {
3970   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3971   decorators = AccessInternal::decorator_fixup(decorators);
3972   bool as_raw = (decorators & AS_RAW) != 0;
3973   if (as_raw) {
3974     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3975   } else {
3976     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3977   }
3978 }
3979 
3980 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
3981                                      Address dst, Register src,
3982                                      Register tmp1, Register thread_tmp) {
3983   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3984   decorators = AccessInternal::decorator_fixup(decorators);
3985   bool as_raw = (decorators & AS_RAW) != 0;
3986   if (as_raw) {
3987     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3988   } else {
3989     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3990   }
3991 }
3992 
3993 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
3994   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
3995   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
3996     decorators |= ACCESS_READ | ACCESS_WRITE;
3997   }
3998   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3999   return bs->resolve(this, decorators, obj);
4000 }
4001 
4002 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4003                                    Register thread_tmp, DecoratorSet decorators) {
4004   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4005 }
4006 
4007 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4008                                             Register thread_tmp, DecoratorSet decorators) {
4009   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4010 }
4011 
4012 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4013                                     Register thread_tmp, DecoratorSet decorators) {
4014   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4015 }
4016 
4017 // Used for storing NULLs.
4018 void MacroAssembler::store_heap_oop_null(Address dst) {
4019   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4020 }
4021 
4022 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4023   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4024   int index = oop_recorder()->allocate_metadata_index(obj);
4025   RelocationHolder rspec = metadata_Relocation::spec(index);
4026   return Address((address)obj, rspec);
4027 }
4028 
4029 // Move an oop into a register.  immediate is true if we want
4030 // immediate instructions, i.e. we are not going to patch this
4031 // instruction while the code is being executed by another thread.  In
4032 // that case we can use move immediates rather than the constant pool.
4033 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4034   int oop_index;
4035   if (obj == NULL) {
4036     oop_index = oop_recorder()->allocate_oop_index(obj);
4037   } else {
4038 #ifdef ASSERT
4039     {
4040       ThreadInVMfromUnknown tiv;
4041       assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
4042     }
4043 #endif
4044     oop_index = oop_recorder()->find_index(obj);
4045   }
4046   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4047   if (! immediate) {
4048     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4049     ldr_constant(dst, Address(dummy, rspec));
4050   } else
4051     mov(dst, Address((address)obj, rspec));
4052 }
4053 
4054 // Move a metadata address into a register.
4055 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4056   int oop_index;
4057   if (obj == NULL) {
4058     oop_index = oop_recorder()->allocate_metadata_index(obj);
4059   } else {
4060     oop_index = oop_recorder()->find_index(obj);
4061   }
4062   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4063   mov(dst, Address((address)obj, rspec));
4064 }
4065 
4066 Address MacroAssembler::constant_oop_address(jobject obj) {
4067 #ifdef ASSERT
4068   {
4069     ThreadInVMfromUnknown tiv;
4070     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4071     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
4072   }
4073 #endif
4074   int oop_index = oop_recorder()->find_index(obj);
4075   return Address((address)obj, oop_Relocation::spec(oop_index));
4076 }
4077 
4078 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4079 void MacroAssembler::tlab_allocate(Register obj,
4080                                    Register var_size_in_bytes,
4081                                    int con_size_in_bytes,
4082                                    Register t1,
4083                                    Register t2,
4084                                    Label& slow_case) {
4085   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4086   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4087 }
4088 
4089 // Defines obj, preserves var_size_in_bytes
4090 void MacroAssembler::eden_allocate(Register obj,
4091                                    Register var_size_in_bytes,
4092                                    int con_size_in_bytes,
4093                                    Register t1,
4094                                    Label& slow_case) {
4095   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4096   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4097 }
4098 
4099 // Zero words; len is in bytes
4100 // Destroys all registers except addr
4101 // len must be a nonzero multiple of wordSize
4102 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4103   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4104 
4105 #ifdef ASSERT
4106   { Label L;
4107     tst(len, BytesPerWord - 1);
4108     br(Assembler::EQ, L);
4109     stop("len is not a multiple of BytesPerWord");
4110     bind(L);
4111   }
4112 #endif
4113 
4114 #ifndef PRODUCT
4115   block_comment("zero memory");
4116 #endif
4117 
4118   Label loop;
4119   Label entry;
4120 
4121 //  Algorithm:
4122 //
4123 //    scratch1 = cnt & 7;
4124 //    cnt -= scratch1;
4125 //    p += scratch1;
4126 //    switch (scratch1) {
4127 //      do {
4128 //        cnt -= 8;
4129 //          p[-8] = 0;
4130 //        case 7:
4131 //          p[-7] = 0;
4132 //        case 6:
4133 //          p[-6] = 0;
4134 //          // ...
4135 //        case 1:
4136 //          p[-1] = 0;
4137 //        case 0:
4138 //          p += 8;
4139 //      } while (cnt);
4140 //    }
4141 
4142   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4143 
4144   lsr(len, len, LogBytesPerWord);
4145   andr(rscratch1, len, unroll - 1);  // rscratch1 = cnt % unroll
4146   sub(len, len, rscratch1);      // cnt -= (cnt % unroll)
4147   // t1 always points to the end of the region we're about to zero
4148   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
4149   adr(rscratch2, entry);
4150   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4151   br(rscratch2);
4152   bind(loop);
4153   sub(len, len, unroll);
4154   for (int i = -unroll; i < 0; i++)
4155     Assembler::str(zr, Address(t1, i * wordSize));
4156   bind(entry);
4157   add(t1, t1, unroll * wordSize);
4158   cbnz(len, loop);
4159 }
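
// Worked example (a sketch): zeroing 11 words gives rscratch1 == 3, so the
// computed adr/sub branch enters the unrolled block three stores before
// `entry`, clearing words 0..2 relative to addr; t1 then advances to the end
// of the next chunk and a single full iteration (len == 8) clears the
// remaining words 3..10.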
4160 
4161 void MacroAssembler::verify_tlab() {
4162 #ifdef ASSERT
4163   if (UseTLAB && VerifyOops) {
4164     Label next, ok;
4165 
4166     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4167 
4168     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4169     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4170     cmp(rscratch2, rscratch1);
4171     br(Assembler::HS, next);
4172     STOP("assert(top >= start)");
4173     should_not_reach_here();
4174 
4175     bind(next);
4176     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4177     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4178     cmp(rscratch2, rscratch1);
4179     br(Assembler::HS, ok);
4180     STOP("assert(top <= end)");
4181     should_not_reach_here();
4182 
4183     bind(ok);
4184     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4185   }
4186 #endif
4187 }
4188 
4189 // Writes to successive pages of the stack, until the given size is reached,
4190 // to check for stack overflow plus shadow pages.  This clobbers tmp.
4191 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4192   assert_different_registers(tmp, size, rscratch1);
4193   mov(tmp, sp);
4194   // Bang stack for total size given plus shadow page size.
4195   // Bang one page at a time because large size can bang beyond yellow and
4196   // red zones.
4197   Label loop;
4198   mov(rscratch1, os::vm_page_size());
4199   bind(loop);
4200   lea(tmp, Address(tmp, -os::vm_page_size()));
4201   subsw(size, size, rscratch1);
4202   str(size, Address(tmp));
4203   br(Assembler::GT, loop);
4204 
4205   // Bang down shadow pages too.
4206   // At this point, (tmp-0) is the last address touched, so don't
4207   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4208   // was post-decremented.)  Skip this address by starting at i=1, and
4209   // touch a few more pages below.  N.B.  It is important to touch all
4210   // the way down to and including i=StackShadowPages.
4211   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
4212     // This could be any sized move, but since it can serve as a debugging
4213     // crumb, the bigger the better.
4214     lea(tmp, Address(tmp, -os::vm_page_size()));
4215     str(size, Address(tmp));
4216   }
4217 }
4218 
4219 
4220 // Move the address of the polling page into dest.
4221 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4222   if (SafepointMechanism::uses_thread_local_poll()) {
4223     ldr(dest, Address(rthread, Thread::polling_page_offset()));
4224   } else {
4225     unsigned long off;
4226     adrp(dest, Address(page, rtype), off);
4227     assert(off == 0, "polling page must be page aligned");
4228   }
4229 }
4230 
4231 // Move the address of the polling page into r, then read the polling
4232 // page.
4233 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4234   get_polling_page(r, page, rtype);
4235   return read_polling_page(r, rtype);
4236 }
4237 
4238 // Read the polling page.  The address of the polling page must
4239 // already be in r.
4240 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4241   InstructionMark im(this);
4242   code_section()->relocate(inst_mark(), rtype);
4243   ldrw(zr, Address(r, 0));
4244   return inst_mark();
4245 }
4246 
4247 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4248   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4249   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4250   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4251   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4252   long offset_low = dest_page - low_page;
4253   long offset_high = dest_page - high_page;
4254 
4255   assert(is_valid_AArch64_address(dest.target()), "bad address");
4256   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4257 
4258   InstructionMark im(this);
4259   code_section()->relocate(inst_mark(), dest.rspec());
4260   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4261   // the code cache, so that if it is relocated we know it will still reach the target
4262   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4263     _adrp(reg1, dest.target());
4264   } else {
4265     unsigned long target = (unsigned long)dest.target();
4266     unsigned long adrp_target
4267       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4268 
4269     _adrp(reg1, (address)adrp_target);
4270     movk(reg1, target >> 32, 32);
4271   }
4272   byte_offset = (unsigned long)dest.target() & 0xfff;
4273 }
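
// A sketch of the out-of-range case above with hypothetical addresses:
// pc == 0x7f53_2040_1000, target == 0x12_3456_7890.  adrp_target keeps the
// target's low 32 bits and the pc's bits 47:32, giving 0x7f53_3456_7890,
// which differs from pc by less than 4GB and is therefore always reachable
// by adrp; the movk then rewrites bits 47:32 to the true value 0x0012.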
4274 
4275 void MacroAssembler::load_byte_map_base(Register reg) {
4276   jbyte *byte_map_base =
4277     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4278 
4279   if (is_valid_AArch64_address((address)byte_map_base)) {
4280     // Strictly speaking the byte_map_base isn't an address at all,
4281     // and it might even be negative.
4282     unsigned long offset;
4283     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4284     // We expect offset to be zero with most collectors.
4285     if (offset != 0) {
4286       add(reg, reg, offset);
4287     }
4288   } else {
4289     mov(reg, (uint64_t)byte_map_base);
4290   }
4291 }
4292 
4293 void MacroAssembler::build_frame(int framesize) {
4294   assert(framesize > 0, "framesize must be > 0");
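       // Three cases, by frame size:
       //  - small  (framesize - 16 < 512): drop sp once, then store fp/lr at
       //    the top of the new frame with a scaled-offset stp;
       //  - medium (framesize - 16 fits a 12-bit immediate): push fp/lr with a
       //    pre-indexed stp, then subtract the remainder from sp;
       //  - large: as above, but materialize the size in rscratch1 first.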
4295   if (framesize < ((1 << 9) + 2 * wordSize)) {
4296     sub(sp, sp, framesize);
4297     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4298     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4299   } else {
4300     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4301     if (PreserveFramePointer) mov(rfp, sp);
4302     if (framesize < ((1 << 12) + 2 * wordSize))
4303       sub(sp, sp, framesize - 2 * wordSize);
4304     else {
4305       mov(rscratch1, framesize - 2 * wordSize);
4306       sub(sp, sp, rscratch1);
4307     }
4308   }
4309 }
4310 
4311 void MacroAssembler::remove_frame(int framesize) {
4312   assert(framesize > 0, "framesize must be > 0");
4313   if (framesize < ((1 << 9) + 2 * wordSize)) {
4314     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4315     add(sp, sp, framesize);
4316   } else {
4317     if (framesize < ((1 << 12) + 2 * wordSize))
4318       add(sp, sp, framesize - 2 * wordSize);
4319     else {
4320       mov(rscratch1, framesize - 2 * wordSize);
4321       add(sp, sp, rscratch1);
4322     }
4323     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4324   }
4325 }
4326 
4327 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4328 
4329 // Search for str1 in str2 and return index or -1
4330 void MacroAssembler::string_indexof(Register str2, Register str1,
4331                                     Register cnt2, Register cnt1,
4332                                     Register tmp1, Register tmp2,
4333                                     Register tmp3, Register tmp4,
4334                                     Register tmp5, Register tmp6,
4335                                     int icnt1, Register result, int ae) {
4336   // NOTE: tmp5, tmp6 can be zr depending on the specific method version
4337   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4338 
4339   Register ch1 = rscratch1;
4340   Register ch2 = rscratch2;
4341   Register cnt1tmp = tmp1;
4342   Register cnt2tmp = tmp2;
4343   Register cnt1_neg = cnt1;
4344   Register cnt2_neg = cnt2;
4345   Register result_tmp = tmp4;
4346 
4347   bool isL = ae == StrIntrinsicNode::LL;
4348 
4349   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4350   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4351   int str1_chr_shift = str1_isL ? 0:1;
4352   int str2_chr_shift = str2_isL ? 0:1;
4353   int str1_chr_size = str1_isL ? 1:2;
4354   int str2_chr_size = str2_isL ? 1:2;
4355   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4356                                       (chr_insn)&MacroAssembler::ldrh;
4357   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4358                                       (chr_insn)&MacroAssembler::ldrh;
4359   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4360   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4361 
4362   // Note, inline_string_indexOf() generates checks:
4363   // if (substr.count > string.count) return -1;
4364   // if (substr.count == 0) return 0;
4365 
4366   // We have two strings, a source string in str2, cnt2 and a pattern string
4367   // in str1, cnt1. Find the first occurrence of the pattern in the source, or return -1.
4368 
4369   // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
4370   // With a small pattern and source we use a linear scan.
4371 
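       // Algorithm selection for the runtime-length case (icnt1 == -1):
       //   cnt1 < 8                         -> inline linear scan below
       //   cnt1 >= 256 or cnt1 >= cnt2 / 4  -> linear-scan stub (the source is
       //                                       too short for BM to pay off)
       //   otherwise                        -> Boyer-Moore-Horspool below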
4372   if (icnt1 == -1) {
4373     sub(result_tmp, cnt2, cnt1);
4374     cmp(cnt1, 8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4375     br(LT, LINEARSEARCH);
4376     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
4377     cmp(cnt1, 256);
4378     lsr(tmp1, cnt2, 2);
4379     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be at least 4 * pattern length for BM
4380     br(GE, LINEARSTUB);
4381   }
4382 
4383 // The Boyer-Moore algorithm is based on the description here:
4384 //
4385 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4386 //
4387 // This describes an algorithm with two shift rules: the 'Bad Character' rule
4388 // and the 'Good Suffix' rule.
4389 //
4390 // These rules are essentially heuristics for how far we can shift the
4391 // pattern along the search string.
4392 //
4393 // The implementation here uses the 'Bad Character' rule only because of the
4394 // complexity of initialisation for the 'Good Suffix' rule.
4395 //
4396 // This is also known as the Boyer-Moore-Horspool algorithm:
4397 //
4398 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4399 //
4400 // This particular implementation has a few Java-specific optimizations.
4401 //
4402 // #define ASIZE 256
4403 //
4404 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4405 //       int i, j;
4406 //       unsigned c;
4407 //       unsigned char bc[ASIZE];
4408 //
4409 //       /* Preprocessing */
4410 //       for (i = 0; i < ASIZE; ++i)
4411 //          bc[i] = m;
4412 //       for (i = 0; i < m - 1; ) {
4413 //          c = x[i];
4414 //          ++i;
4415 //          // c < 256 for a Latin1 string, so no need for a branch
4416 //          #ifdef PATTERN_STRING_IS_LATIN1
4417 //          bc[c] = m - i;
4418 //          #else
4419 //          if (c < ASIZE) bc[c] = m - i;
4420 //          #endif
4421 //       }
4422 //
4423 //       /* Searching */
4424 //       j = 0;
4425 //       while (j <= n - m) {
4426 //          c = y[j+m-1];
4427 //          if (x[m-1] == c)
4428 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
4429 //          if (i < 0) return j;
4430 //          // c < 256 for a Latin1 string, so no need for a branch
4431 //          #ifdef SOURCE_STRING_IS_LATIN1
4432 //          // LL case: (c < 256) is always true. Remove the branch.
4433 //          j += bc[y[j+m-1]];
4434 //          #endif
4435 //          #ifndef PATTERN_STRING_IS_UTF
4436 //          // UU case: need the if (c < ASIZE) check. Skip 1 character if not.
4437 //          if (c < ASIZE)
4438 //            j += bc[y[j+m-1]];
4439 //          else
4440 //            j += 1;
4441 //          #endif
4442 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
4443 //          // UL case: need the if (c < ASIZE) check. Skip <pattern length> if not.
4444 //          if (c < ASIZE)
4445 //            j += bc[y[j+m-1]];
4446 //          else
4447 //            j += m;
4448 //          #endif
4449 //       }
4450 //    }
4451 
4452   if (icnt1 == -1) {
4453     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4454         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4455     Register cnt1end = tmp2;
4456     Register str2end = cnt2;
4457     Register skipch = tmp2;
4458 
4459     // str1 length is >= 8, so we can read at least 1 register for cases when
4460     // UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a
4461     // register for the UL case. We'll re-read the last character in the inner
4462     // pre-loop code to have a single outer pre-loop load.
4463     const int firstStep = isL ? 7 : 3;
4464 
4465     const int ASIZE = 256;
4466     const int STORED_BYTES = 32; // number of bytes stored per instruction
4467     sub(sp, sp, ASIZE);
4468     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4469     mov(ch1, sp);
4470     BIND(BM_INIT_LOOP);
4471       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4472       subs(tmp5, tmp5, 1);
4473       br(GT, BM_INIT_LOOP);
4474 
4475       sub(cnt1tmp, cnt1, 1);
4476       mov(tmp5, str2);
4477       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4478       sub(ch2, cnt1, 1);
4479       mov(tmp3, str1);
4480     BIND(BCLOOP);
4481       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4482       if (!str1_isL) {
4483         cmp(ch1, ASIZE);
4484         br(HS, BCSKIP);
4485       }
4486       strb(ch2, Address(sp, ch1));
4487     BIND(BCSKIP);
4488       subs(ch2, ch2, 1);
4489       br(GT, BCLOOP);
4490 
4491       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4492       if (str1_isL == str2_isL) {
4493         // load last 8 bytes (8 LL / 4 UU symbols)
4494         ldr(tmp6, Address(tmp6, -wordSize));
4495       } else {
4496         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
4497         // convert Latin1 to UTF. We'll have to wait until the load completes,
4498         // but it's still faster than per-character loads+checks
4499         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4500         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4501         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4502         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4503         orr(ch2, ch1, ch2, LSL, 16);
4504         orr(tmp6, tmp6, tmp3, LSL, 48);
4505         orr(tmp6, tmp6, ch2, LSL, 16);
4506       }
4507     BIND(BMLOOPSTR2);
4508       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4509       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4510       if (str1_isL == str2_isL) {
4511         // re-init tmp3. It's free because it's executed in parallel with the
4512         // load above. The alternative is to initialize it before the loop, but
4513         // that would hurt performance on in-order systems with 2 or more ld/st pipelines
4514         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4515       }
4516       if (!isL) { // UU/UL case
4517         lsl(ch2, cnt1tmp, 1); // offset in bytes
4518       }
4519       cmp(tmp3, skipch);
4520       br(NE, BMSKIP);
4521       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4522       mov(ch1, tmp6);
4523       if (isL) {
4524         b(BMLOOPSTR1_AFTER_LOAD);
4525       } else {
4526         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4527         b(BMLOOPSTR1_CMP);
4528       }
4529     BIND(BMLOOPSTR1);
4530       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4531       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4532     BIND(BMLOOPSTR1_AFTER_LOAD);
4533       subs(cnt1tmp, cnt1tmp, 1);
4534       br(LT, BMLOOPSTR1_LASTCMP);
4535     BIND(BMLOOPSTR1_CMP);
4536       cmp(ch1, ch2);
4537       br(EQ, BMLOOPSTR1);
4538     BIND(BMSKIP);
4539       if (!isL) {
4540         // if we've met a UTF symbol while searching for a Latin1 pattern, then
4541         // we can skip cnt1 symbols
4542         if (str1_isL != str2_isL) {
4543           mov(result_tmp, cnt1);
4544         } else {
4545           mov(result_tmp, 1);
4546         }
4547         cmp(skipch, ASIZE);
4548         br(HS, BMADV);
4549       }
4550       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4551     BIND(BMADV);
4552       sub(cnt1tmp, cnt1, 1);
4553       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4554       cmp(str2, str2end);
4555       br(LE, BMLOOPSTR2);
4556       add(sp, sp, ASIZE);
4557       b(NOMATCH);
4558     BIND(BMLOOPSTR1_LASTCMP);
4559       cmp(ch1, ch2);
4560       br(NE, BMSKIP);
4561     BIND(BMMATCH);
4562       sub(result, str2, tmp5);
4563       if (!str2_isL) lsr(result, result, 1);
4564       add(sp, sp, ASIZE);
4565       b(DONE);
4566 
4567     BIND(LINEARSTUB);
4568     cmp(cnt1, 16); // small patterns should still be handled by the simple algorithm
4569     br(LT, LINEAR_MEDIUM);
4570     mov(result, zr);
4571     RuntimeAddress stub = NULL;
4572     if (isL) {
4573       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4574       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4575     } else if (str1_isL) {
4576       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
4577       assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4578     } else {
4579       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4580       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4581     }
4582     trampoline_call(stub);
4583     b(DONE);
4584   }
4585 
4586   BIND(LINEARSEARCH);
4587   {
4588     Label DO1, DO2, DO3;
4589 
4590     Register str2tmp = tmp2;
4591     Register first = tmp3;
4592 
4593     if (icnt1 == -1)
4594     {
4595         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4596 
4597         cmp(cnt1, str1_isL == str2_isL ? 4 : 2);
4598         br(LT, DOSHORT);
4599       BIND(LINEAR_MEDIUM);
4600         (this->*str1_load_1chr)(first, Address(str1));
4601         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4602         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4603         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4604         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4605 
4606       BIND(FIRST_LOOP);
4607         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4608         cmp(first, ch2);
4609         br(EQ, STR1_LOOP);
4610       BIND(STR2_NEXT);
4611         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4612         br(LE, FIRST_LOOP);
4613         b(NOMATCH);
4614 
4615       BIND(STR1_LOOP);
4616         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4617         add(cnt2tmp, cnt2_neg, str2_chr_size);
4618         br(GE, MATCH);
4619 
4620       BIND(STR1_NEXT);
4621         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4622         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4623         cmp(ch1, ch2);
4624         br(NE, STR2_NEXT);
4625         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4626         add(cnt2tmp, cnt2tmp, str2_chr_size);
4627         br(LT, STR1_NEXT);
4628         b(MATCH);
4629 
4630       BIND(DOSHORT);
4631       if (str1_isL == str2_isL) {
4632         cmp(cnt1, 2);
4633         br(LT, DO1);
4634         br(GT, DO3);
4635       }
4636     }
4637 
4638     if (icnt1 == 4) {
4639       Label CH1_LOOP;
4640 
4641         (this->*load_4chr)(ch1, str1);
4642         sub(result_tmp, cnt2, 4);
4643         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4644         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4645 
4646       BIND(CH1_LOOP);
4647         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4648         cmp(ch1, ch2);
4649         br(EQ, MATCH);
4650         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4651         br(LE, CH1_LOOP);
4652         b(NOMATCH);
4653       }
4654 
4655     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4656       Label CH1_LOOP;
4657 
4658       BIND(DO2);
4659         (this->*load_2chr)(ch1, str1);
4660         if (icnt1 == 2) {
4661           sub(result_tmp, cnt2, 2);
4662         }
4663         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4664         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4665       BIND(CH1_LOOP);
4666         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4667         cmp(ch1, ch2);
4668         br(EQ, MATCH);
4669         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4670         br(LE, CH1_LOOP);
4671         b(NOMATCH);
4672     }
4673 
4674     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4675       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4676 
4677       BIND(DO3);
4678         (this->*load_2chr)(first, str1);
4679         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4680         if (icnt1 == 3) {
4681           sub(result_tmp, cnt2, 3);
4682         }
4683         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4684         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4685       BIND(FIRST_LOOP);
4686         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4687         cmpw(first, ch2);
4688         br(EQ, STR1_LOOP);
4689       BIND(STR2_NEXT);
4690         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4691         br(LE, FIRST_LOOP);
4692         b(NOMATCH);
4693 
4694       BIND(STR1_LOOP);
4695         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4696         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4697         cmp(ch1, ch2);
4698         br(NE, STR2_NEXT);
4699         b(MATCH);
4700     }
4701 
4702     if (icnt1 == -1 || icnt1 == 1) {
4703       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4704 
4705       BIND(DO1);
4706         (this->*str1_load_1chr)(ch1, str1);
4707         cmp(cnt2, 8);
4708         br(LT, DO1_SHORT);
4709 
4710         sub(result_tmp, cnt2, 8/str2_chr_size);
4711         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4712         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4713         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4714 
4715         if (str2_isL) {
4716           orr(ch1, ch1, ch1, LSL, 8);
4717         }
4718         orr(ch1, ch1, ch1, LSL, 16);
4719         orr(ch1, ch1, ch1, LSL, 32);
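             // SWAR search: for x = ch2 ^ pattern below, the value
             // (x - 0x01..01) & ~(x | 0x7f..7f) == (x - 0x01..01) & ~x & 0x80..80
             // is nonzero iff some byte (halfword for UTF-16) of x is zero, i.e.
             // iff some character position in the loaded word matches the pattern.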
4720       BIND(CH1_LOOP);
4721         ldr(ch2, Address(str2, cnt2_neg));
4722         eor(ch2, ch1, ch2);
4723         sub(tmp1, ch2, tmp3);
4724         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4725         bics(tmp1, tmp1, tmp2);
4726         br(NE, HAS_ZERO);
4727         adds(cnt2_neg, cnt2_neg, 8);
4728         br(LT, CH1_LOOP);
4729 
4730         cmp(cnt2_neg, 8);
4731         mov(cnt2_neg, 0);
4732         br(LT, CH1_LOOP);
4733         b(NOMATCH);
4734 
4735       BIND(HAS_ZERO);
4736         rev(tmp1, tmp1);
4737         clz(tmp1, tmp1);
4738         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4739         b(MATCH);
4740 
4741       BIND(DO1_SHORT);
4742         mov(result_tmp, cnt2);
4743         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4744         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4745       BIND(DO1_LOOP);
4746         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4747         cmpw(ch1, ch2);
4748         br(EQ, MATCH);
4749         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4750         br(LT, DO1_LOOP);
4751     }
4752   }
4753   BIND(NOMATCH);
4754     mov(result, -1);
4755     b(DONE);
4756   BIND(MATCH);
4757     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4758   BIND(DONE);
4759 }
4760 
4761 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4762 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4763 
4764 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4765                                          Register ch, Register result,
4766                                          Register tmp1, Register tmp2, Register tmp3)
4767 {
4768   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4769   Register cnt1_neg = cnt1;
4770   Register ch1 = rscratch1;
4771   Register result_tmp = rscratch2;
4772 
4773   cmp(cnt1, 4);
4774   br(LT, DO1_SHORT);
4775 
4776   orr(ch, ch, ch, LSL, 16);
4777   orr(ch, ch, ch, LSL, 32);
4778 
4779   sub(cnt1, cnt1, 4);
4780   mov(result_tmp, cnt1);
4781   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4782   sub(cnt1_neg, zr, cnt1, LSL, 1);
4783 
4784   mov(tmp3, 0x0001000100010001);
4785 
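       // The loop below uses the same SWAR zero-detection trick as
       // string_indexof: the bics result is nonzero iff some 16-bit lane of
       // (ch1 ^ ch) is zero, i.e. iff the loaded word contains the sought char.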
4786   BIND(CH1_LOOP);
4787     ldr(ch1, Address(str1, cnt1_neg));
4788     eor(ch1, ch, ch1);
4789     sub(tmp1, ch1, tmp3);
4790     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4791     bics(tmp1, tmp1, tmp2);
4792     br(NE, HAS_ZERO);
4793     adds(cnt1_neg, cnt1_neg, 8);
4794     br(LT, CH1_LOOP);
4795 
4796     cmp(cnt1_neg, 8);
4797     mov(cnt1_neg, 0);
4798     br(LT, CH1_LOOP);
4799     b(NOMATCH);
4800 
4801   BIND(HAS_ZERO);
4802     rev(tmp1, tmp1);
4803     clz(tmp1, tmp1);
4804     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4805     b(MATCH);
4806 
4807   BIND(DO1_SHORT);
4808     mov(result_tmp, cnt1);
4809     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4810     sub(cnt1_neg, zr, cnt1, LSL, 1);
4811   BIND(DO1_LOOP);
4812     ldrh(ch1, Address(str1, cnt1_neg));
4813     cmpw(ch, ch1);
4814     br(EQ, MATCH);
4815     adds(cnt1_neg, cnt1_neg, 2);
4816     br(LT, DO1_LOOP);
4817   BIND(NOMATCH);
4818     mov(result, -1);
4819     b(DONE);
4820   BIND(MATCH);
4821     add(result, result_tmp, cnt1_neg, ASR, 1);
4822   BIND(DONE);
4823 }
4824 
4825 // Compare strings.
4826 void MacroAssembler::string_compare(Register str1, Register str2,
4827     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4828     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4829   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4830       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4831       SHORT_LOOP_START, TAIL_CHECK;
4832 
4833   const int STUB_THRESHOLD = 64 + 8;
4834   bool isLL = ae == StrIntrinsicNode::LL;
4835   bool isLU = ae == StrIntrinsicNode::LU;
4836   bool isUL = ae == StrIntrinsicNode::UL;
4837 
4838   bool str1_isL = isLL || isLU;
4839   bool str2_isL = isLL || isUL;
4840 
4841   int str1_chr_shift = str1_isL ? 0 : 1;
4842   int str2_chr_shift = str2_isL ? 0 : 1;
4843   int str1_chr_size = str1_isL ? 1 : 2;
4844   int str2_chr_size = str2_isL ? 1 : 2;
4845   int minCharsInWord = isLL ? wordSize : wordSize/2;
4846 
4847   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4848   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4849                                       (chr_insn)&MacroAssembler::ldrh;
4850   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4851                                       (chr_insn)&MacroAssembler::ldrh;
4852   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4853                             (uxt_insn)&MacroAssembler::uxthw;
4854 
4855   BLOCK_COMMENT("string_compare {");
4856 
4857   // Bizarrely, the counts are passed in bytes, regardless of whether they
4858   // are L or U strings; however, the result is always in characters.
4859   if (!str1_isL) asrw(cnt1, cnt1, 1);
4860   if (!str2_isL) asrw(cnt2, cnt2, 1);
4861 
4862   // Compute the minimum of the string lengths and save the difference.
4863   subsw(result, cnt1, cnt2);
4864   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4865 
4866   // A very short string
4867   cmpw(cnt2, minCharsInWord);
4868   br(Assembler::LT, SHORT_STRING);
4869 
4870   // Compare longwords
4871   // load first parts of strings and finish initialization while loading
4872   {
4873     if (str1_isL == str2_isL) { // LL or UU
4874       ldr(tmp1, Address(str1));
4875       cmp(str1, str2);
4876       br(Assembler::EQ, DONE);
4877       ldr(tmp2, Address(str2));
4878       cmp(cnt2, STUB_THRESHOLD);
4879       br(GE, STUB);
4880       subsw(cnt2, cnt2, minCharsInWord);
4881       br(EQ, TAIL_CHECK);
4882       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4883       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4884       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4885     } else if (isLU) {
4886       ldrs(vtmp, Address(str1));
4887       cmp(str1, str2);
4888       br(Assembler::EQ, DONE);
4889       ldr(tmp2, Address(str2));
4890       cmp(cnt2, STUB_THRESHOLD);
4891       br(GE, STUB);
4892       subsw(cnt2, cnt2, 4);
4893       br(EQ, TAIL_CHECK);
4894       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4895       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4896       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4897       zip1(vtmp, T8B, vtmp, vtmpZ);
4898       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4899       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4900       add(cnt1, cnt1, 4);
4901       fmovd(tmp1, vtmp);
4902     } else { // UL case
4903       ldr(tmp1, Address(str1));
4904       cmp(str1, str2);
4905       br(Assembler::EQ, DONE);
4906       ldrs(vtmp, Address(str2));
4907       cmp(cnt2, STUB_THRESHOLD);
4908       br(GE, STUB);
4909       subsw(cnt2, cnt2, 4);
4910       br(EQ, TAIL_CHECK);
4911       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4912       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4913       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4914       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4915       zip1(vtmp, T8B, vtmp, vtmpZ);
4916       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4917       add(cnt1, cnt1, 8);
4918       fmovd(tmp2, vtmp);
4919     }
4920     adds(cnt2, cnt2, isUL ? 4 : 8);
4921     br(GE, TAIL);
4922     eor(rscratch2, tmp1, tmp2);
4923     cbnz(rscratch2, DIFFERENCE);
4924     // main loop
4925     bind(NEXT_WORD);
4926     if (str1_isL == str2_isL) {
4927       ldr(tmp1, Address(str1, cnt2));
4928       ldr(tmp2, Address(str2, cnt2));
4929       adds(cnt2, cnt2, 8);
4930     } else if (isLU) {
4931       ldrs(vtmp, Address(str1, cnt1));
4932       ldr(tmp2, Address(str2, cnt2));
4933       add(cnt1, cnt1, 4);
4934       zip1(vtmp, T8B, vtmp, vtmpZ);
4935       fmovd(tmp1, vtmp);
4936       adds(cnt2, cnt2, 8);
4937     } else { // UL
4938       ldrs(vtmp, Address(str2, cnt2));
4939       ldr(tmp1, Address(str1, cnt1));
4940       zip1(vtmp, T8B, vtmp, vtmpZ);
4941       add(cnt1, cnt1, 8);
4942       fmovd(tmp2, vtmp);
4943       adds(cnt2, cnt2, 4);
4944     }
4945     br(GE, TAIL);
4946 
4947     eor(rscratch2, tmp1, tmp2);
4948     cbz(rscratch2, NEXT_WORD);
4949     b(DIFFERENCE);
4950     bind(TAIL);
4951     eor(rscratch2, tmp1, tmp2);
4952     cbnz(rscratch2, DIFFERENCE);
4953     // Last longword.  In the case where length == 4 we compare the
4954     // same longword twice, but that's still faster than another
4955     // conditional branch.
4956     if (str1_isL == str2_isL) {
4957       ldr(tmp1, Address(str1));
4958       ldr(tmp2, Address(str2));
4959     } else if (isLU) {
4960       ldrs(vtmp, Address(str1));
4961       ldr(tmp2, Address(str2));
4962       zip1(vtmp, T8B, vtmp, vtmpZ);
4963       fmovd(tmp1, vtmp);
4964     } else { // UL
4965       ldrs(vtmp, Address(str2));
4966       ldr(tmp1, Address(str1));
4967       zip1(vtmp, T8B, vtmp, vtmpZ);
4968       fmovd(tmp2, vtmp);
4969     }
4970     bind(TAIL_CHECK);
4971     eor(rscratch2, tmp1, tmp2);
4972     cbz(rscratch2, DONE);
4973 
4974     // Find the first different characters in the longwords and
4975     // compute their difference.
4976     bind(DIFFERENCE);
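         // rscratch2 holds tmp1 ^ tmp2.  The string data is little-endian, so
         // rev() moves the first (lowest-addressed) differing byte to the most
         // significant end; clz() then gives its bit position, andr rounds
         // that down to a character boundary, and lsrv aligns both differing
         // characters at bit 0 so they can be zero-extended and subtracted.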
4977     rev(rscratch2, rscratch2);
4978     clz(rscratch2, rscratch2);
4979     andr(rscratch2, rscratch2, isLL ? -8 : -16);
4980     lsrv(tmp1, tmp1, rscratch2);
4981     (this->*ext_chr)(tmp1, tmp1);
4982     lsrv(tmp2, tmp2, rscratch2);
4983     (this->*ext_chr)(tmp2, tmp2);
4984     subw(result, tmp1, tmp2);
4985     b(DONE);
4986   }
4987 
4988   bind(STUB);
4989     RuntimeAddress stub = NULL;
4990     switch(ae) {
4991       case StrIntrinsicNode::LL:
4992         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
4993         break;
4994       case StrIntrinsicNode::UU:
4995         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
4996         break;
4997       case StrIntrinsicNode::LU:
4998         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
4999         break;
5000       case StrIntrinsicNode::UL:
5001         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
5002         break;
5003       default:
5004         ShouldNotReachHere();
5005     }
5006     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
5007     trampoline_call(stub);
5008     b(DONE);
5009 
5010   bind(SHORT_STRING);
5011   // Is the minimum length zero?
5012   cbz(cnt2, DONE);
5013   // Arrange the code so that most branches are taken while loads are in
5014   // flight, and the next characters are loaded while the previous ones compare
5015   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5016   subs(cnt2, cnt2, 1);
5017   br(EQ, SHORT_LAST_INIT);
5018   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5019   b(SHORT_LOOP_START);
5020   bind(SHORT_LOOP);
5021   subs(cnt2, cnt2, 1);
5022   br(EQ, SHORT_LAST);
5023   bind(SHORT_LOOP_START);
5024   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
5025   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
5026   cmp(tmp1, cnt1);
5027   br(NE, SHORT_LOOP_TAIL);
5028   subs(cnt2, cnt2, 1);
5029   br(EQ, SHORT_LAST2);
5030   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5031   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5032   cmp(tmp2, rscratch1);
5033   br(EQ, SHORT_LOOP);
5034   sub(result, tmp2, rscratch1);
5035   b(DONE);
5036   bind(SHORT_LOOP_TAIL);
5037   sub(result, tmp1, cnt1);
5038   b(DONE);
5039   bind(SHORT_LAST2);
5040   cmp(tmp2, rscratch1);
5041   br(EQ, DONE);
5042   sub(result, tmp2, rscratch1);
5043 
5044   b(DONE);
5045   bind(SHORT_LAST_INIT);
5046   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5047   bind(SHORT_LAST);
5048   cmp(tmp1, cnt1);
5049   br(EQ, DONE);
5050   sub(result, tmp1, cnt1);
5051 
5052   bind(DONE);
5053 
5054   BLOCK_COMMENT("} string_compare");
5055 }
5056 
5057 // This method checks whether the provided byte array contains a byte with the highest bit set.
5058 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
5059     // The simple and most common case, a small aligned array that is not at
5060     // the end of a memory page, is handled here. All other cases are in the stub.
5061     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
5062     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5063     assert_different_registers(ary1, len, result);
5064 
5065     cmpw(len, 0);
5066     br(LE, SET_RESULT);
5067     cmpw(len, 4 * wordSize);
5068     br(GE, STUB_LONG); // size > 32 then go to stub
5069 
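         // Page-boundary check: shifting ary1 left by (64 - log2(page_size))
         // leaves only the in-page offset, now in the top bits of the register.
         // Adding the similarly shifted scan length sets the carry flag exactly
         // when offset + 32 >= page_size, i.e. when reading 4 words here could
         // run past the array onto the next page, so we go to the stub instead.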
5070     int shift = 64 - exact_log2(os::vm_page_size());
5071     lsl(rscratch1, ary1, shift);
5072     mov(rscratch2, (size_t)(4 * wordSize) << shift);
5073     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
5074     br(CS, STUB); // at the end of page then go to stub
5075     subs(len, len, wordSize);
5076     br(LT, END);
5077 
5078   BIND(LOOP);
5079     ldr(rscratch1, Address(post(ary1, wordSize)));
5080     tst(rscratch1, UPPER_BIT_MASK);
5081     br(NE, SET_RESULT);
5082     subs(len, len, wordSize);
5083     br(GE, LOOP);
5084     cmpw(len, -wordSize);
5085     br(EQ, SET_RESULT);
5086 
5087   BIND(END);
5088     ldr(result, Address(ary1));
5089     sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5090     lslv(result, result, len);
5091     tst(result, UPPER_BIT_MASK);
5092     b(SET_RESULT);
5093 
5094   BIND(STUB);
5095     RuntimeAddress has_neg =  RuntimeAddress(StubRoutines::aarch64::has_negatives());
5096     assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5097     trampoline_call(has_neg);
5098     b(DONE);
5099 
5100   BIND(STUB_LONG);
5101     RuntimeAddress has_neg_long =  RuntimeAddress(
5102             StubRoutines::aarch64::has_negatives_long());
5103     assert(has_neg_long.target() != NULL, "has_negatives_long stub has not been generated");
5104     trampoline_call(has_neg_long);
5105     b(DONE);
5106 
5107   BIND(SET_RESULT);
5108     cset(result, NE); // set true or false
5109 
5110   BIND(DONE);
5111 }
5112 
5113 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5114                                    Register tmp4, Register tmp5, Register result,
5115                                    Register cnt1, int elem_size) {
5116   Label DONE, SAME;
5117   Register tmp1 = rscratch1;
5118   Register tmp2 = rscratch2;
5119   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5120   int elem_per_word = wordSize/elem_size;
5121   int log_elem_size = exact_log2(elem_size);
5122   int length_offset = arrayOopDesc::length_offset_in_bytes();
5123   int base_offset
5124     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5125   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5126 
5127   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5128   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5129 
5130 #ifndef PRODUCT
5131   {
5132     const char kind = (elem_size == 2) ? 'U' : 'L';
5133     char comment[64];
5134     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5135     BLOCK_COMMENT(comment);
5136   }
5137 #endif
5138 
5139   // if (a1 == a2)
5140   //     return true;
5141   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5142   br(EQ, SAME);
5143 
5144   if (UseSimpleArrayEquals) {
5145     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5146     // if (a1 == null || a2 == null)
5147     //     return false;
5148     // (a1 & a2) == 0 means that some pointer is null or that the two pointer
5149     // values share no set bits (very rare, probably impossible in practice),
5150     // so we can save one branch in most cases
5151     tst(a1, a2);
5152     mov(result, false);
5153     br(EQ, A_MIGHT_BE_NULL);
5154     // if (a1.length != a2.length)
5155     //      return false;
5156     bind(A_IS_NOT_NULL);
5157     ldrw(cnt1, Address(a1, length_offset));
5158     ldrw(cnt2, Address(a2, length_offset));
5159     eorw(tmp5, cnt1, cnt2);
5160     cbnzw(tmp5, DONE);
5161     lea(a1, Address(a1, base_offset));
5162     lea(a2, Address(a2, base_offset));
5163     // Check for short strings, i.e. smaller than wordSize.
5164     subs(cnt1, cnt1, elem_per_word);
5165     br(Assembler::LT, SHORT);
5166     // Main 8 byte comparison loop.
5167     bind(NEXT_WORD); {
5168       ldr(tmp1, Address(post(a1, wordSize)));
5169       ldr(tmp2, Address(post(a2, wordSize)));
5170       subs(cnt1, cnt1, elem_per_word);
5171       eor(tmp5, tmp1, tmp2);
5172       cbnz(tmp5, DONE);
5173     } br(GT, NEXT_WORD);
5174     // Last longword.  In the case where length == 4 we compare the
5175     // same longword twice, but that's still faster than another
5176     // conditional branch.
5177     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5178     // length == 4.
5179     if (log_elem_size > 0)
5180       lsl(cnt1, cnt1, log_elem_size);
5181     ldr(tmp3, Address(a1, cnt1));
5182     ldr(tmp4, Address(a2, cnt1));
5183     eor(tmp5, tmp3, tmp4);
5184     cbnz(tmp5, DONE);
5185     b(SAME);
5186     bind(A_MIGHT_BE_NULL);
5187     // if both a1 and a2 are non-null, proceed with the loads
5188     cbz(a1, DONE);
5189     cbz(a2, DONE);
5190     b(A_IS_NOT_NULL);
5191     bind(SHORT);
5192 
5193     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5194     {
5195       ldrw(tmp1, Address(post(a1, 4)));
5196       ldrw(tmp2, Address(post(a2, 4)));
5197       eorw(tmp5, tmp1, tmp2);
5198       cbnzw(tmp5, DONE);
5199     }
5200     bind(TAIL03);
5201     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5202     {
5203       ldrh(tmp3, Address(post(a1, 2)));
5204       ldrh(tmp4, Address(post(a2, 2)));
5205       eorw(tmp5, tmp3, tmp4);
5206       cbnzw(tmp5, DONE);
5207     }
5208     bind(TAIL01);
5209     if (elem_size == 1) { // Only needed when comparing byte arrays.
5210       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5211       {
5212         ldrb(tmp1, a1);
5213         ldrb(tmp2, a2);
5214         eorw(tmp5, tmp1, tmp2);
5215         cbnzw(tmp5, DONE);
5216       }
5217     }
5218   } else {
5219     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
5220         CSET_EQ, LAST_CHECK;
5221     mov(result, false);
5222     cbz(a1, DONE);
5223     ldrw(cnt1, Address(a1, length_offset));
5224     cbz(a2, DONE);
5225     ldrw(cnt2, Address(a2, length_offset));
5226     // on most CPUs a2 is (surprisingly) still "locked" by the ldrw above, so
5227     // it's faster to perform another branch before comparing a1 and a2
5228     cmp(cnt1, elem_per_word);
5229     br(LE, SHORT); // short or same
5230     ldr(tmp3, Address(pre(a1, base_offset)));
5231     cmp(cnt1, stubBytesThreshold);
5232     br(GE, STUB);
5233     ldr(tmp4, Address(pre(a2, base_offset)));
5234     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
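         // tmp5 = -(cnt1 in bits).  AArch64 variable shifts use the shift
         // amount modulo 64, so the lslv in the TAIL/LAST_CHECK code below
         // discards the high-order bits of the final word that lie beyond the
         // logical end of the arrays and must not affect the comparison.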
5235     cmp(cnt2, cnt1);
5236     br(NE, DONE);
5237 
5238     // Main 16 byte comparison loop with 2 exits
5239     bind(NEXT_DWORD); {
5240       ldr(tmp1, Address(pre(a1, wordSize)));
5241       ldr(tmp2, Address(pre(a2, wordSize)));
5242       subs(cnt1, cnt1, 2 * elem_per_word);
5243       br(LE, TAIL);
5244       eor(tmp4, tmp3, tmp4);
5245       cbnz(tmp4, DONE);
5246       ldr(tmp3, Address(pre(a1, wordSize)));
5247       ldr(tmp4, Address(pre(a2, wordSize)));
5248       cmp(cnt1, elem_per_word);
5249       br(LE, TAIL2);
5250       cmp(tmp1, tmp2);
5251     } br(EQ, NEXT_DWORD);
5252     b(DONE);
5253 
5254     bind(TAIL);
5255     eor(tmp4, tmp3, tmp4);
5256     eor(tmp2, tmp1, tmp2);
5257     lslv(tmp2, tmp2, tmp5);
5258     orr(tmp5, tmp4, tmp2);
5259     cmp(tmp5, zr);
5260     b(CSET_EQ);
5261 
5262     bind(TAIL2);
5263     eor(tmp2, tmp1, tmp2);
5264     cbnz(tmp2, DONE);
5265     b(LAST_CHECK);
5266 
5267     bind(STUB);
5268     ldr(tmp4, Address(pre(a2, base_offset)));
5269     cmp(cnt2, cnt1);
5270     br(NE, DONE);
5271     if (elem_size == 2) { // convert to byte counter
5272       lsl(cnt1, cnt1, 1);
5273     }
5274     eor(tmp5, tmp3, tmp4);
5275     cbnz(tmp5, DONE);
5276     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5277     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5278     trampoline_call(stub);
5279     b(DONE);
5280 
5281     bind(EARLY_OUT);
5282     // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
5283     // so if a2 == null we return false (0); otherwise return true, so we can return a2
5284     mov(result, a2);
5285     b(DONE);
5286     bind(SHORT);
5287     cmp(cnt2, cnt1);
5288     br(NE, DONE);
5289     cbz(cnt1, SAME);
5290     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5291     ldr(tmp3, Address(a1, base_offset));
5292     ldr(tmp4, Address(a2, base_offset));
5293     bind(LAST_CHECK);
5294     eor(tmp4, tmp3, tmp4);
5295     lslv(tmp5, tmp4, tmp5);
5296     cmp(tmp5, zr);
5297     bind(CSET_EQ);
5298     cset(result, EQ);
5299     b(DONE);
5300   }
5301 
5302   bind(SAME);
5303   mov(result, true);
5304   // That's it.
5305   bind(DONE);
5306 
5307   BLOCK_COMMENT("} array_equals");
5308 }
5309 
5310 // Compare Strings
5311 
5312 // For Strings we're passed the address of the first characters in a1
5313 // and a2 and the length in cnt1.
5314 // elem_size is the element size in bytes: either 1 or 2.
5315 // There are two implementations.  For arrays >= 8 bytes, all
5316 // comparisons (including the final one, which may overlap) are
5317 // performed 8 bytes at a time.  For strings < 8 bytes, we compare a
5318 // word, then a halfword, and then a byte.
5319 
5320 void MacroAssembler::string_equals(Register a1, Register a2,
5321                                    Register result, Register cnt1, int elem_size)
5322 {
5323   Label SAME, DONE, SHORT, NEXT_WORD;
5324   Register tmp1 = rscratch1;
5325   Register tmp2 = rscratch2;
5326   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5327 
5328   assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
5329   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5330 
5331 #ifndef PRODUCT
5332   {
5333     const char kind = (elem_size == 2) ? 'U' : 'L';
5334     char comment[64];
5335     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5336     BLOCK_COMMENT(comment);
5337   }
5338 #endif
5339 
5340   mov(result, false);
5341 
5342   // Check for short strings, i.e. smaller than wordSize.
5343   subs(cnt1, cnt1, wordSize);
5344   br(Assembler::LT, SHORT);
5345   // Main 8 byte comparison loop.
5346   bind(NEXT_WORD); {
5347     ldr(tmp1, Address(post(a1, wordSize)));
5348     ldr(tmp2, Address(post(a2, wordSize)));
5349     subs(cnt1, cnt1, wordSize);
5350     eor(tmp1, tmp1, tmp2);
5351     cbnz(tmp1, DONE);
5352   } br(GT, NEXT_WORD);
5353   // Last longword.  In the case where length == 4 we compare the
5354   // same longword twice, but that's still faster than another
5355   // conditional branch.
5356   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5357   // length == 4.
5358   ldr(tmp1, Address(a1, cnt1));
5359   ldr(tmp2, Address(a2, cnt1));
5360   eor(tmp2, tmp1, tmp2);
5361   cbnz(tmp2, DONE);
5362   b(SAME);
5363 
5364   bind(SHORT);
5365   Label TAIL03, TAIL01;
5366 
5367   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5368   {
5369     ldrw(tmp1, Address(post(a1, 4)));
5370     ldrw(tmp2, Address(post(a2, 4)));
5371     eorw(tmp1, tmp1, tmp2);
5372     cbnzw(tmp1, DONE);
5373   }
5374   bind(TAIL03);
5375   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5376   {
5377     ldrh(tmp1, Address(post(a1, 2)));
5378     ldrh(tmp2, Address(post(a2, 2)));
5379     eorw(tmp1, tmp1, tmp2);
5380     cbnzw(tmp1, DONE);
5381   }
5382   bind(TAIL01);
5383   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5384     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5385     {
5386       ldrb(tmp1, a1);
5387       ldrb(tmp2, a2);
5388       eorw(tmp1, tmp1, tmp2);
5389       cbnzw(tmp1, DONE);
5390     }
5391   }
5392   // Arrays are equal.
5393   bind(SAME);
5394   mov(result, true);
5395 
5396   // That's it.
5397   bind(DONE);
5398   BLOCK_COMMENT("} string_equals");
5399 }
5400 
5401 
5402 // The size of the blocks erased by the zero_blocks stub.  We must
5403 // handle anything smaller than this ourselves in zero_words().
5404 const int MacroAssembler::zero_words_block_size = 8;
5405 
5406 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5407 // possible, handling small word counts locally and delegating
5408 // anything larger to the zero_blocks stub.  It is expanded many times
5409 // in compiled code, so it is important to keep it short.
5410 
5411 // ptr:   Address of a buffer to be zeroed.
5412 // cnt:   Count in HeapWords.
5413 //
5414 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5415 void MacroAssembler::zero_words(Register ptr, Register cnt)
5416 {
5417   assert(is_power_of_2(zero_words_block_size), "adjust this");
5418   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5419 
5420   BLOCK_COMMENT("zero_words {");
5421   cmp(cnt, zero_words_block_size);
5422   Label around, done, done16;
5423   br(LO, around);
5424   {
5425     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5426     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5427     if (StubRoutines::aarch64::complete()) {
5428       trampoline_call(zero_blocks);
5429     } else {
5430       bl(zero_blocks);
5431     }
5432   }
5433   bind(around);
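       // Zero the remaining (cnt % zero_words_block_size) words: each tbz
       // tests one bit of cnt and, when it is set, falls through to enough
       // stp(zr, zr) stores for that power of two; a final str handles an
       // odd word.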
5434   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5435     Label l;
5436     tbz(cnt, exact_log2(i), l);
5437     for (int j = 0; j < i; j += 2) {
5438       stp(zr, zr, post(ptr, 16));
5439     }
5440     bind(l);
5441   }
5442   {
5443     Label l;
5444     tbz(cnt, 0, l);
5445     str(zr, Address(ptr));
5446     bind(l);
5447   }
5448   BLOCK_COMMENT("} zero_words");
5449 }
5450 
5451 // base:         Address of a buffer to be zeroed, 8 bytes aligned.
5452 // cnt:          Immediate count in HeapWords.
5453 #define SmallArraySize (18 * BytesPerLong)
5454 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
5455 {
5456   BLOCK_COMMENT("zero_words {");
5457   int i = cnt & 1;  // store any odd word to start
5458   if (i) str(zr, Address(base));
5459 
5460   if (cnt <= SmallArraySize / BytesPerLong) {
5461     for (; i < (int)cnt; i += 2)
5462       stp(zr, zr, Address(base, i * wordSize));
5463   } else {
5464     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
5465     int remainder = cnt % (2 * unroll);
5466     for (; i < remainder; i += 2)
5467       stp(zr, zr, Address(base, i * wordSize));
5468 
5469     Label loop;
5470     Register cnt_reg = rscratch1;
5471     Register loop_base = rscratch2;
5472     cnt = cnt - remainder;
5473     mov(cnt_reg, cnt);
5474     // adjust base and prebias by -2 * wordSize so we can pre-increment
5475     add(loop_base, base, (remainder - 2) * wordSize);
5476     bind(loop);
5477     sub(cnt_reg, cnt_reg, 2 * unroll);
5478     for (i = 1; i < unroll; i++)
5479       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
5480     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
5481     cbnz(cnt_reg, loop);
5482   }
5483   BLOCK_COMMENT("} zero_words");
5484 }
5485 
5486 // Zero blocks of memory by using DC ZVA.
5487 //
5488 // Aligns the base address first sufficiently for DC ZVA, then uses
5489 // DC ZVA repeatedly for every full block.  cnt is the size to be
5490 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5491 // in cnt.
5492 //
5493 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5494 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
5495 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5496   Register tmp = rscratch1;
5497   Register tmp2 = rscratch2;
5498   int zva_length = VM_Version::zva_length();
5499   Label initial_table_end, loop_zva;
5500   Label fini;
5501 
5502   // Base must be 16-byte aligned. If not, just return and let the caller handle it
5503   tst(base, 0x0f);
5504   br(Assembler::NE, fini);
5505   // Align base with ZVA length.
5506   neg(tmp, base);
5507   andr(tmp, tmp, zva_length - 1);
5508 
5509   // tmp: the number of bytes to be filled to align the base with ZVA length.
5510   add(base, base, tmp);
5511   sub(cnt, cnt, tmp, Assembler::ASR, 3);
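       // Computed branch into the stp table below: each stp zeroes 16 bytes
       // and occupies 4 bytes of code, so stepping back (tmp / 16) * 4 ==
       // tmp >> 2 bytes from initial_table_end executes exactly the stores
       // needed to clear the tmp alignment bytes.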
5512   adr(tmp2, initial_table_end);
5513   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5514   br(tmp2);
5515 
5516   for (int i = -zva_length + 16; i < 0; i += 16)
5517     stp(zr, zr, Address(base, i));
5518   bind(initial_table_end);
5519 
5520   sub(cnt, cnt, zva_length >> 3);
5521   bind(loop_zva);
5522   dc(Assembler::ZVA, base);
5523   subs(cnt, cnt, zva_length >> 3);
5524   add(base, base, zva_length);
5525   br(Assembler::GE, loop_zva);
5526   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5527   bind(fini);
5528 }
5529 
5530 // base:   Address of a buffer to be filled, 8 bytes aligned.
5531 // cnt:    Count in 8-byte unit.
5532 // value:  Value to be filled with.
5533 // base will point to the end of the buffer after filling.
5534 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5535 {
5536 //  Algorithm:
5537 //
5538 //    scratch1 = cnt & 7;
5539 //    cnt -= scratch1;
5540 //    p += scratch1;
5541 //    switch (scratch1) {
5542 //      do {
5543 //        cnt -= 8;
5544 //          p[-8] = v;
5545 //        case 7:
5546 //          p[-7] = v;
5547 //        case 6:
5548 //          p[-6] = v;
5549 //          // ...
5550 //        case 1:
5551 //          p[-1] = v;
5552 //        case 0:
5553 //          p += 8;
5554 //      } while (cnt);
5555 //    }
5556 
5557   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5558 
5559   Label fini, skip, entry, loop;
5560   const int unroll = 8; // Number of stp instructions we'll unroll
5561 
5562   cbz(cnt, fini);
5563   tbz(base, 3, skip);
5564   str(value, Address(post(base, 8)));
5565   sub(cnt, cnt, 1);
5566   bind(skip);
5567 
5568   andr(rscratch1, cnt, (unroll-1) * 2);
5569   sub(cnt, cnt, rscratch1);
5570   add(base, base, rscratch1, Assembler::LSL, 3);
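       // Computed branch implementing the switch in the algorithm above: each
       // stp stores two words and occupies 4 bytes of code, so stepping back
       // (rscratch1 / 2) * 4 == rscratch1 * 2 bytes from 'entry' stores
       // exactly the rscratch1 leftover words.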
5571   adr(rscratch2, entry);
5572   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5573   br(rscratch2);
5574 
5575   bind(loop);
5576   add(base, base, unroll * 16);
5577   for (int i = -unroll; i < 0; i++)
5578     stp(value, value, Address(base, i * 16));
5579   bind(entry);
5580   subs(cnt, cnt, unroll * 2);
5581   br(Assembler::GE, loop);
5582 
5583   tbz(cnt, 0, fini);
5584   str(value, Address(post(base, 8)));
5585   bind(fini);
5586 }
5587 
5588 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5589 // java/lang/StringUTF16.compress.
5590 void MacroAssembler::encode_iso_array(Register src, Register dst,
5591                       Register len, Register result,
5592                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5593                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5594 {
5595     Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
5596         NEXT_32_START, NEXT_32_PRFM_START;
5597     Register tmp1 = rscratch1, tmp2 = rscratch2;
5598 
5599       mov(result, len); // Save initial len
5600 
5601 #ifndef BUILTIN_SIM
5602       cmp(len, 8); // handle shortest strings first
5603       br(LT, LOOP_1);
5604       cmp(len, 32);
5605       br(LT, NEXT_8);
5606       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
5607       // to convert chars to bytes
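           // (uzp1 concatenates the even-numbered bytes of its sources, i.e.
           // the low, Latin1, byte of each little-endian char; uzp2 takes the
           // odd-numbered, high, bytes, which must all be zero for the
           // encoding to succeed.)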
5608       if (SoftwarePrefetchHintDistance >= 0) {
5609         ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5610         cmp(len, SoftwarePrefetchHintDistance/2 + 16);
5611         br(LE, NEXT_32_START);
5612         b(NEXT_32_PRFM_START);
5613         BIND(NEXT_32_PRFM);
5614           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5615         BIND(NEXT_32_PRFM_START);
5616           prfm(Address(src, SoftwarePrefetchHintDistance));
5617           orr(v4, T16B, Vtmp1, Vtmp2);
5618           orr(v5, T16B, Vtmp3, Vtmp4);
5619           uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
5620           uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
5621           stpq(Vtmp1, Vtmp3, dst);
5622           uzp2(v5, T16B, v4, v5); // high bytes
5623           umov(tmp2, v5, D, 1);
5624           fmovd(tmp1, v5);
5625           orr(tmp1, tmp1, tmp2);
5626           cbnz(tmp1, LOOP_8);
5627           sub(len, len, 32);
5628           add(dst, dst, 32);
5629           add(src, src, 64);
5630           cmp(len, SoftwarePrefetchHintDistance/2 + 16);
5631           br(GE, NEXT_32_PRFM);
5632           cmp(len, 32);
5633           br(LT, LOOP_8);
5634         BIND(NEXT_32);
5635           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5636         BIND(NEXT_32_START);
5637       } else {
5638         BIND(NEXT_32);
5639           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5640       }
5641       prfm(Address(src, SoftwarePrefetchHintDistance));
5642       uzp1(v4, T16B, Vtmp1, Vtmp2);
5643       uzp1(v5, T16B, Vtmp3, Vtmp4);
5644       stpq(v4, v5, dst);
5645       orr(Vtmp1, T16B, Vtmp1, Vtmp2);
5646       orr(Vtmp3, T16B, Vtmp3, Vtmp4);
5647       uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
5648       umov(tmp2, Vtmp1, D, 1);
5649       fmovd(tmp1, Vtmp1);
5650       orr(tmp1, tmp1, tmp2);
5651       cbnz(tmp1, LOOP_8);
5652       sub(len, len, 32);
5653       add(dst, dst, 32);
5654       add(src, src, 64);
5655       cmp(len, 32);
5656       br(GE, NEXT_32);
5657       cbz(len, DONE);
5658 
5659     BIND(LOOP_8);
5660       cmp(len, 8);
5661       br(LT, LOOP_1);
5662     BIND(NEXT_8);
5663       ld1(Vtmp1, T8H, src);
5664       uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
5665       uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
5666       strd(Vtmp2, dst);
5667       fmovd(tmp1, Vtmp3);
5668       cbnz(tmp1, NEXT_1);
5669 
5670       sub(len, len, 8);
5671       add(dst, dst, 8);
5672       add(src, src, 16);
5673       cmp(len, 8);
5674       br(GE, NEXT_8);
5675 
5676     BIND(LOOP_1);
5677 #endif
5678     cbz(len, DONE);
5679     BIND(NEXT_1);
5680       ldrh(tmp1, Address(post(src, 2)));
5681       strb(tmp1, Address(post(dst, 1)));
5682       tst(tmp1, 0xff00);
5683       br(NE, SET_RESULT);
5684       subs(len, len, 1);
5685       br(GT, NEXT_1);
5686 
5687     BIND(SET_RESULT);
5688       sub(result, result, len); // Return the index where we stopped;
5689                                 // len == 0 here means all characters
5690                                 // were processed
5691     BIND(DONE);
5692 }
5693 
5694 
5695 // Inflate byte[] array to char[].
5696 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5697                                         FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
5698                                         Register tmp4) {
5699   Label big, done, after_init, to_stub;
5700 
5701   assert_different_registers(src, dst, len, tmp4, rscratch1);
5702 
5703   fmovd(vtmp1, zr);
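       // vtmp1 is kept at zero throughout: zip1 with vtmp1 as the second
       // source interleaves each Latin1 byte with a zero byte, widening it
       // into a little-endian UTF-16 char.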
5704   lsrw(tmp4, len, 3);
5705   bind(after_init);
5706   cbnzw(tmp4, big);
5707   // Short string: less than 8 bytes.
5708   {
5709     Label loop, tiny;
5710 
5711     cmpw(len, 4);
5712     br(LT, tiny);
5713     // Use SIMD to do 4 bytes.
5714     ldrs(vtmp2, post(src, 4));
5715     zip1(vtmp3, T8B, vtmp2, vtmp1);
5716     subw(len, len, 4);
5717     strd(vtmp3, post(dst, 8));
5718 
5719     cbzw(len, done);
5720 
5721     // Do the remaining bytes by steam.
5722     bind(loop);
5723     ldrb(tmp4, post(src, 1));
5724     strh(tmp4, post(dst, 2));
5725     subw(len, len, 1);
5726 
5727     bind(tiny);
5728     cbnz(len, loop);
5729 
5730     b(done);
5731   }
5732 
5733   if (SoftwarePrefetchHintDistance >= 0) {
5734     bind(to_stub);
5735       RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5736       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5737       trampoline_call(stub);
5738       b(after_init);
5739   }
5740 
5741   // Unpack the bytes 8 at a time.
5742   bind(big);
5743   {
5744     Label loop, around, loop_last, loop_start;
5745 
5746     if (SoftwarePrefetchHintDistance >= 0) {
5747       const int large_loop_threshold = (64 + 16)/8;
5748       ldrd(vtmp2, post(src, 8));
5749       andw(len, len, 7);
5750       cmp(tmp4, large_loop_threshold);
5751       br(GE, to_stub);
5752       b(loop_start);
5753 
5754       bind(loop);
5755       ldrd(vtmp2, post(src, 8));
5756       bind(loop_start);
5757       subs(tmp4, tmp4, 1);
5758       br(EQ, loop_last);
5759       zip1(vtmp2, T16B, vtmp2, vtmp1);
5760       ldrd(vtmp3, post(src, 8));
5761       st1(vtmp2, T8H, post(dst, 16));
5762       subs(tmp4, tmp4, 1);
5763       zip1(vtmp3, T16B, vtmp3, vtmp1);
5764       st1(vtmp3, T8H, post(dst, 16));
5765       br(NE, loop);
5766       b(around);
5767       bind(loop_last);
5768       zip1(vtmp2, T16B, vtmp2, vtmp1);
5769       st1(vtmp2, T8H, post(dst, 16));
5770       bind(around);
5771       cbz(len, done);
5772     } else {
5773       andw(len, len, 7);
5774       bind(loop);
5775       ldrd(vtmp2, post(src, 8));
5776       sub(tmp4, tmp4, 1);
5777       zip1(vtmp3, T16B, vtmp2, vtmp1);
5778       st1(vtmp3, T8H, post(dst, 16));
5779       cbnz(tmp4, loop);
5780     }
5781   }
5782 
5783   // Do the tail of up to 8 bytes.
5784   add(src, src, len);
5785   ldrd(vtmp3, Address(src, -8));
5786   add(dst, dst, len, ext::uxtw, 1);
5787   zip1(vtmp3, T16B, vtmp3, vtmp1);
5788   strq(vtmp3, Address(dst, -16));
5789 
5790   bind(done);
5791 }
5792 
5793 // Compress char[] array to byte[].
5794 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5795                                          FloatRegister tmp1Reg, FloatRegister tmp2Reg,
5796                                          FloatRegister tmp3Reg, FloatRegister tmp4Reg,
5797                                          Register result) {
5798   encode_iso_array(src, dst, len, result,
5799                    tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
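       // encode_iso_array leaves len == 0 iff every char fit in a byte;
       // otherwise return 0 to signal that compression failed.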
5800   cmp(len, zr);
5801   csel(result, result, zr, EQ);
5802 }
5803 
5804 // get_thread() can be called anywhere inside generated code so we
5805 // need to save whatever non-callee save context might get clobbered
5806 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5807 // the call setup code.
5808 //
5809 // aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5810 //
5811 void MacroAssembler::get_thread(Register dst) {
5812   RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
5813   push(saved_regs, sp);
5814 
5815   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5816   blrt(lr, 1, 0, 1);
5817   if (dst != c_rarg0) {
5818     mov(dst, c_rarg0);
5819   }
5820 
5821   pop(saved_regs, sp);
5822 }