/*
 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/collectedHeap.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target - branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
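      // For example (illustrative): with branch == 0x7f0001230 and
      // target == 0x7f0005678, pc_page == 0x7f0001 and adr_page == 0x7f0005,
      // so the adrp gets a page offset of 4 while offset_lo == 0x678 goes
      // into the immediate of the paired ldr/str or add.
      //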
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                   Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}
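
// For example (illustrative): patching an unconditional branch instruction at
// `branch` so that it reaches `target` rewrites bits 25..0 of the insn with
// (target - branch) >> 2 and returns 4, since only one instruction changed.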

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}
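
// For example (illustrative): a narrow OOP with value 0x12345678 is patched as
//   movz  Rd, #0x1234, lsl #16   // upper half, n >> 16
//   movk  Rd, #0x5678            // lower half, n & 0xffff
// which is why the narrow case above touches exactly two instructions.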

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
               Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}
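
// Note that target_addr_for_insn() is the decoding inverse of
// pd_patch_instruction_size(): patching an instruction sequence to reach
// `target` and then decoding it again yields `target` (the move-wide case
// in pd_patch_instruction_size() asserts exactly this round trip).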

void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  dsb(Assembler::SY);
}

void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling_page.  We don't want this poll to
// return false (i.e. not safepointing) while a later poll of the global
// SafepointSynchronize::_state spuriously returns true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp and sp of the last Java frame have to be
// recorded in the (thread-local) JavaThread object. When leaving C land,
// the last Java fp has to be reset to 0. This is required to allow proper
// stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  if (last_java_pc != NULL) {
    adr(scratch, last_java_pc);
  } else {
    // FIXME: This is almost never correct.  We should delete all
    // cases of set_last_Java_frame with last_java_pc=NULL and use the
    // correct return address instead.
    adr(scratch, pc());
  }

  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4G (see the assert above), which
    // is within the +-4G range of ADRP.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4G (see the assert above), which
    // is within the +-4G range of ADRP.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
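  //
  // As a sketch (see markOop.hpp for the authoritative layout), a biased
  // 64-bit mark word looks like:
  //   [JavaThread* (54) | epoch (2) | unused (1) | age (4) | biased_lock (1) | lock (2)]
  // so masking with biased_lock_mask_in_place and comparing against
  // biased_lock_pattern below tests the low three bits.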
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result,   "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small,
// trampolines won't be emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    bool in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)
address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  address stub = start_a_stub(Compile::MAX_stubs_size/2);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call target from the data word below
  // - branch to it
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
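
// For example (illustrative; this mirrors how the template interpreter calls
// into the VM, entry point shown only as a sketch):
//
//   __ call_VM(noreg,
//              CAST_FROM_FN_PTR(address, InterpreterRuntime::build_method_counters),
//              rmethod);
//
// rthread is passed implicitly as the first C argument, and any oop result
// is fetched from the thread and placed in oop_result by call_VM_base().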

void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}
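
// For example (illustrative): if offset() == 0x104, align(16) emits three
// nops (12 bytes), leaving offset() == 0x110.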

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


void MacroAssembler::notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler::notify(type);
    // reset_last_Java_frame(true);
  } else {
    Assembler::notify(type);
  }
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
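  //
  // Schematically (illustrative), the memory walked below, starting at
  // recv_klass, is laid out as:
  //   [Klass fields ... | vtable (length entries) |
  //    itableOffsetEntry {interface, offset} ... NULL-terminated |
  //    itableMethodEntry {method} ... ]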
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}
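
// For example (illustrative): with a constant vtable_index of 5, the else
// branch above reduces to a single load from
// recv_klass + vtable_start_offset + 5 * wordSize + method_offset.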

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr); // load displayed supertype
  cmp(super_klass, rscratch1);

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// scans count 4 byte words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                 Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  mov(r0, reg);
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    lea(r0, addr);
    ldr(r0, Address(r0, 4 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop_addr");
}

Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1359   int stackElementSize = Interpreter::stackElementSize;
1360   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1361 #ifdef ASSERT
1362   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1363   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1364 #endif
1365   if (arg_slot.is_constant()) {
1366     return Address(esp, arg_slot.as_constant() * stackElementSize
1367                    + offset);
1368   } else {
1369     add(rscratch1, esp, arg_slot.as_register(),
1370         ext::uxtx, exact_log2(stackElementSize));
1371     return Address(rscratch1, offset);
1372   }
1373 }
1374 
1375 void MacroAssembler::call_VM_leaf_base(address entry_point,
1376                                        int number_of_arguments,
1377                                        Label *retaddr) {
1378   call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
1379 }
1380 
1381 void MacroAssembler::call_VM_leaf_base1(address entry_point,
1382                                         int number_of_gp_arguments,
1383                                         int number_of_fp_arguments,
1384                                         ret_type type,
1385                                         Label *retaddr) {
1386   Label E, L;
1387 
1388   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1389 
  // We add 1 to number_of_gp_arguments because the thread in arg0 is
  // not counted
1392   mov(rscratch1, entry_point);
1393   blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1394   if (retaddr)
1395     bind(*retaddr);
1396 
1397   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1398   maybe_isb();
1399 }
1400 
1401 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1402   call_VM_leaf_base(entry_point, number_of_arguments);
1403 }
1404 
1405 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1406   pass_arg0(this, arg_0);
1407   call_VM_leaf_base(entry_point, 1);
1408 }
1409 
1410 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1411   pass_arg0(this, arg_0);
1412   pass_arg1(this, arg_1);
1413   call_VM_leaf_base(entry_point, 2);
1414 }
1415 
1416 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1417                                   Register arg_1, Register arg_2) {
1418   pass_arg0(this, arg_0);
1419   pass_arg1(this, arg_1);
1420   pass_arg2(this, arg_2);
1421   call_VM_leaf_base(entry_point, 3);
1422 }
1423 
1424 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1425   pass_arg0(this, arg_0);
1426   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1427 }
1428 
1429 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1430 
1431   assert(arg_0 != c_rarg1, "smashed arg");
1432   pass_arg1(this, arg_1);
1433   pass_arg0(this, arg_0);
1434   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1435 }
1436 
1437 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1438   assert(arg_0 != c_rarg2, "smashed arg");
1439   assert(arg_1 != c_rarg2, "smashed arg");
1440   pass_arg2(this, arg_2);
1441   assert(arg_0 != c_rarg1, "smashed arg");
1442   pass_arg1(this, arg_1);
1443   pass_arg0(this, arg_0);
1444   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1445 }
1446 
1447 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1448   assert(arg_0 != c_rarg3, "smashed arg");
1449   assert(arg_1 != c_rarg3, "smashed arg");
1450   assert(arg_2 != c_rarg3, "smashed arg");
1451   pass_arg3(this, arg_3);
1452   assert(arg_0 != c_rarg2, "smashed arg");
1453   assert(arg_1 != c_rarg2, "smashed arg");
1454   pass_arg2(this, arg_2);
1455   assert(arg_0 != c_rarg1, "smashed arg");
1456   pass_arg1(this, arg_1);
1457   pass_arg0(this, arg_0);
1458   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1459 }
1460 
1461 void MacroAssembler::null_check(Register reg, int offset) {
1462   if (needs_explicit_null_check(offset)) {
    // provoke OS NULL exception if reg == NULL by
    // accessing M[reg] w/o changing any registers
    // NOTE: this is plenty to provoke a segv
1466     ldr(zr, Address(reg));
1467   } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS NULL exception if reg == NULL
1470   }
1471 }
1472 
1473 // MacroAssembler protected routines needed to implement
1474 // public methods
1475 
1476 void MacroAssembler::mov(Register r, Address dest) {
1477   code_section()->relocate(pc(), dest.rspec());
1478   u_int64_t imm64 = (u_int64_t)dest.target();
1479   movptr(r, imm64);
1480 }
1481 
1482 // Move a constant pointer into r.  In AArch64 mode the virtual
1483 // address space is 48 bits in size, so we only need three
1484 // instructions to create a patchable instruction sequence that can
1485 // reach anywhere.
1486 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1487 #ifndef PRODUCT
1488   {
1489     char buffer[64];
1490     snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64);
1491     block_comment(buffer);
1492   }
1493 #endif
1494   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1495   movz(r, imm64 & 0xffff);
1496   imm64 >>= 16;
1497   movk(r, imm64 & 0xffff, 16);
1498   imm64 >>= 16;
1499   movk(r, imm64 & 0xffff, 32);
1500 }
1501 
1502 // Macro to mov replicated immediate to vector register.
1503 //  Vd will get the following values for different arrangements in T
1504 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1505 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1506 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1507 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1508 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1509 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1510 //   T1D/T2D: invalid
1511 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1512   assert(T != T1D && T != T2D, "invalid arrangement");
1513   if (T == T8B || T == T16B) {
1514     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1515     movi(Vd, T, imm32 & 0xff, 0);
1516     return;
1517   }
1518   u_int32_t nimm32 = ~imm32;
1519   if (T == T4H || T == T8H) {
1520     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1521     imm32 &= 0xffff;
1522     nimm32 &= 0xffff;
1523   }
1524   u_int32_t x = imm32;
1525   int movi_cnt = 0;
1526   int movn_cnt = 0;
1527   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1528   x = nimm32;
1529   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1530   if (movn_cnt < movi_cnt) imm32 = nimm32;
1531   unsigned lsl = 0;
1532   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1533   if (movn_cnt < movi_cnt)
1534     mvni(Vd, T, imm32 & 0xff, lsl);
1535   else
1536     movi(Vd, T, imm32 & 0xff, lsl);
1537   imm32 >>= 8; lsl += 8;
1538   while (imm32) {
1539     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1540     if (movn_cnt < movi_cnt)
1541       bici(Vd, T, imm32 & 0xff, lsl);
1542     else
1543       orri(Vd, T, imm32 & 0xff, lsl);
1544     lsl += 8; imm32 >>= 8;
1545   }
1546 }
1547 
1548 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1549 {
1550 #ifndef PRODUCT
1551   {
1552     char buffer[64];
1553     snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64);
1554     block_comment(buffer);
1555   }
1556 #endif
1557   if (operand_valid_for_logical_immediate(false, imm64)) {
1558     orr(dst, zr, imm64);
1559   } else {
1560     // we can use a combination of MOVZ or MOVN with
1561     // MOVK to build up the constant
1562     u_int64_t imm_h[4];
1563     int zero_count = 0;
1564     int neg_count = 0;
1565     int i;
1566     for (i = 0; i < 4; i++) {
1567       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1568       if (imm_h[i] == 0) {
1569         zero_count++;
1570       } else if (imm_h[i] == 0xffffL) {
1571         neg_count++;
1572       }
1573     }
1574     if (zero_count == 4) {
1575       // one MOVZ will do
1576       movz(dst, 0);
1577     } else if (neg_count == 4) {
1578       // one MOVN will do
1579       movn(dst, 0);
1580     } else if (zero_count == 3) {
1581       for (i = 0; i < 4; i++) {
1582         if (imm_h[i] != 0L) {
1583           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1584           break;
1585         }
1586       }
1587     } else if (neg_count == 3) {
1588       // one MOVN will do
1589       for (int i = 0; i < 4; i++) {
1590         if (imm_h[i] != 0xffffL) {
1591           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1592           break;
1593         }
1594       }
1595     } else if (zero_count == 2) {
1596       // one MOVZ and one MOVK will do
1597       for (i = 0; i < 3; i++) {
1598         if (imm_h[i] != 0L) {
1599           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1600           i++;
1601           break;
1602         }
1603       }
1604       for (;i < 4; i++) {
1605         if (imm_h[i] != 0L) {
1606           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1607         }
1608       }
1609     } else if (neg_count == 2) {
1610       // one MOVN and one MOVK will do
1611       for (i = 0; i < 4; i++) {
1612         if (imm_h[i] != 0xffffL) {
1613           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1614           i++;
1615           break;
1616         }
1617       }
1618       for (;i < 4; i++) {
1619         if (imm_h[i] != 0xffffL) {
1620           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1621         }
1622       }
1623     } else if (zero_count == 1) {
1624       // one MOVZ and two MOVKs will do
1625       for (i = 0; i < 4; i++) {
1626         if (imm_h[i] != 0L) {
1627           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1628           i++;
1629           break;
1630         }
1631       }
1632       for (;i < 4; i++) {
1633         if (imm_h[i] != 0x0L) {
1634           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1635         }
1636       }
1637     } else if (neg_count == 1) {
1638       // one MOVN and two MOVKs will do
1639       for (i = 0; i < 4; i++) {
1640         if (imm_h[i] != 0xffffL) {
1641           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1642           i++;
1643           break;
1644         }
1645       }
1646       for (;i < 4; i++) {
1647         if (imm_h[i] != 0xffffL) {
1648           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1649         }
1650       }
1651     } else {
1652       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1653       movz(dst, (u_int32_t)imm_h[0], 0);
1654       for (i = 1; i < 4; i++) {
1655         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1656       }
1657     }
1658   }
1659 }
1660 
1661 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1662 {
1663 #ifndef PRODUCT
1664     {
1665       char buffer[64];
1666       snprintf(buffer, sizeof(buffer), "0x%"PRIX32, imm32);
1667       block_comment(buffer);
1668     }
1669 #endif
1670   if (operand_valid_for_logical_immediate(true, imm32)) {
1671     orrw(dst, zr, imm32);
1672   } else {
1673     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1674     // constant
1675     u_int32_t imm_h[2];
1676     imm_h[0] = imm32 & 0xffff;
1677     imm_h[1] = ((imm32 >> 16) & 0xffff);
1678     if (imm_h[0] == 0) {
1679       movzw(dst, imm_h[1], 16);
1680     } else if (imm_h[0] == 0xffff) {
1681       movnw(dst, imm_h[1] ^ 0xffff, 16);
1682     } else if (imm_h[1] == 0) {
1683       movzw(dst, imm_h[0], 0);
1684     } else if (imm_h[1] == 0xffff) {
1685       movnw(dst, imm_h[0] ^ 0xffff, 0);
1686     } else {
1687       // use a MOVZ and MOVK (makes it easier to debug)
1688       movzw(dst, imm_h[0], 0);
1689       movkw(dst, imm_h[1], 16);
1690     }
1691   }
1692 }
1693 
1694 // Form an address from base + offset in Rd.  Rd may or may
1695 // not actually be used: you must use the Address that is returned.
1696 // It is up to you to ensure that the shift provided matches the size
1697 // of your data.
1698 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1699   if (Address::offset_ok_for_immed(byte_offset, shift))
1700     // It fits; no need for any heroics
1701     return Address(base, byte_offset);
1702 
1703   // Don't do anything clever with negative or misaligned offsets
1704   unsigned mask = (1 << shift) - 1;
1705   if (byte_offset < 0 || byte_offset & mask) {
1706     mov(Rd, byte_offset);
1707     add(Rd, base, Rd);
1708     return Address(Rd);
1709   }
1710 
1711   // See if we can do this with two 12-bit offsets
1712   {
1713     unsigned long word_offset = byte_offset >> shift;
1714     unsigned long masked_offset = word_offset & 0xfff000;
1715     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1716         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1717       add(Rd, base, masked_offset << shift);
1718       word_offset -= masked_offset;
1719       return Address(Rd, word_offset << shift);
1720     }
1721   }
1722 
1723   // Do it the hard way
1724   mov(Rd, byte_offset);
1725   add(Rd, base, Rd);
1726   return Address(Rd);
1727 }
1728 
1729 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1730   if (UseLSE) {
1731     mov(tmp, 1);
1732     ldadd(Assembler::word, tmp, zr, counter_addr);
1733     return;
1734   }
1735   Label retry_load;
1736   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1737     prfm(Address(counter_addr), PSTL1STRM);
1738   bind(retry_load);
1739   // flush and load exclusive from the memory location
1740   ldxrw(tmp, counter_addr);
1741   addw(tmp, tmp, 1);
  // if we store+flush with no intervening write, tmp2 will be zero
1743   stxrw(tmp2, tmp, counter_addr);
1744   cbnzw(tmp2, retry_load);
1745 }
1746 
1747 
1748 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1749                                     bool want_remainder, Register scratch)
1750 {
1751   // Full implementation of Java idiv and irem.  The function
1752   // returns the (pc) offset of the div instruction - may be needed
1753   // for implicit exceptions.
1754   //
1755   // constraint : ra/rb =/= scratch
1756   //         normal case
1757   //
1758   // input : ra: dividend
1759   //         rb: divisor
1760   //
1761   // result: either
1762   //         quotient  (= ra idiv rb)
1763   //         remainder (= ra irem rb)
1764 
1765   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1766 
1767   int idivl_offset = offset();
1768   if (! want_remainder) {
1769     sdivw(result, ra, rb);
1770   } else {
1771     sdivw(scratch, ra, rb);
1772     Assembler::msubw(result, scratch, rb, ra);
1773   }
1774 
1775   return idivl_offset;
1776 }
1777 
1778 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1779                                     bool want_remainder, Register scratch)
1780 {
1781   // Full implementation of Java ldiv and lrem.  The function
1782   // returns the (pc) offset of the div instruction - may be needed
1783   // for implicit exceptions.
1784   //
1785   // constraint : ra/rb =/= scratch
1786   //         normal case
1787   //
1788   // input : ra: dividend
1789   //         rb: divisor
1790   //
1791   // result: either
1792   //         quotient  (= ra idiv rb)
1793   //         remainder (= ra irem rb)
1794 
1795   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1796 
1797   int idivq_offset = offset();
1798   if (! want_remainder) {
1799     sdiv(result, ra, rb);
1800   } else {
1801     sdiv(scratch, ra, rb);
1802     Assembler::msub(result, scratch, rb, ra);
1803   }
1804 
1805   return idivq_offset;
1806 }
1807 
1808 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1809   address prev = pc() - NativeMembar::instruction_size;
1810   address last = code()->last_insn();
1811   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1812     NativeMembar *bar = NativeMembar_at(prev);
1813     // We are merging two memory barrier instructions.  On AArch64 we
1814     // can do this simply by ORing them together.
1815     bar->set_kind(bar->get_kind() | order_constraint);
1816     BLOCK_COMMENT("merged membar");
1817   } else {
1818     code()->set_last_insn(pc());
1819     dmb(Assembler::barrier(order_constraint));
1820   }
1821 }
1822 
1823 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1824   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1825     merge_ldst(rt, adr, size_in_bytes, is_store);
1826     code()->clear_last_insn();
1827     return true;
1828   } else {
    assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8-byte or 4-byte loads/stores are supported.");
1830     const unsigned mask = size_in_bytes - 1;
1831     if (adr.getMode() == Address::base_plus_offset &&
1832         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1833       code()->set_last_insn(pc());
1834     }
1835     return false;
1836   }
1837 }
1838 
1839 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1840   // We always try to merge two adjacent loads into one ldp.
1841   if (!try_merge_ldst(Rx, adr, 8, false)) {
1842     Assembler::ldr(Rx, adr);
1843   }
1844 }
1845 
1846 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1847   // We always try to merge two adjacent loads into one ldp.
1848   if (!try_merge_ldst(Rw, adr, 4, false)) {
1849     Assembler::ldrw(Rw, adr);
1850   }
1851 }
1852 
1853 void MacroAssembler::str(Register Rx, const Address &adr) {
1854   // We always try to merge two adjacent stores into one stp.
1855   if (!try_merge_ldst(Rx, adr, 8, true)) {
1856     Assembler::str(Rx, adr);
1857   }
1858 }
1859 
1860 void MacroAssembler::strw(Register Rw, const Address &adr) {
1861   // We always try to merge two adjacent stores into one stp.
1862   if (!try_merge_ldst(Rw, adr, 4, true)) {
1863     Assembler::strw(Rw, adr);
1864   }
1865 }
1866 
1867 // MacroAssembler routines found actually to be needed
1868 
1869 void MacroAssembler::push(Register src)
1870 {
1871   str(src, Address(pre(esp, -1 * wordSize)));
1872 }
1873 
1874 void MacroAssembler::pop(Register dst)
1875 {
1876   ldr(dst, Address(post(esp, 1 * wordSize)));
1877 }
1878 
1879 // Note: load_unsigned_short used to be called load_unsigned_word.
1880 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1881   int off = offset();
1882   ldrh(dst, src);
1883   return off;
1884 }
1885 
1886 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1887   int off = offset();
1888   ldrb(dst, src);
1889   return off;
1890 }
1891 
1892 int MacroAssembler::load_signed_short(Register dst, Address src) {
1893   int off = offset();
1894   ldrsh(dst, src);
1895   return off;
1896 }
1897 
1898 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1899   int off = offset();
1900   ldrsb(dst, src);
1901   return off;
1902 }
1903 
1904 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1905   int off = offset();
1906   ldrshw(dst, src);
1907   return off;
1908 }
1909 
1910 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1911   int off = offset();
1912   ldrsbw(dst, src);
1913   return off;
1914 }
1915 
1916 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1917   switch (size_in_bytes) {
1918   case  8:  ldr(dst, src); break;
1919   case  4:  ldrw(dst, src); break;
1920   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1921   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1922   default:  ShouldNotReachHere();
1923   }
1924 }
1925 
1926 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1927   switch (size_in_bytes) {
1928   case  8:  str(src, dst); break;
1929   case  4:  strw(src, dst); break;
1930   case  2:  strh(src, dst); break;
1931   case  1:  strb(src, dst); break;
1932   default:  ShouldNotReachHere();
1933   }
1934 }
1935 
1936 void MacroAssembler::decrementw(Register reg, int value)
1937 {
1938   if (value < 0)  { incrementw(reg, -value);      return; }
1939   if (value == 0) {                               return; }
1940   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1941   /* else */ {
1942     guarantee(reg != rscratch2, "invalid dst for register decrement");
1943     movw(rscratch2, (unsigned)value);
1944     subw(reg, reg, rscratch2);
1945   }
1946 }
1947 
1948 void MacroAssembler::decrement(Register reg, int value)
1949 {
1950   if (value < 0)  { increment(reg, -value);      return; }
1951   if (value == 0) {                              return; }
1952   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1953   /* else */ {
1954     assert(reg != rscratch2, "invalid dst for register decrement");
1955     mov(rscratch2, (unsigned long)value);
1956     sub(reg, reg, rscratch2);
1957   }
1958 }
1959 
1960 void MacroAssembler::decrementw(Address dst, int value)
1961 {
1962   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1963   if (dst.getMode() == Address::literal) {
1964     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1965     lea(rscratch2, dst);
1966     dst = Address(rscratch2);
1967   }
1968   ldrw(rscratch1, dst);
1969   decrementw(rscratch1, value);
1970   strw(rscratch1, dst);
1971 }
1972 
1973 void MacroAssembler::decrement(Address dst, int value)
1974 {
1975   assert(!dst.uses(rscratch1), "invalid address for decrement");
1976   if (dst.getMode() == Address::literal) {
1977     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1978     lea(rscratch2, dst);
1979     dst = Address(rscratch2);
1980   }
1981   ldr(rscratch1, dst);
1982   decrement(rscratch1, value);
1983   str(rscratch1, dst);
1984 }
1985 
1986 void MacroAssembler::incrementw(Register reg, int value)
1987 {
1988   if (value < 0)  { decrementw(reg, -value);      return; }
1989   if (value == 0) {                               return; }
1990   if (value < (1 << 12)) { addw(reg, reg, value); return; }
1991   /* else */ {
1992     assert(reg != rscratch2, "invalid dst for register increment");
1993     movw(rscratch2, (unsigned)value);
1994     addw(reg, reg, rscratch2);
1995   }
1996 }
1997 
1998 void MacroAssembler::increment(Register reg, int value)
1999 {
2000   if (value < 0)  { decrement(reg, -value);      return; }
2001   if (value == 0) {                              return; }
2002   if (value < (1 << 12)) { add(reg, reg, value); return; }
2003   /* else */ {
2004     assert(reg != rscratch2, "invalid dst for register increment");
2005     movw(rscratch2, (unsigned)value);
2006     add(reg, reg, rscratch2);
2007   }
2008 }
2009 
2010 void MacroAssembler::incrementw(Address dst, int value)
2011 {
2012   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2013   if (dst.getMode() == Address::literal) {
2014     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2015     lea(rscratch2, dst);
2016     dst = Address(rscratch2);
2017   }
2018   ldrw(rscratch1, dst);
2019   incrementw(rscratch1, value);
2020   strw(rscratch1, dst);
2021 }
2022 
2023 void MacroAssembler::increment(Address dst, int value)
2024 {
2025   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2026   if (dst.getMode() == Address::literal) {
2027     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2028     lea(rscratch2, dst);
2029     dst = Address(rscratch2);
2030   }
2031   ldr(rscratch1, dst);
2032   increment(rscratch1, value);
2033   str(rscratch1, dst);
2034 }
2035 
2036 
2037 void MacroAssembler::pusha() {
2038   push(0x7fffffff, sp);
2039 }
2040 
2041 void MacroAssembler::popa() {
2042   pop(0x7fffffff, sp);
2043 }
2044 
2045 // Push lots of registers in the bit set supplied.  Don't push sp.
2046 // Return the number of words pushed
2047 int MacroAssembler::push(unsigned int bitset, Register stack) {
2048   int words_pushed = 0;
2049 
2050   // Scan bitset to accumulate register pairs
2051   unsigned char regs[32];
2052   int count = 0;
2053   for (int reg = 0; reg <= 30; reg++) {
2054     if (1 & bitset)
2055       regs[count++] = reg;
2056     bitset >>= 1;
2057   }
2058   regs[count++] = zr->encoding_nocheck();
2059   count &= ~1;  // Only push an even nuber of regs
2060 
2061   if (count) {
2062     stp(as_Register(regs[0]), as_Register(regs[1]),
2063        Address(pre(stack, -count * wordSize)));
2064     words_pushed += 2;
2065   }
2066   for (int i = 2; i < count; i += 2) {
2067     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2068        Address(stack, i * wordSize));
2069     words_pushed += 2;
2070   }
2071 
2072   assert(words_pushed == count, "oops, pushed != count");
2073 
2074   return count;
2075 }
2076 
2077 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2078   int words_pushed = 0;
2079 
2080   // Scan bitset to accumulate register pairs
2081   unsigned char regs[32];
2082   int count = 0;
2083   for (int reg = 0; reg <= 30; reg++) {
2084     if (1 & bitset)
2085       regs[count++] = reg;
2086     bitset >>= 1;
2087   }
2088   regs[count++] = zr->encoding_nocheck();
2089   count &= ~1;
2090 
2091   for (int i = 2; i < count; i += 2) {
2092     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2093        Address(stack, i * wordSize));
2094     words_pushed += 2;
2095   }
2096   if (count) {
2097     ldp(as_Register(regs[0]), as_Register(regs[1]),
2098        Address(post(stack, count * wordSize)));
2099     words_pushed += 2;
2100   }
2101 
2102   assert(words_pushed == count, "oops, pushed != count");
2103 
2104   return count;
2105 }
2106 #ifdef ASSERT
2107 void MacroAssembler::verify_heapbase(const char* msg) {
2108 #if 0
2109   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2110   assert (Universe::heap() != NULL, "java heap should be initialized");
2111   if (CheckCompressedOops) {
2112     Label ok;
2113     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2114     cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2115     br(Assembler::EQ, ok);
2116     stop(msg);
2117     bind(ok);
2118     pop(1 << rscratch1->encoding(), sp);
2119   }
2120 #endif
2121 }
2122 #endif
2123 
2124 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2125   Label done, not_weak;
2126   cbz(value, done);           // Use NULL as-is.
2127 
2128   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
  tbz(value, 0, not_weak);    // Test for jweak tag.
2130 
2131   // Resolve jweak.
2132   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2133                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2134   verify_oop(value);
2135   b(done);
2136 
2137   bind(not_weak);
2138   // Resolve (untagged) jobject.
2139   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2140   verify_oop(value);
2141   bind(done);
2142 }
2143 
2144 void MacroAssembler::stop(const char* msg) {
2145   address ip = pc();
2146   pusha();
  // We use movptr rather than mov here because we need the code size not
  // to depend on the pointer value of msg; otherwise C2 can observe the
  // same node with different sizes when emitted in a scratch buffer and
  // later when emitted for good.
2151   movptr(c_rarg0, (uintptr_t)msg);
2152   movptr(c_rarg1, (uintptr_t)ip);
2153   mov(c_rarg2, sp);
2154   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2155   // call(c_rarg3);
2156   blrt(c_rarg3, 3, 0, 1);
2157   hlt(0);
2158 }
2159 
2160 void MacroAssembler::unimplemented(const char* what) {
2161   const char* buf = NULL;
2162   {
2163     ResourceMark rm;
2164     stringStream ss;
2165     ss.print("unimplemented: %s", what);
2166     buf = code_string(ss.as_string());
2167   }
2168   stop(buf);
2169 }
2170 
2171 // If a constant does not fit in an immediate field, generate some
2172 // number of MOV instructions and then perform the operation.
2173 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2174                                            add_sub_imm_insn insn1,
2175                                            add_sub_reg_insn insn2) {
2176   assert(Rd != zr, "Rd = zr and not setting flags?");
2177   if (operand_valid_for_add_sub_immediate((int)imm)) {
2178     (this->*insn1)(Rd, Rn, imm);
2179   } else {
2180     if (uabs(imm) < (1 << 24)) {
2181        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2182        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2183     } else {
2184        assert_different_registers(Rd, Rn);
2185        mov(Rd, (uint64_t)imm);
2186        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2187     }
2188   }
2189 }
2190 
// Separate version which sets the flags. Optimisations are more restricted
// because we must set the flags correctly.
2193 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2194                                            add_sub_imm_insn insn1,
2195                                            add_sub_reg_insn insn2) {
2196   if (operand_valid_for_add_sub_immediate((int)imm)) {
2197     (this->*insn1)(Rd, Rn, imm);
2198   } else {
2199     assert_different_registers(Rd, Rn);
2200     assert(Rd != zr, "overflow in immediate operand");
2201     mov(Rd, (uint64_t)imm);
2202     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2203   }
2204 }
2205 
2206 
2207 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2208   if (increment.is_register()) {
2209     add(Rd, Rn, increment.as_register());
2210   } else {
2211     add(Rd, Rn, increment.as_constant());
2212   }
2213 }
2214 
2215 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2216   if (increment.is_register()) {
2217     addw(Rd, Rn, increment.as_register());
2218   } else {
2219     addw(Rd, Rn, increment.as_constant());
2220   }
2221 }
2222 
2223 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2224   if (decrement.is_register()) {
2225     sub(Rd, Rn, decrement.as_register());
2226   } else {
2227     sub(Rd, Rn, decrement.as_constant());
2228   }
2229 }
2230 
2231 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2232   if (decrement.is_register()) {
2233     subw(Rd, Rn, decrement.as_register());
2234   } else {
2235     subw(Rd, Rn, decrement.as_constant());
2236   }
2237 }
2238 
2239 void MacroAssembler::reinit_heapbase()
2240 {
2241   if (UseCompressedOops) {
2242     if (Universe::is_fully_initialized()) {
2243       mov(rheapbase, Universe::narrow_ptrs_base());
2244     } else {
2245       lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2246       ldr(rheapbase, Address(rheapbase));
2247     }
2248   }
2249 }
2250 
2251 // this simulates the behaviour of the x86 cmpxchg instruction using a
2252 // load linked/store conditional pair. we use the acquire/release
2253 // versions of these instructions so that we flush pending writes as
2254 // per Java semantics.
2255 
2256 // n.b the x86 version assumes the old value to be compared against is
2257 // in rax and updates rax with the value located in memory if the
2258 // cmpxchg fails. we supply a register for the old value explicitly
2259 
2260 // the aarch64 load linked/store conditional instructions do not
2261 // accept an offset. so, unlike x86, we must provide a plain register
2262 // to identify the memory word to be compared/exchanged rather than a
2263 // register+offset Address.
2264 
2265 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2266                                 Label &succeed, Label *fail) {
2267   // oldv holds comparison value
2268   // newv holds value to write in exchange
2269   // addr identifies memory word to compare against/update
2270   if (UseLSE) {
2271     mov(tmp, oldv);
2272     casal(Assembler::xword, oldv, newv, addr);
2273     cmp(tmp, oldv);
2274     br(Assembler::EQ, succeed);
2275     membar(AnyAny);
2276   } else {
2277     Label retry_load, nope;
2278     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2279       prfm(Address(addr), PSTL1STRM);
2280     bind(retry_load);
2281     // flush and load exclusive from the memory location
2282     // and fail if it is not what we expect
2283     ldaxr(tmp, addr);
2284     cmp(tmp, oldv);
2285     br(Assembler::NE, nope);
    // if we store+flush with no intervening write, tmp will be zero
2287     stlxr(tmp, newv, addr);
2288     cbzw(tmp, succeed);
2289     // retry so we only ever return after a load fails to compare
2290     // ensures we don't return a stale value after a failed write.
2291     b(retry_load);
2292     // if the memory word differs we return it in oldv and signal a fail
2293     bind(nope);
2294     membar(AnyAny);
2295     mov(oldv, tmp);
2296   }
2297   if (fail)
2298     b(*fail);
2299 }
2300 
2301 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2302                                         Label &succeed, Label *fail) {
2303   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2304   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2305 }
2306 
2307 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2308                                 Label &succeed, Label *fail) {
2309   // oldv holds comparison value
2310   // newv holds value to write in exchange
2311   // addr identifies memory word to compare against/update
2312   // tmp returns 0/1 for success/failure
2313   if (UseLSE) {
2314     mov(tmp, oldv);
2315     casal(Assembler::word, oldv, newv, addr);
2316     cmp(tmp, oldv);
2317     br(Assembler::EQ, succeed);
2318     membar(AnyAny);
2319   } else {
2320     Label retry_load, nope;
2321     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2322       prfm(Address(addr), PSTL1STRM);
2323     bind(retry_load);
2324     // flush and load exclusive from the memory location
2325     // and fail if it is not what we expect
2326     ldaxrw(tmp, addr);
2327     cmp(tmp, oldv);
2328     br(Assembler::NE, nope);
    // if we store+flush with no intervening write, tmp will be zero
2330     stlxrw(tmp, newv, addr);
2331     cbzw(tmp, succeed);
2332     // retry so we only ever return after a load fails to compare
2333     // ensures we don't return a stale value after a failed write.
2334     b(retry_load);
2335     // if the memory word differs we return it in oldv and signal a fail
2336     bind(nope);
2337     membar(AnyAny);
2338     mov(oldv, tmp);
2339   }
2340   if (fail)
2341     b(*fail);
2342 }
2343 
// A generic CAS; success or failure is in the EQ flag.  A weak CAS
// doesn't retry and may fail spuriously.  If the old value is wanted,
// pass a register for the result; otherwise pass noreg.
2347 
2348 // Clobbers rscratch1
2349 void MacroAssembler::cmpxchg(Register addr, Register expected,
2350                              Register new_val,
2351                              enum operand_size size,
2352                              bool acquire, bool release,
2353                              bool weak,
2354                              Register result) {
2355   if (result == noreg)  result = rscratch1;
2356   if (UseLSE) {
2357     mov(result, expected);
2358     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2359     cmp(result, expected);
2360   } else {
2361     BLOCK_COMMENT("cmpxchg {");
2362     Label retry_load, done;
2363     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2364       prfm(Address(addr), PSTL1STRM);
2365     bind(retry_load);
2366     load_exclusive(result, addr, size, acquire);
2367     if (size == xword)
2368       cmp(result, expected);
2369     else
2370       cmpw(result, expected);
2371     br(Assembler::NE, done);
2372     store_exclusive(rscratch1, new_val, addr, size, release);
2373     if (weak) {
2374       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2375     } else {
2376       cbnzw(rscratch1, retry_load);
2377     }
2378     bind(done);
2379     BLOCK_COMMENT("} cmpxchg");
2380   }
2381 }
2382 
2383 void MacroAssembler::cmpxchg_oop(Register addr, Register expected, Register new_val,
2384                                  bool acquire, bool release, bool weak, bool encode,
2385                                  Register tmp1, Register tmp2,
2386                                  Register tmp3, Register result) {
2387   BarrierSetAssembler* bsa = BarrierSet::barrier_set()->barrier_set_assembler();
2388   bsa->cmpxchg_oop(this, addr, expected, new_val, acquire, release, weak, encode, tmp1, tmp2, tmp3, result);
2389 }
2390 
2391 static bool different(Register a, RegisterOrConstant b, Register c) {
2392   if (b.is_constant())
2393     return a != c;
2394   else
2395     return a != b.as_register() && a != c && b.as_register() != c;
2396 }
2397 
2398 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2399 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2400   if (UseLSE) {                                                         \
2401     prev = prev->is_valid() ? prev : zr;                                \
2402     if (incr.is_register()) {                                           \
2403       AOP(sz, incr.as_register(), prev, addr);                          \
2404     } else {                                                            \
2405       mov(rscratch2, incr.as_constant());                               \
2406       AOP(sz, rscratch2, prev, addr);                                   \
2407     }                                                                   \
2408     return;                                                             \
2409   }                                                                     \
2410   Register result = rscratch2;                                          \
2411   if (prev->is_valid())                                                 \
2412     result = different(prev, incr, addr) ? prev : rscratch2;            \
2413                                                                         \
2414   Label retry_load;                                                     \
2415   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2416     prfm(Address(addr), PSTL1STRM);                                     \
2417   bind(retry_load);                                                     \
2418   LDXR(result, addr);                                                   \
2419   OP(rscratch1, result, incr);                                          \
2420   STXR(rscratch2, rscratch1, addr);                                     \
2421   cbnzw(rscratch2, retry_load);                                         \
2422   if (prev->is_valid() && prev != result) {                             \
2423     IOP(prev, rscratch1, incr);                                         \
2424   }                                                                     \
2425 }
2426 
2427 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2428 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2429 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2430 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2431 
2432 #undef ATOMIC_OP
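
// For illustration, atomic_add(r0, 1, r2) without LSE expands to roughly:
//   retry:
//     ldxr  r0, [r2]                 // r0 returns the previous value
//     add   rscratch1, r0, #1
//     stxr  rscratch2, rscratch1, [r2]
//     cbnzw rscratch2, retry
// whereas with UseLSE the loop becomes a single ldadd (plus a mov when the
// increment is a constant).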
2433 
2434 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2435 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2436   if (UseLSE) {                                                         \
2437     prev = prev->is_valid() ? prev : zr;                                \
2438     AOP(sz, newv, prev, addr);                                          \
2439     return;                                                             \
2440   }                                                                     \
2441   Register result = rscratch2;                                          \
2442   if (prev->is_valid())                                                 \
2443     result = different(prev, newv, addr) ? prev : rscratch2;            \
2444                                                                         \
2445   Label retry_load;                                                     \
2446   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2447     prfm(Address(addr), PSTL1STRM);                                     \
2448   bind(retry_load);                                                     \
2449   LDXR(result, addr);                                                   \
2450   STXR(rscratch1, newv, addr);                                          \
2451   cbnzw(rscratch1, retry_load);                                         \
2452   if (prev->is_valid() && prev != result)                               \
2453     mov(prev, result);                                                  \
2454 }
2455 
2456 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2457 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2458 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2459 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2460 
2461 #undef ATOMIC_XCHG
2462 
2463 #ifndef PRODUCT
2464 extern "C" void findpc(intptr_t x);
2465 #endif
2466 
2467 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2468 {
  // In order to get locks to work, we need to fake an in_VM state
2470   if (ShowMessageBoxOnError ) {
2471     JavaThread* thread = JavaThread::current();
2472     JavaThreadState saved_state = thread->thread_state();
2473     thread->set_thread_state(_thread_in_vm);
2474 #ifndef PRODUCT
2475     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2476       ttyLocker ttyl;
2477       BytecodeCounter::print();
2478     }
2479 #endif
2480     if (os::message_box(msg, "Execution stopped, print registers?")) {
2481       ttyLocker ttyl;
2482       tty->print_cr(" pc = 0x%016lx", pc);
2483 #ifndef PRODUCT
2484       tty->cr();
2485       findpc(pc);
2486       tty->cr();
2487 #endif
2488       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2489       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2490       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2491       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2492       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2493       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2494       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2495       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2496       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2497       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2498       tty->print_cr("r10 = 0x%016lx", regs[10]);
2499       tty->print_cr("r11 = 0x%016lx", regs[11]);
2500       tty->print_cr("r12 = 0x%016lx", regs[12]);
2501       tty->print_cr("r13 = 0x%016lx", regs[13]);
2502       tty->print_cr("r14 = 0x%016lx", regs[14]);
2503       tty->print_cr("r15 = 0x%016lx", regs[15]);
2504       tty->print_cr("r16 = 0x%016lx", regs[16]);
2505       tty->print_cr("r17 = 0x%016lx", regs[17]);
2506       tty->print_cr("r18 = 0x%016lx", regs[18]);
2507       tty->print_cr("r19 = 0x%016lx", regs[19]);
2508       tty->print_cr("r20 = 0x%016lx", regs[20]);
2509       tty->print_cr("r21 = 0x%016lx", regs[21]);
2510       tty->print_cr("r22 = 0x%016lx", regs[22]);
2511       tty->print_cr("r23 = 0x%016lx", regs[23]);
2512       tty->print_cr("r24 = 0x%016lx", regs[24]);
2513       tty->print_cr("r25 = 0x%016lx", regs[25]);
2514       tty->print_cr("r26 = 0x%016lx", regs[26]);
2515       tty->print_cr("r27 = 0x%016lx", regs[27]);
2516       tty->print_cr("r28 = 0x%016lx", regs[28]);
2517       tty->print_cr("r30 = 0x%016lx", regs[30]);
2518       tty->print_cr("r31 = 0x%016lx", regs[31]);
2519       BREAKPOINT;
2520     }
2521     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2522   } else {
2523     ttyLocker ttyl;
2524     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2525                     msg);
2526     assert(false, "DEBUG MESSAGE: %s", msg);
2527   }
2528 }
2529 
2530 #ifdef BUILTIN_SIM
2531 // routine to generate an x86 prolog for a stub function which
2532 // bootstraps into the generated ARM code which directly follows the
2533 // stub
2534 //
// the argument encodes the number of general and fp registers
// passed by the caller and the calling convention (currently just
// the number of general registers and assumes C argument passing)
2538 
2539 extern "C" {
2540 int aarch64_stub_prolog_size();
2541 void aarch64_stub_prolog();
2542 void aarch64_prolog();
2543 }
2544 
2545 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
2546                                    address *prolog_ptr)
2547 {
2548   int calltype = (((ret_type & 0x3) << 8) |
2549                   ((fp_arg_count & 0xf) << 4) |
2550                   (gp_arg_count & 0xf));
2551 
2552   // the addresses for the x86 to ARM entry code we need to use
2553   address start = pc();
2554   // printf("start = %lx\n", start);
2555   int byteCount =  aarch64_stub_prolog_size();
2556   // printf("byteCount = %x\n", byteCount);
2557   int instructionCount = (byteCount + 3)/ 4;
2558   // printf("instructionCount = %x\n", instructionCount);
2559   for (int i = 0; i < instructionCount; i++) {
2560     nop();
2561   }
2562 
2563   memcpy(start, (void*)aarch64_stub_prolog, byteCount);
2564 
  // write the address of the setup routine and the call format at the
  // end of the copied code
2567   u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
2568   if (prolog_ptr)
2569     patch_end[-2] = (u_int64_t)prolog_ptr;
2570   patch_end[-1] = calltype;
2571 }
2572 #endif
2573 
2574 void MacroAssembler::push_call_clobbered_fp_registers() {
2575   int step = 4 * wordSize;
2576   sub(sp, sp, step);
2577   mov(rscratch1, -step);
2578   // Push v0-v7, v16-v31.
2579   for (int i = 31; i>= 4; i -= 4) {
2580     if (i <= v7->encoding() || i >= v16->encoding())
2581       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2582           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2583   }
2584   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2585       as_FloatRegister(3), T1D, Address(sp));
2586 }
2587 
2588 void MacroAssembler::pop_call_clobbered_fp_registers() {
2589   for (int i = 0; i < 32; i += 4) {
2590     if (i <= v7->encoding() || i >= v16->encoding())
2591       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2592           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2593   }
2594 }
2595 
2596 void MacroAssembler::push_call_clobbered_registers() {
2597   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2598   push_call_clobbered_fp_registers();
2599 }
2600 
2601 void MacroAssembler::pop_call_clobbered_registers() {
2602   pop_call_clobbered_fp_registers();
2603   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2604 }
2605 
2606 void MacroAssembler::push_CPU_state(bool save_vectors) {
2607   int step = (save_vectors ? 8 : 4) * wordSize;
2608   push(0x3fffffff, sp);         // integer registers except lr & sp
2609   mov(rscratch1, -step);
2610   sub(sp, sp, step);
2611   for (int i = 28; i >= 4; i -= 4) {
2612     st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2613         as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2614   }
2615   st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2616 }
2617 
2618 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2619   int step = (restore_vectors ? 8 : 4) * wordSize;
2620   for (int i = 0; i <= 28; i += 4)
2621     ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2622         as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2623   pop(0x3fffffff, sp);         // integer registers except lr & sp
2624 }
2625 
2626 /**
2627  * Helpers for multiply_to_len().
2628  */
2629 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2630                                      Register src1, Register src2) {
2631   adds(dest_lo, dest_lo, src1);
2632   adc(dest_hi, dest_hi, zr);
2633   adds(dest_lo, dest_lo, src2);
2634   adc(final_dest_hi, dest_hi, zr);
2635 }
2636 
2637 // Generate an address from (r + r1 extend offset).  "size" is the
2638 // size of the operand.  The result may be in rscratch2.
2639 Address MacroAssembler::offsetted_address(Register r, Register r1,
2640                                           Address::extend ext, int offset, int size) {
2641   if (offset || (ext.shift() % size != 0)) {
2642     lea(rscratch2, Address(r, r1, ext));
2643     return Address(rscratch2, offset);
2644   } else {
2645     return Address(r, r1, ext);
2646   }
2647 }
2648 
2649 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2650 {
2651   assert(offset >= 0, "spill to negative address?");
2652   // Offset reachable ?
2653   //   Not aligned - 9 bits signed offset
2654   //   Aligned - 12 bits unsigned offset shifted
2655   Register base = sp;
2656   if ((offset & (size-1)) && offset >= (1<<8)) {
2657     add(tmp, base, offset & ((1<<12)-1));
2658     base = tmp;
2659     offset &= -1<<12;
2660   }
2661 
2662   if (offset >= (1<<12) * size) {
2663     add(tmp, base, offset & (((1<<12)-1)<<12));
2664     base = tmp;
2665     offset &= ~(((1<<12)-1)<<12);
2666   }
2667 
2668   return Address(base, offset);
2669 }
2670 
2671 // Checks whether offset is aligned.
2672 // Returns true if it is, else false.
2673 bool MacroAssembler::merge_alignment_check(Register base,
2674                                            size_t size,
2675                                            long cur_offset,
2676                                            long prev_offset) const {
2677   if (AvoidUnalignedAccesses) {
2678     if (base == sp) {
      // Checks whether the low offset is aligned to a pair of registers.
2680       long pair_mask = size * 2 - 1;
2681       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2682       return (offset & pair_mask) == 0;
2683     } else { // If base is not sp, we can't guarantee the access is aligned.
2684       return false;
2685     }
2686   } else {
2687     long mask = size - 1;
2688     // Load/store pair instruction only supports element size aligned offset.
2689     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2690   }
2691 }
2692 
2693 // Checks whether current and previous loads/stores can be merged.
2694 // Returns true if it can be merged, else false.
2695 bool MacroAssembler::ldst_can_merge(Register rt,
2696                                     const Address &adr,
2697                                     size_t cur_size_in_bytes,
2698                                     bool is_store) const {
2699   address prev = pc() - NativeInstruction::instruction_size;
2700   address last = code()->last_insn();
2701 
2702   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2703     return false;
2704   }
2705 
2706   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2707     return false;
2708   }
2709 
2710   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2711   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2712 
2713   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2714   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2715 
2716   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2717     return false;
2718   }
2719 
2720   long max_offset = 63 * prev_size_in_bytes;
2721   long min_offset = -64 * prev_size_in_bytes;
2722 
  assert(prev_ldst->is_not_pre_post_index(), "merging pre-indexed or post-indexed accesses is not supported.");
2724 
2725   // Only same base can be merged.
2726   if (adr.base() != prev_ldst->base()) {
2727     return false;
2728   }
2729 
2730   long cur_offset = adr.offset();
2731   long prev_offset = prev_ldst->offset();
2732   size_t diff = abs(cur_offset - prev_offset);
2733   if (diff != prev_size_in_bytes) {
2734     return false;
2735   }
2736 
  // The following cases cannot be merged:
  // ldr x2, [x2, #8]
  // ldr x3, [x2, #16]
  // or:
  // ldr x2, [x3, #8]
  // ldr x2, [x3, #16]
  // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2744   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2745     return false;
2746   }
2747 
2748   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2749   // Offset range must be in ldp/stp instruction's range.
2750   if (low_offset > max_offset || low_offset < min_offset) {
2751     return false;
2752   }
2753 
2754   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2755     return true;
2756   }
2757 
2758   return false;
2759 }
2760 
2761 // Merge current load/store with previous load/store into ldp/stp.
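// For example, the sequence
//   str x1, [sp, #16]
//   str x2, [sp, #24]
// is rewritten, by truncating the code section at the previous
// instruction and re-emitting, as
//   stp x1, x2, [sp, #16]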
2762 void MacroAssembler::merge_ldst(Register rt,
2763                                 const Address &adr,
2764                                 size_t cur_size_in_bytes,
2765                                 bool is_store) {
2766 
2767   assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "cur and prev must be able to be merged.");
2768 
2769   Register rt_low, rt_high;
2770   address prev = pc() - NativeInstruction::instruction_size;
2771   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2772 
2773   long offset;
2774 
2775   if (adr.offset() < prev_ldst->offset()) {
2776     offset = adr.offset();
2777     rt_low = rt;
2778     rt_high = prev_ldst->target();
2779   } else {
2780     offset = prev_ldst->offset();
2781     rt_low = prev_ldst->target();
2782     rt_high = rt;
2783   }
2784 
2785   Address adr_p = Address(prev_ldst->base(), offset);
2786   // Overwrite the previously generated instruction.
2787   code_section()->set_end(prev);
2788 
2789   const int sz = prev_ldst->size_in_bytes();
2790   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
2791   if (!is_store) {
2792     BLOCK_COMMENT("merged ldr pair");
2793     if (sz == 8) {
2794       ldp(rt_low, rt_high, adr_p);
2795     } else {
2796       ldpw(rt_low, rt_high, adr_p);
2797     }
2798   } else {
2799     BLOCK_COMMENT("merged str pair");
2800     if (sz == 8) {
2801       stp(rt_low, rt_high, adr_p);
2802     } else {
2803       stpw(rt_low, rt_high, adr_p);
2804     }
2805   }
2806 }
2807 
2808 /**
2809  * Multiply 64 bit by 64 bit first loop.
2810  */
2811 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2812                                            Register y, Register y_idx, Register z,
2813                                            Register carry, Register product,
2814                                            Register idx, Register kdx) {
2815   //
2816   //  jlong carry, x[], y[], z[];
2817   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2818   //    huge_128 product = y[idx] * x[xstart] + carry;
2819   //    z[kdx] = (jlong)product;
2820   //    carry  = (jlong)(product >>> 64);
2821   //  }
2822   //  z[xstart] = carry;
2823   //
2824 
2825   Label L_first_loop, L_first_loop_exit;
2826   Label L_one_x, L_one_y, L_multiply;
2827 
2828   subsw(xstart, xstart, 1);
2829   br(Assembler::MI, L_one_x);
2830 
2831   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2832   ldr(x_xstart, Address(rscratch1));
2833   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
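  // (The magnitude array stores the most-significant int first, so a
  // 64-bit load of two adjacent ints yields the halves swapped with
  // respect to their numeric order; ror #32 swaps them back.)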
2834 
2835   bind(L_first_loop);
2836   subsw(idx, idx, 1);
2837   br(Assembler::MI, L_first_loop_exit);
2838   subsw(idx, idx, 1);
2839   br(Assembler::MI, L_one_y);
2840   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2841   ldr(y_idx, Address(rscratch1));
2842   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2843   bind(L_multiply);
2844 
2845   // AArch64 has a multiply-accumulate instruction that we can't use
2846   // here because it has no way to process carries, so we have to use
2847   // separate add and adc instructions.  Bah.
2848   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2849   mul(product, x_xstart, y_idx);
2850   adds(product, product, carry);
2851   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2852 
2853   subw(kdx, kdx, 2);
2854   ror(product, product, 32); // back to big-endian
2855   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2856 
2857   b(L_first_loop);
2858 
2859   bind(L_one_y);
2860   ldrw(y_idx, Address(y,  0));
2861   b(L_multiply);
2862 
2863   bind(L_one_x);
2864   ldrw(x_xstart, Address(x,  0));
2865   b(L_first_loop);
2866 
2867   bind(L_first_loop_exit);
2868 }
2869 
2870 /**
2871  * Multiply 128 bit by 128 bit. Unrolled inner loop.
2872  */
2874 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2875                                              Register carry, Register carry2,
2876                                              Register idx, Register jdx,
2877                                              Register yz_idx1, Register yz_idx2,
2878                                              Register tmp, Register tmp3, Register tmp4,
2879                                              Register tmp6, Register product_hi) {
2880 
2881   //   jlong carry, x[], y[], z[];
2882   //   int kdx = ystart+1;
2883   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2884   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2885   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2886   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2887   //     carry  = (jlong)(tmp4 >>> 64);
2888   //     z[kdx+idx+1] = (jlong)tmp3;
2889   //     z[kdx+idx] = (jlong)tmp4;
2890   //   }
2891   //   idx += 2;
2892   //   if (idx > 0) {
2893   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2894   //     z[kdx+idx] = (jlong)yz_idx1;
2895   //     carry  = (jlong)(yz_idx1 >>> 64);
2896   //   }
2897   //
2898 
2899   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2900 
2901   lsrw(jdx, idx, 2);
2902 
2903   bind(L_third_loop);
2904 
2905   subsw(jdx, jdx, 1);
2906   br(Assembler::MI, L_third_loop_exit);
2907   subw(idx, idx, 4);
2908 
2909   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2910 
2911   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2912 
2913   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2914 
2915   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2916   ror(yz_idx2, yz_idx2, 32);
2917 
2918   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2919 
2920   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2921   umulh(tmp4, product_hi, yz_idx1);
2922 
2923   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2924   ror(rscratch2, rscratch2, 32);
2925 
2926   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2927   umulh(carry2, product_hi, yz_idx2);
2928 
2929   // propagate sum of both multiplications into carry:tmp4:tmp3
2930   adds(tmp3, tmp3, carry);
2931   adc(tmp4, tmp4, zr);
2932   adds(tmp3, tmp3, rscratch1);
2933   adcs(tmp4, tmp4, tmp);
2934   adc(carry, carry2, zr);
2935   adds(tmp4, tmp4, rscratch2);
2936   adc(carry, carry, zr);
2937 
2938   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2939   ror(tmp4, tmp4, 32);
2940   stp(tmp4, tmp3, Address(tmp6, 0));
2941 
2942   b(L_third_loop);
2943   bind (L_third_loop_exit);
2944 
2945   andw (idx, idx, 0x3);
2946   cbz(idx, L_post_third_loop_done);
2947 
2948   Label L_check_1;
2949   subsw(idx, idx, 2);
2950   br(Assembler::MI, L_check_1);
2951 
2952   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2953   ldr(yz_idx1, Address(rscratch1, 0));
2954   ror(yz_idx1, yz_idx1, 32);
2955   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2956   umulh(tmp4, product_hi, yz_idx1);
2957   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2958   ldr(yz_idx2, Address(rscratch1, 0));
2959   ror(yz_idx2, yz_idx2, 32);
2960 
2961   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2962 
2963   ror(tmp3, tmp3, 32);
2964   str(tmp3, Address(rscratch1, 0));
2965 
2966   bind (L_check_1);
2967 
2968   andw (idx, idx, 0x1);
2969   subsw(idx, idx, 1);
2970   br(Assembler::MI, L_post_third_loop_done);
2971   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2972   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
2973   umulh(carry2, tmp4, product_hi);
2974   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2975 
2976   add2_with_carry(carry2, tmp3, tmp4, carry);
2977 
2978   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2979   extr(carry, carry2, tmp3, 32);
2980 
2981   bind(L_post_third_loop_done);
2982 }
2983 
2984 /**
2985  * Code for BigInteger::multiplyToLen() intrinsic.
2986  *
2987  * r0: x
2988  * r1: xlen
2989  * r2: y
2990  * r3: ylen
2991  * r4: z
2992  * r5: zlen
2993  * r10: tmp1
2994  * r11: tmp2
2995  * r12: tmp3
2996  * r13: tmp4
2997  * r14: tmp5
2998  * r15: tmp6
2999  * r16: tmp7
3000  *
3001  */
3002 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3003                                      Register z, Register zlen,
3004                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3005                                      Register tmp5, Register tmp6, Register product_hi) {
3006 
3007   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3008 
3009   const Register idx = tmp1;
3010   const Register kdx = tmp2;
3011   const Register xstart = tmp3;
3012 
3013   const Register y_idx = tmp4;
3014   const Register carry = tmp5;
3015   const Register product  = xlen;
3016   const Register x_xstart = zlen;  // reuse register
3017 
3018   // First Loop.
3019   //
3020   //  final static long LONG_MASK = 0xffffffffL;
3021   //  int xstart = xlen - 1;
3022   //  int ystart = ylen - 1;
3023   //  long carry = 0;
3024   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3025   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3026   //    z[kdx] = (int)product;
3027   //    carry = product >>> 32;
3028   //  }
3029   //  z[xstart] = (int)carry;
3030   //
3031 
3032   movw(idx, ylen);      // idx = ylen;
3033   movw(kdx, zlen);      // kdx = xlen+ylen;
3034   mov(carry, zr);       // carry = 0;
3035 
3036   Label L_done;
3037 
3038   movw(xstart, xlen);
3039   subsw(xstart, xstart, 1);
3040   br(Assembler::MI, L_done);
3041 
3042   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3043 
3044   Label L_second_loop;
3045   cbzw(kdx, L_second_loop);
3046 
3047   Label L_carry;
3048   subw(kdx, kdx, 1);
3049   cbzw(kdx, L_carry);
3050 
3051   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3052   lsr(carry, carry, 32);
3053   subw(kdx, kdx, 1);
3054 
3055   bind(L_carry);
3056   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3057 
3058   // Second and third (nested) loops.
3059   //
3060   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3061   //   carry = 0;
3062   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3063   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3064   //                    (z[k] & LONG_MASK) + carry;
3065   //     z[k] = (int)product;
3066   //     carry = product >>> 32;
3067   //   }
3068   //   z[i] = (int)carry;
3069   // }
3070   //
3071   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3072 
3073   const Register jdx = tmp1;
3074 
3075   bind(L_second_loop);
3076   mov(carry, zr);                // carry = 0;
3077   movw(jdx, ylen);               // j = ystart+1
3078 
3079   subsw(xstart, xstart, 1);      // i = xstart-1;
3080   br(Assembler::MI, L_done);
3081 
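  // Reserve four stack words and spill z now; ylen, x and xstart are
  // spilled just before the call to multiply_128_x_128_loop below, and
  // all four slots are reloaded after it returns.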
3082   str(z, Address(pre(sp, -4 * wordSize)));
3083 
3084   Label L_last_x;
3085   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3086   subsw(xstart, xstart, 1);       // i = xstart-1;
3087   br(Assembler::MI, L_last_x);
3088 
3089   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3090   ldr(product_hi, Address(rscratch1));
3091   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3092 
3093   Label L_third_loop_prologue;
3094   bind(L_third_loop_prologue);
3095 
3096   str(ylen, Address(sp, wordSize));
3097   stp(x, xstart, Address(sp, 2 * wordSize));
3098   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3099                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3100   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3101   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3102 
3103   addw(tmp3, xlen, 1);
3104   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3105   subsw(tmp3, tmp3, 1);
3106   br(Assembler::MI, L_done);
3107 
3108   lsr(carry, carry, 32);
3109   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3110   b(L_second_loop);
3111 
3112   // The following infrequent code path is kept out of the loops.
3113   bind(L_last_x);
3114   ldrw(product_hi, Address(x,  0));
3115   b(L_third_loop_prologue);
3116 
3117   bind(L_done);
3118 }
3119 
3120 // Code for BigInteger::mulAdd intrinsic
3121 // out     = r0
3122 // in      = r1
3123 // offset  = r2  (already out.length-offset)
3124 // len     = r3
3125 // k       = r4
3126 //
3127 // pseudo code from java implementation:
3128 // carry = 0;
3129 // offset = out.length-offset - 1;
3130 // for (int j=len-1; j >= 0; j--) {
3131 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3132 //     out[offset--] = (int)product;
3133 //     carry = product >>> 32;
3134 // }
3135 // return (int)carry;
3136 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3137       Register len, Register k) {
3138     Label LOOP, END;
3139     // pre-loop
3140     cmp(len, zr); // cmp rather than cbz/cbnz: the condition is used twice => fewer branches
3141     csel(out, zr, out, Assembler::EQ);
3142     br(Assembler::EQ, END);
3143     add(in, in, len, LSL, 2); // in[j+1] address
3144     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3145     mov(out, zr); // from here on, out holds the carry
3146     BIND(LOOP);
3147     ldrw(rscratch1, Address(pre(in, -4)));
3148     madd(rscratch1, rscratch1, k, out);
3149     ldrw(rscratch2, Address(pre(offset, -4)));
3150     add(rscratch1, rscratch1, rscratch2);
3151     strw(rscratch1, Address(offset));
3152     lsr(out, rscratch1, 32);
3153     subs(len, len, 1);
3154     br(Assembler::NE, LOOP);
3155     BIND(END);
3156 }
3157 
3158 /**
3159  * Emits code to update CRC-32 with a byte value according to constants in table
3160  *
3161  * @param [in,out]crc   Register containing the crc.
3162  * @param [in]val       Register containing the byte to fold into the CRC.
3163  * @param [in]table     Register containing the table of crc constants.
3164  *
3165  * uint32_t crc;
3166  * val = crc_table[(val ^ crc) & 0xFF];
3167  * crc = val ^ (crc >> 8);
3168  *
3169  */
3170 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3171   eor(val, val, crc);
3172   andr(val, val, 0xff);
3173   ldrw(val, Address(table, val, Address::lsl(2)));
3174   eor(crc, val, crc, Assembler::LSR, 8);
3175 }
3176 
3177 /**
3178  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3179  *
3180  * @param [in,out]crc   Register containing the crc.
3181  * @param [in]v         Register containing the 32-bit value to fold into the CRC.
3182  * @param [in]table0    Register containing table 0 of crc constants.
3183  * @param [in]table1    Register containing table 1 of crc constants.
3184  * @param [in]table2    Register containing table 2 of crc constants.
3185  * @param [in]table3    Register containing table 3 of crc constants.
3186  *
3187  * uint32_t crc;
3188  *   v = crc ^ v
3189  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3190  *
3191  */
3192 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3193         Register table0, Register table1, Register table2, Register table3,
3194         bool upper) {
3195   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3196   uxtb(tmp, v);
3197   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3198   ubfx(tmp, v, 8, 8);
3199   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3200   eor(crc, crc, tmp);
3201   ubfx(tmp, v, 16, 8);
3202   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3203   eor(crc, crc, tmp);
3204   ubfx(tmp, v, 24, 8);
3205   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3206   eor(crc, crc, tmp);
3207 }
3208 
3209 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3210         Register len, Register tmp0, Register tmp1, Register tmp2,
3211         Register tmp3) {
3212     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3213     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3214 
3215     mvnw(crc, crc);
3216 
3217     subs(len, len, 128);
3218     br(Assembler::GE, CRC_by64_pre);
3219   BIND(CRC_less64);
3220     adds(len, len, 128-32);
3221     br(Assembler::GE, CRC_by32_loop);
3222   BIND(CRC_less32);
3223     adds(len, len, 32-4);
3224     br(Assembler::GE, CRC_by4_loop);
3225     adds(len, len, 4);
3226     br(Assembler::GT, CRC_by1_loop);
3227     b(L_exit);
3228 
3229   BIND(CRC_by32_loop);
3230     ldp(tmp0, tmp1, Address(post(buf, 16)));
3231     subs(len, len, 32);
3232     crc32x(crc, crc, tmp0);
3233     ldr(tmp2, Address(post(buf, 8)));
3234     crc32x(crc, crc, tmp1);
3235     ldr(tmp3, Address(post(buf, 8)));
3236     crc32x(crc, crc, tmp2);
3237     crc32x(crc, crc, tmp3);
3238     br(Assembler::GE, CRC_by32_loop);
3239     cmn(len, 32);
3240     br(Assembler::NE, CRC_less32);
3241     b(L_exit);
3242 
3243   BIND(CRC_by4_loop);
3244     ldrw(tmp0, Address(post(buf, 4)));
3245     subs(len, len, 4);
3246     crc32w(crc, crc, tmp0);
3247     br(Assembler::GE, CRC_by4_loop);
3248     adds(len, len, 4);
3249     br(Assembler::LE, L_exit);
3250   BIND(CRC_by1_loop);
3251     ldrb(tmp0, Address(post(buf, 1)));
3252     subs(len, len, 1);
3253     crc32b(crc, crc, tmp0);
3254     br(Assembler::GT, CRC_by1_loop);
3255     b(L_exit);
3256 
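    // The by-64 path is software-pipelined: CRC_by64_pre issues the first
    // iteration's loads up front, and in CRC_by64_loop each crc32x is
    // interleaved with a load whose result is consumed later, hiding
    // load-use latency.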
3257   BIND(CRC_by64_pre);
3258     sub(buf, buf, 8);
3259     ldp(tmp0, tmp1, Address(buf, 8));
3260     crc32x(crc, crc, tmp0);
3261     ldr(tmp2, Address(buf, 24));
3262     crc32x(crc, crc, tmp1);
3263     ldr(tmp3, Address(buf, 32));
3264     crc32x(crc, crc, tmp2);
3265     ldr(tmp0, Address(buf, 40));
3266     crc32x(crc, crc, tmp3);
3267     ldr(tmp1, Address(buf, 48));
3268     crc32x(crc, crc, tmp0);
3269     ldr(tmp2, Address(buf, 56));
3270     crc32x(crc, crc, tmp1);
3271     ldr(tmp3, Address(pre(buf, 64)));
3272 
3273     b(CRC_by64_loop);
3274 
3275     align(CodeEntryAlignment);
3276   BIND(CRC_by64_loop);
3277     subs(len, len, 64);
3278     crc32x(crc, crc, tmp2);
3279     ldr(tmp0, Address(buf, 8));
3280     crc32x(crc, crc, tmp3);
3281     ldr(tmp1, Address(buf, 16));
3282     crc32x(crc, crc, tmp0);
3283     ldr(tmp2, Address(buf, 24));
3284     crc32x(crc, crc, tmp1);
3285     ldr(tmp3, Address(buf, 32));
3286     crc32x(crc, crc, tmp2);
3287     ldr(tmp0, Address(buf, 40));
3288     crc32x(crc, crc, tmp3);
3289     ldr(tmp1, Address(buf, 48));
3290     crc32x(crc, crc, tmp0);
3291     ldr(tmp2, Address(buf, 56));
3292     crc32x(crc, crc, tmp1);
3293     ldr(tmp3, Address(pre(buf, 64)));
3294     br(Assembler::GE, CRC_by64_loop);
3295 
3296     // post-loop
3297     crc32x(crc, crc, tmp2);
3298     crc32x(crc, crc, tmp3);
3299 
3300     sub(len, len, 64);
3301     add(buf, buf, 8);
3302     cmn(len, 128);
3303     br(Assembler::NE, CRC_less64);
3304   BIND(L_exit);
3305     mvnw(crc, crc);
3306 }
3307 
3308 /**
3309  * @param crc   register containing existing CRC (32-bit)
3310  * @param buf   register pointing to input byte buffer (byte*)
3311  * @param len   register containing number of bytes
3312  * @param table0..table3 registers that will contain the addresses of the CRC tables
3313  * @param tmp, tmp2, tmp3  scratch registers
3314  */
3315 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3316         Register table0, Register table1, Register table2, Register table3,
3317         Register tmp, Register tmp2, Register tmp3) {
3318   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3319   unsigned long offset;
3320 
3321   if (UseCRC32) {
3322       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3323       return;
3324   }
3325 
3326     mvnw(crc, crc);
3327 
3328     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3329     if (offset) add(table0, table0, offset);
3330     add(table1, table0, 1*256*sizeof(juint));
3331     add(table2, table0, 2*256*sizeof(juint));
3332     add(table3, table0, 3*256*sizeof(juint));
3333 
3334   if (UseNeon) {
3335       cmp(len, (u1)64);
3336       br(Assembler::LT, L_by16);
3337       eor(v16, T16B, v16, v16);
3338 
3339     Label L_fold;
3340 
3341       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3342 
3343       ld1(v0, v1, T2D, post(buf, 32));
3344       ld1r(v4, T2D, post(tmp, 8));
3345       ld1r(v5, T2D, post(tmp, 8));
3346       ld1r(v6, T2D, post(tmp, 8));
3347       ld1r(v7, T2D, post(tmp, 8));
3348       mov(v16, T4S, 0, crc);
3349 
3350       eor(v0, T16B, v0, v16);
3351       sub(len, len, 64);
3352 
3353     BIND(L_fold);
3354       pmull(v22, T8H, v0, v5, T8B);
3355       pmull(v20, T8H, v0, v7, T8B);
3356       pmull(v23, T8H, v0, v4, T8B);
3357       pmull(v21, T8H, v0, v6, T8B);
3358 
3359       pmull2(v18, T8H, v0, v5, T16B);
3360       pmull2(v16, T8H, v0, v7, T16B);
3361       pmull2(v19, T8H, v0, v4, T16B);
3362       pmull2(v17, T8H, v0, v6, T16B);
3363 
3364       uzp1(v24, T8H, v20, v22);
3365       uzp2(v25, T8H, v20, v22);
3366       eor(v20, T16B, v24, v25);
3367 
3368       uzp1(v26, T8H, v16, v18);
3369       uzp2(v27, T8H, v16, v18);
3370       eor(v16, T16B, v26, v27);
3371 
3372       ushll2(v22, T4S, v20, T8H, 8);
3373       ushll(v20, T4S, v20, T4H, 8);
3374 
3375       ushll2(v18, T4S, v16, T8H, 8);
3376       ushll(v16, T4S, v16, T4H, 8);
3377 
3378       eor(v22, T16B, v23, v22);
3379       eor(v18, T16B, v19, v18);
3380       eor(v20, T16B, v21, v20);
3381       eor(v16, T16B, v17, v16);
3382 
3383       uzp1(v17, T2D, v16, v20);
3384       uzp2(v21, T2D, v16, v20);
3385       eor(v17, T16B, v17, v21);
3386 
3387       ushll2(v20, T2D, v17, T4S, 16);
3388       ushll(v16, T2D, v17, T2S, 16);
3389 
3390       eor(v20, T16B, v20, v22);
3391       eor(v16, T16B, v16, v18);
3392 
3393       uzp1(v17, T2D, v20, v16);
3394       uzp2(v21, T2D, v20, v16);
3395       eor(v28, T16B, v17, v21);
3396 
3397       pmull(v22, T8H, v1, v5, T8B);
3398       pmull(v20, T8H, v1, v7, T8B);
3399       pmull(v23, T8H, v1, v4, T8B);
3400       pmull(v21, T8H, v1, v6, T8B);
3401 
3402       pmull2(v18, T8H, v1, v5, T16B);
3403       pmull2(v16, T8H, v1, v7, T16B);
3404       pmull2(v19, T8H, v1, v4, T16B);
3405       pmull2(v17, T8H, v1, v6, T16B);
3406 
3407       ld1(v0, v1, T2D, post(buf, 32));
3408 
3409       uzp1(v24, T8H, v20, v22);
3410       uzp2(v25, T8H, v20, v22);
3411       eor(v20, T16B, v24, v25);
3412 
3413       uzp1(v26, T8H, v16, v18);
3414       uzp2(v27, T8H, v16, v18);
3415       eor(v16, T16B, v26, v27);
3416 
3417       ushll2(v22, T4S, v20, T8H, 8);
3418       ushll(v20, T4S, v20, T4H, 8);
3419 
3420       ushll2(v18, T4S, v16, T8H, 8);
3421       ushll(v16, T4S, v16, T4H, 8);
3422 
3423       eor(v22, T16B, v23, v22);
3424       eor(v18, T16B, v19, v18);
3425       eor(v20, T16B, v21, v20);
3426       eor(v16, T16B, v17, v16);
3427 
3428       uzp1(v17, T2D, v16, v20);
3429       uzp2(v21, T2D, v16, v20);
3430       eor(v16, T16B, v17, v21);
3431 
3432       ushll2(v20, T2D, v16, T4S, 16);
3433       ushll(v16, T2D, v16, T2S, 16);
3434 
3435       eor(v20, T16B, v22, v20);
3436       eor(v16, T16B, v16, v18);
3437 
3438       uzp1(v17, T2D, v20, v16);
3439       uzp2(v21, T2D, v20, v16);
3440       eor(v20, T16B, v17, v21);
3441 
3442       shl(v16, T2D, v28, 1);
3443       shl(v17, T2D, v20, 1);
3444 
3445       eor(v0, T16B, v0, v16);
3446       eor(v1, T16B, v1, v17);
3447 
3448       subs(len, len, 32);
3449       br(Assembler::GE, L_fold);
3450 
3451       mov(crc, 0);
3452       mov(tmp, v0, T1D, 0);
3453       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3454       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3455       mov(tmp, v0, T1D, 1);
3456       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3457       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3458       mov(tmp, v1, T1D, 0);
3459       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3460       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3461       mov(tmp, v1, T1D, 1);
3462       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3463       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3464 
3465       add(len, len, 32);
3466   }
3467 
3468   BIND(L_by16);
3469     subs(len, len, 16);
3470     br(Assembler::GE, L_by16_loop);
3471     adds(len, len, 16-4);
3472     br(Assembler::GE, L_by4_loop);
3473     adds(len, len, 4);
3474     br(Assembler::GT, L_by1_loop);
3475     b(L_exit);
3476 
3477   BIND(L_by4_loop);
3478     ldrw(tmp, Address(post(buf, 4)));
3479     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3480     subs(len, len, 4);
3481     br(Assembler::GE, L_by4_loop);
3482     adds(len, len, 4);
3483     br(Assembler::LE, L_exit);
3484   BIND(L_by1_loop);
3485     subs(len, len, 1);
3486     ldrb(tmp, Address(post(buf, 1)));
3487     update_byte_crc32(crc, tmp, table0);
3488     br(Assembler::GT, L_by1_loop);
3489     b(L_exit);
3490 
3491     align(CodeEntryAlignment);
3492   BIND(L_by16_loop);
3493     subs(len, len, 16);
3494     ldp(tmp, tmp3, Address(post(buf, 16)));
3495     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3496     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3497     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3498     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3499     br(Assembler::GE, L_by16_loop);
3500     adds(len, len, 16-4);
3501     br(Assembler::GE, L_by4_loop);
3502     adds(len, len, 4);
3503     br(Assembler::GT, L_by1_loop);
3504   BIND(L_exit);
3505     mvnw(crc, crc);
3506 }
3507 
3508 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3509         Register len, Register tmp0, Register tmp1, Register tmp2,
3510         Register tmp3) {
3511     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3512     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
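    // Same structure as kernel_crc32_using_crc32 above, but using the
    // CRC32C (Castagnoli) instructions and without the initial/final
    // bit inversion.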
3513 
3514     subs(len, len, 128);
3515     br(Assembler::GE, CRC_by64_pre);
3516   BIND(CRC_less64);
3517     adds(len, len, 128-32);
3518     br(Assembler::GE, CRC_by32_loop);
3519   BIND(CRC_less32);
3520     adds(len, len, 32-4);
3521     br(Assembler::GE, CRC_by4_loop);
3522     adds(len, len, 4);
3523     br(Assembler::GT, CRC_by1_loop);
3524     b(L_exit);
3525 
3526   BIND(CRC_by32_loop);
3527     ldp(tmp0, tmp1, Address(post(buf, 16)));
3528     subs(len, len, 32);
3529     crc32cx(crc, crc, tmp0);
3530     ldr(tmp2, Address(post(buf, 8)));
3531     crc32cx(crc, crc, tmp1);
3532     ldr(tmp3, Address(post(buf, 8)));
3533     crc32cx(crc, crc, tmp2);
3534     crc32cx(crc, crc, tmp3);
3535     br(Assembler::GE, CRC_by32_loop);
3536     cmn(len, 32);
3537     br(Assembler::NE, CRC_less32);
3538     b(L_exit);
3539 
3540   BIND(CRC_by4_loop);
3541     ldrw(tmp0, Address(post(buf, 4)));
3542     subs(len, len, 4);
3543     crc32cw(crc, crc, tmp0);
3544     br(Assembler::GE, CRC_by4_loop);
3545     adds(len, len, 4);
3546     br(Assembler::LE, L_exit);
3547   BIND(CRC_by1_loop);
3548     ldrb(tmp0, Address(post(buf, 1)));
3549     subs(len, len, 1);
3550     crc32cb(crc, crc, tmp0);
3551     br(Assembler::GT, CRC_by1_loop);
3552     b(L_exit);
3553 
3554   BIND(CRC_by64_pre);
3555     sub(buf, buf, 8);
3556     ldp(tmp0, tmp1, Address(buf, 8));
3557     crc32cx(crc, crc, tmp0);
3558     ldr(tmp2, Address(buf, 24));
3559     crc32cx(crc, crc, tmp1);
3560     ldr(tmp3, Address(buf, 32));
3561     crc32cx(crc, crc, tmp2);
3562     ldr(tmp0, Address(buf, 40));
3563     crc32cx(crc, crc, tmp3);
3564     ldr(tmp1, Address(buf, 48));
3565     crc32cx(crc, crc, tmp0);
3566     ldr(tmp2, Address(buf, 56));
3567     crc32cx(crc, crc, tmp1);
3568     ldr(tmp3, Address(pre(buf, 64)));
3569 
3570     b(CRC_by64_loop);
3571 
3572     align(CodeEntryAlignment);
3573   BIND(CRC_by64_loop);
3574     subs(len, len, 64);
3575     crc32cx(crc, crc, tmp2);
3576     ldr(tmp0, Address(buf, 8));
3577     crc32cx(crc, crc, tmp3);
3578     ldr(tmp1, Address(buf, 16));
3579     crc32cx(crc, crc, tmp0);
3580     ldr(tmp2, Address(buf, 24));
3581     crc32cx(crc, crc, tmp1);
3582     ldr(tmp3, Address(buf, 32));
3583     crc32cx(crc, crc, tmp2);
3584     ldr(tmp0, Address(buf, 40));
3585     crc32cx(crc, crc, tmp3);
3586     ldr(tmp1, Address(buf, 48));
3587     crc32cx(crc, crc, tmp0);
3588     ldr(tmp2, Address(buf, 56));
3589     crc32cx(crc, crc, tmp1);
3590     ldr(tmp3, Address(pre(buf, 64)));
3591     br(Assembler::GE, CRC_by64_loop);
3592 
3593     // post-loop
3594     crc32cx(crc, crc, tmp2);
3595     crc32cx(crc, crc, tmp3);
3596 
3597     sub(len, len, 64);
3598     add(buf, buf, 8);
3599     cmn(len, 128);
3600     br(Assembler::NE, CRC_less64);
3601   BIND(L_exit);
3602 }
3603 
3604 /**
3605  * @param crc   register containing existing CRC (32-bit)
3606  * @param buf   register pointing to input byte buffer (byte*)
3607  * @param len   register containing number of bytes
3608  * @param table0..table3 registers that will contain the addresses of the CRC tables
3609  * @param tmp, tmp2, tmp3  scratch registers
3610  */
3611 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3612         Register table0, Register table1, Register table2, Register table3,
3613         Register tmp, Register tmp2, Register tmp3) {
3614   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3615 }
3616 
3617 
3618 SkipIfEqual::SkipIfEqual(
3619     MacroAssembler* masm, const bool* flag_addr, bool value) {
3620   _masm = masm;
3621   unsigned long offset;
3622   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3623   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3624   _masm->cbzw(rscratch1, _label);
3625 }
3626 
3627 SkipIfEqual::~SkipIfEqual() {
3628   _masm->bind(_label);
3629 }
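
// Usage sketch (RAII; the flag shown is only illustrative):
//
//   { SkipIfEqual skip(masm, &DTraceAllocProbes, false);
//     ... emit probe code ...
//   }   // the destructor binds the skip label here
//
// The code emitted inside the scope is branched over at run time when
// the watched flag byte is zero.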
3630 
3631 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3632   Address adr;
3633   switch(dst.getMode()) {
3634   case Address::base_plus_offset:
3635     // This is the expected mode, although we allow all the other
3636     // forms below.
3637     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3638     break;
3639   default:
3640     lea(rscratch2, dst);
3641     adr = Address(rscratch2);
3642     break;
3643   }
3644   ldr(rscratch1, adr);
3645   add(rscratch1, rscratch1, src);
3646   str(rscratch1, adr);
3647 }
3648 
3649 void MacroAssembler::cmpptr(Register src1, Address src2) {
3650   unsigned long offset;
3651   adrp(rscratch1, src2, offset);
3652   ldr(rscratch1, Address(rscratch1, offset));
3653   cmp(src1, rscratch1);
3654 }
3655 
3656 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3657   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3658   bs->obj_equals(this, obj1, obj2);
3659 }
3660 
3661 void MacroAssembler::load_klass(Register dst, Register src) {
3662   if (UseCompressedClassPointers) {
3663     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3664     decode_klass_not_null(dst);
3665   } else {
3666     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3667   }
3668 }
3669 
3670 // ((OopHandle)result).resolve();
3671 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3672   // OopHandle::resolve is an indirection.
3673   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3674 }
3675 
3676 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3677   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3678   ldr(dst, Address(rmethod, Method::const_offset()));
3679   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3680   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3681   ldr(dst, Address(dst, mirror_offset));
3682   resolve_oop_handle(dst, tmp);
3683 }
3684 
3685 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3686   if (UseCompressedClassPointers) {
3687     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3688     if (Universe::narrow_klass_base() == NULL) {
3689       cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
3690       return;
3691     } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3692                && Universe::narrow_klass_shift() == 0) {
3693       // Only the bottom 32 bits matter
3694       cmpw(trial_klass, tmp);
3695       return;
3696     }
3697     decode_klass_not_null(tmp);
3698   } else {
3699     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3700   }
3701   cmp(trial_klass, tmp);
3702 }
3703 
3704 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3705   load_klass(dst, src);
3706   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3707 }
3708 
3709 void MacroAssembler::store_klass(Register dst, Register src) {
3710   // FIXME: Should this be a store release?  Concurrent GCs assume the
3711   // klass length is valid if the klass field is not null.
3712   if (UseCompressedClassPointers) {
3713     encode_klass_not_null(src);
3714     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3715   } else {
3716     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3717   }
3718 }
3719 
3720 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3721   if (UseCompressedClassPointers) {
3722     // Store to klass gap in destination
3723     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3724   }
3725 }
3726 
3727 // Algorithm must match CompressedOops::encode.
3728 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3729 #ifdef ASSERT
3730   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3731 #endif
3732   verify_oop(s, "broken oop in encode_heap_oop");
3733   if (Universe::narrow_oop_base() == NULL) {
3734     if (Universe::narrow_oop_shift() != 0) {
3735       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3736       lsr(d, s, LogMinObjAlignmentInBytes);
3737     } else {
3738       mov(d, s);
3739     }
3740   } else {
3741     subs(d, s, rheapbase);
3742     csel(d, d, zr, Assembler::HS);
3743     lsr(d, d, LogMinObjAlignmentInBytes);
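    // The subs/csel pair handles a NULL oop without a branch: NULL is
    // below the heap base, so the subtraction borrows, HS is false, and
    // csel selects zr, encoding NULL as 0.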
3744 
3745     /*  Old algorithm: is this any worse?
3746     Label nonnull;
3747     cbnz(r, nonnull);
3748     sub(r, r, rheapbase);
3749     bind(nonnull);
3750     lsr(r, r, LogMinObjAlignmentInBytes);
3751     */
3752   }
3753 }
3754 
3755 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3756 #ifdef ASSERT
3757   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3758   if (CheckCompressedOops) {
3759     Label ok;
3760     cbnz(r, ok);
3761     stop("null oop passed to encode_heap_oop_not_null");
3762     bind(ok);
3763   }
3764 #endif
3765   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3766   if (Universe::narrow_oop_base() != NULL) {
3767     sub(r, r, rheapbase);
3768   }
3769   if (Universe::narrow_oop_shift() != 0) {
3770     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3771     lsr(r, r, LogMinObjAlignmentInBytes);
3772   }
3773 }
3774 
3775 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3776 #ifdef ASSERT
3777   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3778   if (CheckCompressedOops) {
3779     Label ok;
3780     cbnz(src, ok);
3781     stop("null oop passed to encode_heap_oop_not_null2");
3782     bind(ok);
3783   }
3784 #endif
3785   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3786 
3787   Register data = src;
3788   if (Universe::narrow_oop_base() != NULL) {
3789     sub(dst, src, rheapbase);
3790     data = dst;
3791   }
3792   if (Universe::narrow_oop_shift() != 0) {
3793     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3794     lsr(dst, data, LogMinObjAlignmentInBytes);
3795     data = dst;
3796   }
3797   if (data == src)
3798     mov(dst, src);
3799 }
3800 
3801 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3802 #ifdef ASSERT
3803   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3804 #endif
3805   if (Universe::narrow_oop_base() == NULL) {
3806     if (Universe::narrow_oop_shift() != 0 || d != s) {
3807       lsl(d, s, Universe::narrow_oop_shift());
3808     }
3809   } else {
3810     Label done;
3811     if (d != s)
3812       mov(d, s);
3813     cbz(s, done);
3814     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3815     bind(done);
3816   }
3817   verify_oop(d, "broken oop in decode_heap_oop");
3818 }
3819 
3820 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3821   assert (UseCompressedOops, "should only be used for compressed headers");
3822   assert (Universe::heap() != NULL, "java heap should be initialized");
3823   // Cannot assert, unverified entry point counts instructions (see .ad file)
3824   // vtableStubs also counts instructions in pd_code_size_limit.
3825   // Also do not verify_oop as this is called by verify_oop.
3826   if (Universe::narrow_oop_shift() != 0) {
3827     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3828     if (Universe::narrow_oop_base() != NULL) {
3829       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3830     } else {
3831       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3832     }
3833   } else {
3834     assert (Universe::narrow_oop_base() == NULL, "sanity");
3835   }
3836 }
3837 
3838 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3839   assert (UseCompressedOops, "should only be used for compressed headers");
3840   assert (Universe::heap() != NULL, "java heap should be initialized");
3841   // Cannot assert, unverified entry point counts instructions (see .ad file)
3842   // vtableStubs also counts instructions in pd_code_size_limit.
3843   // Also do not verify_oop as this is called by verify_oop.
3844   if (Universe::narrow_oop_shift() != 0) {
3845     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3846     if (Universe::narrow_oop_base() != NULL) {
3847       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3848     } else {
3849       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3850     }
3851   } else {
3852     assert (Universe::narrow_oop_base() == NULL, "sanity");
3853     if (dst != src) {
3854       mov(dst, src);
3855     }
3856   }
3857 }
3858 
3859 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3860   if (Universe::narrow_klass_base() == NULL) {
3861     if (Universe::narrow_klass_shift() != 0) {
3862       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3863       lsr(dst, src, LogKlassAlignmentInBytes);
3864     } else {
3865       if (dst != src) mov(dst, src);
3866     }
3867     return;
3868   }
3869 
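  // If the narrow-klass base can be encoded as a logical immediate and its
  // set bits lie above every offset within the class space (e.g. a high,
  // aligned base such as 0x800000000), then xor-with-base equals
  // subtract-base, and no scratch register is needed for the constant.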
3870   if (use_XOR_for_compressed_class_base) {
3871     if (Universe::narrow_klass_shift() != 0) {
3872       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3873       lsr(dst, dst, LogKlassAlignmentInBytes);
3874     } else {
3875       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3876     }
3877     return;
3878   }
3879 
3880   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3881       && Universe::narrow_klass_shift() == 0) {
3882     movw(dst, src);
3883     return;
3884   }
3885 
3886 #ifdef ASSERT
3887   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3888 #endif
3889 
3890   Register rbase = dst;
3891   if (dst == src) rbase = rheapbase;
3892   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3893   sub(dst, src, rbase);
3894   if (Universe::narrow_klass_shift() != 0) {
3895     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3896     lsr(dst, dst, LogKlassAlignmentInBytes);
3897   }
3898   if (dst == src) reinit_heapbase();
3899 }
3900 
3901 void MacroAssembler::encode_klass_not_null(Register r) {
3902   encode_klass_not_null(r, r);
3903 }
3904 
3905 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3906   Register rbase = dst;
3907   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3908 
3909   if (Universe::narrow_klass_base() == NULL) {
3910     if (Universe::narrow_klass_shift() != 0) {
3911       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3912       lsl(dst, src, LogKlassAlignmentInBytes);
3913     } else {
3914       if (dst != src) mov(dst, src);
3915     }
3916     return;
3917   }
3918 
3919   if (use_XOR_for_compressed_class_base) {
3920     if (Universe::narrow_klass_shift() != 0) {
3921       lsl(dst, src, LogKlassAlignmentInBytes);
3922       eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
3923     } else {
3924       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3925     }
3926     return;
3927   }
3928 
3929   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3930       && Universe::narrow_klass_shift() == 0) {
3931     if (dst != src)
3932       movw(dst, src);
3933     movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
3934     return;
3935   }
3936 
3937   // Cannot assert, unverified entry point counts instructions (see .ad file)
3938   // vtableStubs also counts instructions in pd_code_size_limit.
3939   // Also do not verify_oop as this is called by verify_oop.
3940   if (dst == src) rbase = rheapbase;
3941   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3942   if (Universe::narrow_klass_shift() != 0) {
3943     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3944     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
3945   } else {
3946     add(dst, rbase, src);
3947   }
3948   if (dst == src) reinit_heapbase();
3949 }
3950 
3951 void  MacroAssembler::decode_klass_not_null(Register r) {
3952   decode_klass_not_null(r, r);
3953 }
3954 
3955 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3956 #ifdef ASSERT
3957   {
3958     ThreadInVMfromUnknown tiv;
3959     assert (UseCompressedOops, "should only be used for compressed oops");
3960     assert (Universe::heap() != NULL, "java heap should be initialized");
3961     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3962     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3963   }
3964 #endif
3965   int oop_index = oop_recorder()->find_index(obj);
3966   InstructionMark im(this);
3967   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3968   code_section()->relocate(inst_mark(), rspec);
3969   movz(dst, 0xDEAD, 16);
3970   movk(dst, 0xBEEF);
3971 }
3972 
3973 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3974   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3975   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3976   int index = oop_recorder()->find_index(k);
3977   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
3978 
3979   InstructionMark im(this);
3980   RelocationHolder rspec = metadata_Relocation::spec(index);
3981   code_section()->relocate(inst_mark(), rspec);
3982   narrowKlass nk = Klass::encode_klass(k);
3983   movz(dst, (nk >> 16), 16);
3984   movk(dst, nk & 0xffff);
3985 }
3986 
3987 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
3988                                     Register dst, Address src,
3989                                     Register tmp1, Register thread_tmp) {
3990   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
3991   decorators = AccessInternal::decorator_fixup(decorators);
3992   bool as_raw = (decorators & AS_RAW) != 0;
3993   if (as_raw) {
3994     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3995   } else {
3996     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
3997   }
3998 }
3999 
4000 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4001                                      Address dst, Register src,
4002                                      Register tmp1, Register thread_tmp) {
4003   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4004   decorators = AccessInternal::decorator_fixup(decorators);
4005   bool as_raw = (decorators & AS_RAW) != 0;
4006   if (as_raw) {
4007     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4008   } else {
4009     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4010   }
4011 }
4012 
4013 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
4014   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
4015   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
4016     decorators |= ACCESS_READ | ACCESS_WRITE;
4017   }
4018   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4019   return bs->resolve(this, decorators, obj);
4020 }
4021 
4022 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4023                                    Register thread_tmp, DecoratorSet decorators) {
4024   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4025 }
4026 
4027 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4028                                             Register thread_tmp, DecoratorSet decorators) {
4029   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4030 }
4031 
4032 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4033                                     Register thread_tmp, DecoratorSet decorators) {
4034   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4035 }
4036 
4037 // Used for storing NULLs.
4038 void MacroAssembler::store_heap_oop_null(Address dst) {
4039   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4040 }
4041 
4042 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4043   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4044   int index = oop_recorder()->allocate_metadata_index(obj);
4045   RelocationHolder rspec = metadata_Relocation::spec(index);
4046   return Address((address)obj, rspec);
4047 }
4048 
4049 // Move an oop into a register.  immediate is true if we want
4050 // immediate instructions, i.e. we are not going to patch this
4051 // instruction while the code is being executed by another thread.  In
4052 // that case we can use move immediates rather than the constant pool.
4053 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4054   int oop_index;
4055   if (obj == NULL) {
4056     oop_index = oop_recorder()->allocate_oop_index(obj);
4057   } else {
4058 #ifdef ASSERT
4059     {
4060       ThreadInVMfromUnknown tiv;
4061       assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
4062     }
4063 #endif
4064     oop_index = oop_recorder()->find_index(obj);
4065   }
4066   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4067   if (! immediate) {
4068     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4069     ldr_constant(dst, Address(dummy, rspec));
4070   } else
4071     mov(dst, Address((address)obj, rspec));
4072 }
4073 
4074 // Move a metadata address into a register.
4075 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4076   int oop_index;
4077   if (obj == NULL) {
4078     oop_index = oop_recorder()->allocate_metadata_index(obj);
4079   } else {
4080     oop_index = oop_recorder()->find_index(obj);
4081   }
4082   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4083   mov(dst, Address((address)obj, rspec));
4084 }
4085 
4086 Address MacroAssembler::constant_oop_address(jobject obj) {
4087 #ifdef ASSERT
4088   {
4089     ThreadInVMfromUnknown tiv;
4090     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4091     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
4092   }
4093 #endif
4094   int oop_index = oop_recorder()->find_index(obj);
4095   return Address((address)obj, oop_Relocation::spec(oop_index));
4096 }
4097 
4098 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4099 void MacroAssembler::tlab_allocate(Register obj,
4100                                    Register var_size_in_bytes,
4101                                    int con_size_in_bytes,
4102                                    Register t1,
4103                                    Register t2,
4104                                    Label& slow_case) {
4105   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4106   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4107 }
4108 
4109 // Defines obj, preserves var_size_in_bytes
4110 void MacroAssembler::eden_allocate(Register obj,
4111                                    Register var_size_in_bytes,
4112                                    int con_size_in_bytes,
4113                                    Register t1,
4114                                    Label& slow_case) {
4115   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4116   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4117 }
4118 
4119 // Zero words; len is in bytes
4120 // Destroys all registers except addr
4121 // len must be a nonzero multiple of wordSize
4122 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4123   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4124 
4125 #ifdef ASSERT
4126   { Label L;
4127     tst(len, BytesPerWord - 1);
4128     br(Assembler::EQ, L);
4129     stop("len is not a multiple of BytesPerWord");
4130     bind(L);
4131   }
4132 #endif
4133 
4134 #ifndef PRODUCT
4135   block_comment("zero memory");
4136 #endif
4137 
4138   Label loop;
4139   Label entry;
4140 
4141 //  Algorithm:
4142 //
4143 //    scratch1 = cnt & 7;
4144 //    cnt -= scratch1;
4145 //    p += scratch1;
4146 //    switch (scratch1) {
4147 //      do {
4148 //        cnt -= 8;
4149 //          p[-8] = 0;
4150 //        case 7:
4151 //          p[-7] = 0;
4152 //        case 6:
4153 //          p[-6] = 0;
4154 //          // ...
4155 //        case 1:
4156 //          p[-1] = 0;
4157 //        case 0:
4158 //          p += 8;
4159 //      } while (cnt);
4160 //    }
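//
//    For example, with 11 words to zero: rscratch1 == 3, the computed
//    branch lands three str instructions before 'entry', the first
//    (partial) pass zeroes words 0..2, and a single full pass of eight
//    stores zeroes words 3..10.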
4161 
4162   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4163 
4164   lsr(len, len, LogBytesPerWord);
4165   andr(rscratch1, len, unroll - 1);  // tmp1 = cnt % unroll
4166   sub(len, len, rscratch1);      // cnt -= unroll
4167   // t1 always points to the end of the region we're about to zero
4168   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
4169   adr(rscratch2, entry);
4170   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4171   br(rscratch2);
4172   bind(loop);
4173   sub(len, len, unroll);
4174   for (int i = -unroll; i < 0; i++)
4175     Assembler::str(zr, Address(t1, i * wordSize));
4176   bind(entry);
4177   add(t1, t1, unroll * wordSize);
4178   cbnz(len, loop);
4179 }
4180 
4181 void MacroAssembler::verify_tlab() {
4182 #ifdef ASSERT
4183   if (UseTLAB && VerifyOops) {
4184     Label next, ok;
4185 
4186     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4187 
4188     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4189     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4190     cmp(rscratch2, rscratch1);
4191     br(Assembler::HS, next);
4192     STOP("assert(top >= start)");
4193     should_not_reach_here();
4194 
4195     bind(next);
4196     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4197     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4198     cmp(rscratch2, rscratch1);
4199     br(Assembler::HS, ok);
4200     STOP("assert(top <= end)");
4201     should_not_reach_here();
4202 
4203     bind(ok);
4204     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4205   }
4206 #endif
4207 }
4208 
4209 // Writes to successive stack pages, until the given size is reached, to
4210 // check for stack overflow + shadow pages.  This clobbers tmp.
4211 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4212   assert_different_registers(tmp, size, rscratch1);
4213   mov(tmp, sp);
4214   // Bang stack for total size given plus shadow page size.
4215   // Bang one page at a time because large size can bang beyond yellow and
4216   // red zones.
4217   Label loop;
4218   mov(rscratch1, os::vm_page_size());
4219   bind(loop);
4220   lea(tmp, Address(tmp, -os::vm_page_size()));
4221   subsw(size, size, rscratch1);
4222   str(size, Address(tmp));
4223   br(Assembler::GT, loop);
4224 
4225   // Bang down shadow pages too.
4226   // At this point, (tmp-0) is the last address touched, so don't
4227   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4228   // was post-decremented.)  Skip this address by starting at i=1, and
4229   // touch a few more pages below.  N.B.  It is important to touch all
4230   // the way down to and including i=StackShadowPages.
4231   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
4232     // This could be any sized move, but it can serve as a debugging
4233     // crumb, so the bigger the better.
4234     lea(tmp, Address(tmp, -os::vm_page_size()));
4235     str(size, Address(tmp));
4236   }
4237 }
4238 
4239 
4240 // Move the address of the polling page into dest.
4241 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4242   if (SafepointMechanism::uses_thread_local_poll()) {
4243     ldr(dest, Address(rthread, Thread::polling_page_offset()));
4244   } else {
4245     unsigned long off;
4246     adrp(dest, Address(page, rtype), off);
4247     assert(off == 0, "polling page must be page aligned");
4248   }
4249 }
4250 
4251 // Move the address of the polling page into r, then read the polling
4252 // page.
4253 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4254   get_polling_page(r, page, rtype);
4255   return read_polling_page(r, rtype);
4256 }
4257 
4258 // Read the polling page.  The address of the polling page must
4259 // already be in r.
4260 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4261   InstructionMark im(this);
4262   code_section()->relocate(inst_mark(), rtype);
4263   ldrw(zr, Address(r, 0));
4264   return inst_mark();
4265 }
4266 
4267 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4268   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4269   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4270   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4271   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4272   long offset_low = dest_page - low_page;
4273   long offset_high = dest_page - high_page;
4274 
4275   assert(is_valid_AArch64_address(dest.target()), "bad address");
4276   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4277 
4278   InstructionMark im(this);
4279   code_section()->relocate(inst_mark(), dest.rspec());
4280   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4281   // the code cache so that if it is relocated we know it will still reach
4282   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4283     _adrp(reg1, dest.target());
4284   } else {
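    // The target is outside adrp's +/-4GB range, so synthesize the address:
    // take the low 32 bits from the target and bits 47:32 from the current
    // PC (guaranteed reachable by adrp), then overwrite bits 47:32 with the
    // real target's via movk.  Addresses here are at most 48 bits.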
4285     unsigned long target = (unsigned long)dest.target();
4286     unsigned long adrp_target
4287       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4288 
4289     _adrp(reg1, (address)adrp_target);
4290     movk(reg1, target >> 32, 32);
4291   }
4292   byte_offset = (unsigned long)dest.target() & 0xfff;
4293 }
4294 
4295 void MacroAssembler::load_byte_map_base(Register reg) {
4296   jbyte *byte_map_base =
4297     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4298 
4299   if (is_valid_AArch64_address((address)byte_map_base)) {
4300     // Strictly speaking the byte_map_base isn't an address at all,
4301     // and it might even be negative.
4302     unsigned long offset;
4303     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4304     // We expect offset to be zero with most collectors.
4305     if (offset != 0) {
4306       add(reg, reg, offset);
4307     }
4308   } else {
4309     mov(reg, (uint64_t)byte_map_base);
4310   }
4311 }
4312 
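// Set up a stack frame of framesize bytes, saving rfp and lr at the top
// of the frame.  Three cases, based on how far the instruction immediates
// reach:
//  - small frames:  a single sub, then stp at an offset from sp;
//  - medium frames: stp with pre-index, then sub with an immediate;
//  - large frames:  stp with pre-index, then sub via rscratch1.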
4313 void MacroAssembler::build_frame(int framesize) {
4314   assert(framesize > 0, "framesize must be > 0");
4315   if (framesize < ((1 << 9) + 2 * wordSize)) {
4316     sub(sp, sp, framesize);
4317     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4318     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4319   } else {
4320     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4321     if (PreserveFramePointer) mov(rfp, sp);
4322     if (framesize < ((1 << 12) + 2 * wordSize))
4323       sub(sp, sp, framesize - 2 * wordSize);
4324     else {
4325       mov(rscratch1, framesize - 2 * wordSize);
4326       sub(sp, sp, rscratch1);
4327     }
4328   }
4329 }
4330 
4331 void MacroAssembler::remove_frame(int framesize) {
4332   assert(framesize > 0, "framesize must be > 0");
4333   if (framesize < ((1 << 9) + 2 * wordSize)) {
4334     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4335     add(sp, sp, framesize);
4336   } else {
4337     if (framesize < ((1 << 12) + 2 * wordSize))
4338       add(sp, sp, framesize - 2 * wordSize);
4339     else {
4340       mov(rscratch1, framesize - 2 * wordSize);
4341       add(sp, sp, rscratch1);
4342     }
4343     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4344   }
4345 }
4346 
4347 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4348 
4349 // Search for str1 in str2 and return index or -1
4350 void MacroAssembler::string_indexof(Register str2, Register str1,
4351                                     Register cnt2, Register cnt1,
4352                                     Register tmp1, Register tmp2,
4353                                     Register tmp3, Register tmp4,
4354                                     Register tmp5, Register tmp6,
4355                                     int icnt1, Register result, int ae) {
4356   // NOTE: tmp5, tmp6 can be zr depending on specific method version
4357   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4358 
4359   Register ch1 = rscratch1;
4360   Register ch2 = rscratch2;
4361   Register cnt1tmp = tmp1;
4362   Register cnt2tmp = tmp2;
4363   Register cnt1_neg = cnt1;
4364   Register cnt2_neg = cnt2;
4365   Register result_tmp = tmp4;
4366 
4367   bool isL = ae == StrIntrinsicNode::LL;
4368 
4369   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4370   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4371   int str1_chr_shift = str1_isL ? 0:1;
4372   int str2_chr_shift = str2_isL ? 0:1;
4373   int str1_chr_size = str1_isL ? 1:2;
4374   int str2_chr_size = str2_isL ? 1:2;
4375   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4376                                       (chr_insn)&MacroAssembler::ldrh;
4377   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4378                                       (chr_insn)&MacroAssembler::ldrh;
4379   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4380   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4381 
4382   // Note, inline_string_indexOf() generates checks:
4383   // if (substr.count > string.count) return -1;
4384   // if (substr.count == 0) return 0;
4385 
  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the first occurrence of the pattern in the source,
  // or return -1.
4388 
4389   // For larger pattern and source we use a simplified Boyer Moore algorithm.
4390   // With a small pattern and source we use linear scan.
4391 
4392   if (icnt1 == -1) {
4393     sub(result_tmp, cnt2, cnt1);
4394     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4395     br(LT, LINEARSEARCH);
4396     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
4397     subs(zr, cnt1, 256);
4398     lsr(tmp1, cnt2, 2);
4399     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
4400     br(GE, LINEARSTUB);
4401   }
4402 
// The Boyer-Moore algorithm is based on the description here:
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only, because of
// the complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few Java-specific optimizations.
4421 //
4422 // #define ASIZE 256
4423 //
4424 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4425 //       int i, j;
4426 //       unsigned c;
4427 //       unsigned char bc[ASIZE];
4428 //
4429 //       /* Preprocessing */
4430 //       for (i = 0; i < ASIZE; ++i)
4431 //          bc[i] = m;
4432 //       for (i = 0; i < m - 1; ) {
4433 //          c = x[i];
4434 //          ++i;
//          // c < 256 for a Latin1 string, so no need for a branch
4436 //          #ifdef PATTERN_STRING_IS_LATIN1
4437 //          bc[c] = m - i;
4438 //          #else
4439 //          if (c < ASIZE) bc[c] = m - i;
4440 //          #endif
4441 //       }
4442 //
4443 //       /* Searching */
4444 //       j = 0;
4445 //       while (j <= n - m) {
//          c = y[j + m - 1];
//          if (x[m-1] == c) {
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//            if (i < 0) return j;
//          }
//          // c < 256 for a Latin1 string, so no need for a branch
4451 //          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) always true. Remove branch
4453 //          j += bc[y[j+m-1]];
4454 //          #endif
4455 //          #ifndef PATTERN_STRING_IS_UTF
//          // UU case: need the (c < ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1;
4461 //          #endif
4462 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need the (c < ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m;
4468 //          #endif
4469 //       }
4470 //    }
4471 
4472   if (icnt1 == -1) {
4473     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4474         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4475     Register cnt1end = tmp2;
4476     Register str2end = cnt2;
4477     Register skipch = tmp2;
4478 
    // str1 length is >= 8, so we can read at least 1 register for the cases
    // when UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and
    // half a register for the UL case. We'll re-read the last character in
    // the inner pre-loop code to keep a single outer pre-loop load.
4483     const int firstStep = isL ? 7 : 3;
4484 
4485     const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
4487     sub(sp, sp, ASIZE);
4488     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4489     mov(ch1, sp);
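    // v0 was filled with the pattern length by the dup above, so this loop
    // initializes the whole ASIZE-byte bc[] table on the stack to cnt1 (the
    // default shift distance), 32 bytes per stpq.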
4490     BIND(BM_INIT_LOOP);
4491       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4492       subs(tmp5, tmp5, 1);
4493       br(GT, BM_INIT_LOOP);
4494 
4495       sub(cnt1tmp, cnt1, 1);
4496       mov(tmp5, str2);
4497       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4498       sub(ch2, cnt1, 1);
4499       mov(tmp3, str1);
4500     BIND(BCLOOP);
4501       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4502       if (!str1_isL) {
4503         subs(zr, ch1, ASIZE);
4504         br(HS, BCSKIP);
4505       }
4506       strb(ch2, Address(sp, ch1));
4507     BIND(BCSKIP);
4508       subs(ch2, ch2, 1);
4509       br(GT, BCLOOP);
4510 
4511       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4512       if (str1_isL == str2_isL) {
4513         // load last 8 bytes (8LL/4UU symbols)
4514         ldr(tmp6, Address(tmp6, -wordSize));
4515       } else {
4516         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
        // convert Latin1 to UTF. We'll have to wait until the load
        // completes, but it's still faster than per-character loads+checks
4519         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4520         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4521         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4522         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4523         orr(ch2, ch1, ch2, LSL, 16);
4524         orr(tmp6, tmp6, tmp3, LSL, 48);
4525         orr(tmp6, tmp6, ch2, LSL, 16);
4526       }
4527     BIND(BMLOOPSTR2);
4528       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4529       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4530       if (str1_isL == str2_isL) {
        // re-init tmp3. It's free because it executes in parallel with the
        // load above. The alternative is to initialize it before the loop,
        // but that would hurt performance on in-order systems with 2 or
        // more ld/st pipelines
4534         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4535       }
4536       if (!isL) { // UU/UL case
4537         lsl(ch2, cnt1tmp, 1); // offset in bytes
4538       }
4539       cmp(tmp3, skipch);
4540       br(NE, BMSKIP);
4541       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4542       mov(ch1, tmp6);
4543       if (isL) {
4544         b(BMLOOPSTR1_AFTER_LOAD);
4545       } else {
4546         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4547         b(BMLOOPSTR1_CMP);
4548       }
4549     BIND(BMLOOPSTR1);
4550       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4551       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4552     BIND(BMLOOPSTR1_AFTER_LOAD);
4553       subs(cnt1tmp, cnt1tmp, 1);
4554       br(LT, BMLOOPSTR1_LASTCMP);
4555     BIND(BMLOOPSTR1_CMP);
4556       cmp(ch1, ch2);
4557       br(EQ, BMLOOPSTR1);
4558     BIND(BMSKIP);
4559       if (!isL) {
        // if we've met a UTF symbol while searching for a Latin1 pattern,
        // then we can skip cnt1 symbols
4562         if (str1_isL != str2_isL) {
4563           mov(result_tmp, cnt1);
4564         } else {
4565           mov(result_tmp, 1);
4566         }
4567         subs(zr, skipch, ASIZE);
4568         br(HS, BMADV);
4569       }
4570       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4571     BIND(BMADV);
4572       sub(cnt1tmp, cnt1, 1);
4573       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4574       cmp(str2, str2end);
4575       br(LE, BMLOOPSTR2);
4576       add(sp, sp, ASIZE);
4577       b(NOMATCH);
4578     BIND(BMLOOPSTR1_LASTCMP);
4579       cmp(ch1, ch2);
4580       br(NE, BMSKIP);
4581     BIND(BMMATCH);
4582       sub(result, str2, tmp5);
4583       if (!str2_isL) lsr(result, result, 1);
4584       add(sp, sp, ASIZE);
4585       b(DONE);
4586 
4587     BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
4589     br(LT, LINEAR_MEDIUM);
4590     mov(result, zr);
4591     RuntimeAddress stub = NULL;
4592     if (isL) {
4593       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4594       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4595     } else if (str1_isL) {
4596       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4598     } else {
4599       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4600       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4601     }
4602     trampoline_call(stub);
4603     b(DONE);
4604   }
4605 
4606   BIND(LINEARSEARCH);
4607   {
4608     Label DO1, DO2, DO3;
4609 
4610     Register str2tmp = tmp2;
4611     Register first = tmp3;
4612 
4613     if (icnt1 == -1)
4614     {
4615         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4616 
4617         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
4618         br(LT, DOSHORT);
4619       BIND(LINEAR_MEDIUM);
4620         (this->*str1_load_1chr)(first, Address(str1));
4621         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4622         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4623         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4624         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4625 
4626       BIND(FIRST_LOOP);
4627         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4628         cmp(first, ch2);
4629         br(EQ, STR1_LOOP);
4630       BIND(STR2_NEXT);
4631         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4632         br(LE, FIRST_LOOP);
4633         b(NOMATCH);
4634 
4635       BIND(STR1_LOOP);
4636         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4637         add(cnt2tmp, cnt2_neg, str2_chr_size);
4638         br(GE, MATCH);
4639 
4640       BIND(STR1_NEXT);
4641         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4642         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4643         cmp(ch1, ch2);
4644         br(NE, STR2_NEXT);
4645         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4646         add(cnt2tmp, cnt2tmp, str2_chr_size);
4647         br(LT, STR1_NEXT);
4648         b(MATCH);
4649 
4650       BIND(DOSHORT);
4651       if (str1_isL == str2_isL) {
4652         cmp(cnt1, (u1)2);
4653         br(LT, DO1);
4654         br(GT, DO3);
4655       }
4656     }
4657 
4658     if (icnt1 == 4) {
4659       Label CH1_LOOP;
4660 
4661         (this->*load_4chr)(ch1, str1);
4662         sub(result_tmp, cnt2, 4);
4663         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4664         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4665 
4666       BIND(CH1_LOOP);
4667         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4668         cmp(ch1, ch2);
4669         br(EQ, MATCH);
4670         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4671         br(LE, CH1_LOOP);
4672         b(NOMATCH);
4673       }
4674 
4675     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4676       Label CH1_LOOP;
4677 
4678       BIND(DO2);
4679         (this->*load_2chr)(ch1, str1);
4680         if (icnt1 == 2) {
4681           sub(result_tmp, cnt2, 2);
4682         }
4683         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4684         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4685       BIND(CH1_LOOP);
4686         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4687         cmp(ch1, ch2);
4688         br(EQ, MATCH);
4689         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4690         br(LE, CH1_LOOP);
4691         b(NOMATCH);
4692     }
4693 
4694     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4695       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4696 
4697       BIND(DO3);
4698         (this->*load_2chr)(first, str1);
4699         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4700         if (icnt1 == 3) {
4701           sub(result_tmp, cnt2, 3);
4702         }
4703         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4704         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4705       BIND(FIRST_LOOP);
4706         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4707         cmpw(first, ch2);
4708         br(EQ, STR1_LOOP);
4709       BIND(STR2_NEXT);
4710         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4711         br(LE, FIRST_LOOP);
4712         b(NOMATCH);
4713 
4714       BIND(STR1_LOOP);
4715         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4716         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4717         cmp(ch1, ch2);
4718         br(NE, STR2_NEXT);
4719         b(MATCH);
4720     }
4721 
4722     if (icnt1 == -1 || icnt1 == 1) {
4723       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4724 
4725       BIND(DO1);
4726         (this->*str1_load_1chr)(ch1, str1);
4727         cmp(cnt2, (u1)8);
4728         br(LT, DO1_SHORT);
4729 
4730         sub(result_tmp, cnt2, 8/str2_chr_size);
4731         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4732         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4733         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4734 
4735         if (str2_isL) {
4736           orr(ch1, ch1, ch1, LSL, 8);
4737         }
4738         orr(ch1, ch1, ch1, LSL, 16);
4739         orr(ch1, ch1, ch1, LSL, 32);
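        // SWAR search: ch1 now holds the pattern character replicated into
        // every byte (Latin1) or halfword (UTF-16) of the register.  After
        // the eor below a matching element becomes zero, and
        // (x - 0x01..01) & ~(x | 0x7f..7f) flags the zero elements;
        // rev+clz then locate the first match.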
4740       BIND(CH1_LOOP);
4741         ldr(ch2, Address(str2, cnt2_neg));
4742         eor(ch2, ch1, ch2);
4743         sub(tmp1, ch2, tmp3);
4744         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4745         bics(tmp1, tmp1, tmp2);
4746         br(NE, HAS_ZERO);
4747         adds(cnt2_neg, cnt2_neg, 8);
4748         br(LT, CH1_LOOP);
4749 
4750         cmp(cnt2_neg, (u1)8);
4751         mov(cnt2_neg, 0);
4752         br(LT, CH1_LOOP);
4753         b(NOMATCH);
4754 
4755       BIND(HAS_ZERO);
4756         rev(tmp1, tmp1);
4757         clz(tmp1, tmp1);
4758         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4759         b(MATCH);
4760 
4761       BIND(DO1_SHORT);
4762         mov(result_tmp, cnt2);
4763         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4764         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4765       BIND(DO1_LOOP);
4766         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4767         cmpw(ch1, ch2);
4768         br(EQ, MATCH);
4769         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4770         br(LT, DO1_LOOP);
4771     }
4772   }
4773   BIND(NOMATCH);
4774     mov(result, -1);
4775     b(DONE);
4776   BIND(MATCH);
4777     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4778   BIND(DONE);
4779 }
4780 
4781 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4782 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4783 
4784 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4785                                          Register ch, Register result,
4786                                          Register tmp1, Register tmp2, Register tmp3)
4787 {
4788   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4789   Register cnt1_neg = cnt1;
4790   Register ch1 = rscratch1;
4791   Register result_tmp = rscratch2;
4792 
4793   cmp(cnt1, (u1)4);
4794   br(LT, DO1_SHORT);
4795 
4796   orr(ch, ch, ch, LSL, 16);
4797   orr(ch, ch, ch, LSL, 32);
4798 
4799   sub(cnt1, cnt1, 4);
4800   mov(result_tmp, cnt1);
4801   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4802   sub(cnt1_neg, zr, cnt1, LSL, 1);
4803 
4804   mov(tmp3, 0x0001000100010001);
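  // SWAR search: ch holds the target character replicated into every
  // halfword.  After the eor below a matching halfword becomes zero, and
  // (x - 0x0001..) & ~(x | 0x7fff..) flags the zero halfwords; rev+clz
  // then locate the first match.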
4805 
4806   BIND(CH1_LOOP);
4807     ldr(ch1, Address(str1, cnt1_neg));
4808     eor(ch1, ch, ch1);
4809     sub(tmp1, ch1, tmp3);
4810     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4811     bics(tmp1, tmp1, tmp2);
4812     br(NE, HAS_ZERO);
4813     adds(cnt1_neg, cnt1_neg, 8);
4814     br(LT, CH1_LOOP);
4815 
4816     cmp(cnt1_neg, (u1)8);
4817     mov(cnt1_neg, 0);
4818     br(LT, CH1_LOOP);
4819     b(NOMATCH);
4820 
4821   BIND(HAS_ZERO);
4822     rev(tmp1, tmp1);
4823     clz(tmp1, tmp1);
4824     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4825     b(MATCH);
4826 
4827   BIND(DO1_SHORT);
4828     mov(result_tmp, cnt1);
4829     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4830     sub(cnt1_neg, zr, cnt1, LSL, 1);
4831   BIND(DO1_LOOP);
4832     ldrh(ch1, Address(str1, cnt1_neg));
4833     cmpw(ch, ch1);
4834     br(EQ, MATCH);
4835     adds(cnt1_neg, cnt1_neg, 2);
4836     br(LT, DO1_LOOP);
4837   BIND(NOMATCH);
4838     mov(result, -1);
4839     b(DONE);
4840   BIND(MATCH);
4841     add(result, result_tmp, cnt1_neg, ASR, 1);
4842   BIND(DONE);
4843 }
4844 
4845 // Compare strings.
4846 void MacroAssembler::string_compare(Register str1, Register str2,
4847     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4848     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4849   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4850       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4851       SHORT_LOOP_START, TAIL_CHECK;
4852 
4853   const u1 STUB_THRESHOLD = 64 + 8;
4854   bool isLL = ae == StrIntrinsicNode::LL;
4855   bool isLU = ae == StrIntrinsicNode::LU;
4856   bool isUL = ae == StrIntrinsicNode::UL;
4857 
4858   bool str1_isL = isLL || isLU;
4859   bool str2_isL = isLL || isUL;
4860 
4861   int str1_chr_shift = str1_isL ? 0 : 1;
4862   int str2_chr_shift = str2_isL ? 0 : 1;
4863   int str1_chr_size = str1_isL ? 1 : 2;
4864   int str2_chr_size = str2_isL ? 1 : 2;
4865   int minCharsInWord = isLL ? wordSize : wordSize/2;
4866 
4867   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4868   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4869                                       (chr_insn)&MacroAssembler::ldrh;
4870   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4871                                       (chr_insn)&MacroAssembler::ldrh;
4872   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4873                             (uxt_insn)&MacroAssembler::uxthw;
4874 
4875   BLOCK_COMMENT("string_compare {");
4876 
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
4879   if (!str1_isL) asrw(cnt1, cnt1, 1);
4880   if (!str2_isL) asrw(cnt2, cnt2, 1);
4881 
4882   // Compute the minimum of the string lengths and save the difference.
4883   subsw(result, cnt1, cnt2);
4884   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4885 
4886   // A very short string
4887   cmpw(cnt2, minCharsInWord);
4888   br(Assembler::LT, SHORT_STRING);
4889 
4890   // Compare longwords
4891   // load first parts of strings and finish initialization while loading
4892   {
4893     if (str1_isL == str2_isL) { // LL or UU
4894       ldr(tmp1, Address(str1));
4895       cmp(str1, str2);
4896       br(Assembler::EQ, DONE);
4897       ldr(tmp2, Address(str2));
4898       cmp(cnt2, STUB_THRESHOLD);
4899       br(GE, STUB);
4900       subsw(cnt2, cnt2, minCharsInWord);
4901       br(EQ, TAIL_CHECK);
4902       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4903       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4904       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4905     } else if (isLU) {
4906       ldrs(vtmp, Address(str1));
4907       cmp(str1, str2);
4908       br(Assembler::EQ, DONE);
4909       ldr(tmp2, Address(str2));
4910       cmp(cnt2, STUB_THRESHOLD);
4911       br(GE, STUB);
4912       subsw(cnt2, cnt2, 4);
4913       br(EQ, TAIL_CHECK);
4914       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4915       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4916       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
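      // zip1 interleaves the Latin1 bytes with zero bytes from vtmpZ,
      // widening them to UTF-16 halfwords for comparison with the U string.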
4917       zip1(vtmp, T8B, vtmp, vtmpZ);
4918       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4919       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4920       add(cnt1, cnt1, 4);
4921       fmovd(tmp1, vtmp);
4922     } else { // UL case
4923       ldr(tmp1, Address(str1));
4924       cmp(str1, str2);
4925       br(Assembler::EQ, DONE);
4926       ldrs(vtmp, Address(str2));
4927       cmp(cnt2, STUB_THRESHOLD);
4928       br(GE, STUB);
4929       subsw(cnt2, cnt2, 4);
4930       br(EQ, TAIL_CHECK);
4931       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4932       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4933       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4934       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4935       zip1(vtmp, T8B, vtmp, vtmpZ);
4936       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4937       add(cnt1, cnt1, 8);
4938       fmovd(tmp2, vtmp);
4939     }
4940     adds(cnt2, cnt2, isUL ? 4 : 8);
4941     br(GE, TAIL);
4942     eor(rscratch2, tmp1, tmp2);
4943     cbnz(rscratch2, DIFFERENCE);
4944     // main loop
4945     bind(NEXT_WORD);
4946     if (str1_isL == str2_isL) {
4947       ldr(tmp1, Address(str1, cnt2));
4948       ldr(tmp2, Address(str2, cnt2));
4949       adds(cnt2, cnt2, 8);
4950     } else if (isLU) {
4951       ldrs(vtmp, Address(str1, cnt1));
4952       ldr(tmp2, Address(str2, cnt2));
4953       add(cnt1, cnt1, 4);
4954       zip1(vtmp, T8B, vtmp, vtmpZ);
4955       fmovd(tmp1, vtmp);
4956       adds(cnt2, cnt2, 8);
4957     } else { // UL
4958       ldrs(vtmp, Address(str2, cnt2));
4959       ldr(tmp1, Address(str1, cnt1));
4960       zip1(vtmp, T8B, vtmp, vtmpZ);
4961       add(cnt1, cnt1, 8);
4962       fmovd(tmp2, vtmp);
4963       adds(cnt2, cnt2, 4);
4964     }
4965     br(GE, TAIL);
4966 
4967     eor(rscratch2, tmp1, tmp2);
4968     cbz(rscratch2, NEXT_WORD);
4969     b(DIFFERENCE);
4970     bind(TAIL);
4971     eor(rscratch2, tmp1, tmp2);
4972     cbnz(rscratch2, DIFFERENCE);
4973     // Last longword.  In the case where length == 4 we compare the
4974     // same longword twice, but that's still faster than another
4975     // conditional branch.
4976     if (str1_isL == str2_isL) {
4977       ldr(tmp1, Address(str1));
4978       ldr(tmp2, Address(str2));
4979     } else if (isLU) {
4980       ldrs(vtmp, Address(str1));
4981       ldr(tmp2, Address(str2));
4982       zip1(vtmp, T8B, vtmp, vtmpZ);
4983       fmovd(tmp1, vtmp);
4984     } else { // UL
4985       ldrs(vtmp, Address(str2));
4986       ldr(tmp1, Address(str1));
4987       zip1(vtmp, T8B, vtmp, vtmpZ);
4988       fmovd(tmp2, vtmp);
4989     }
4990     bind(TAIL_CHECK);
4991     eor(rscratch2, tmp1, tmp2);
4992     cbz(rscratch2, DONE);
4993 
4994     // Find the first different characters in the longwords and
4995     // compute their difference.
4996     bind(DIFFERENCE);
4997     rev(rscratch2, rscratch2);
4998     clz(rscratch2, rscratch2);
4999     andr(rscratch2, rscratch2, isLL ? -8 : -16);
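    // rev+clz computed the bit offset of the first differing byte, counted
    // from the low (first-in-memory) end; rounding down to a character
    // boundary lets us shift both words right and extract the characters.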
5000     lsrv(tmp1, tmp1, rscratch2);
5001     (this->*ext_chr)(tmp1, tmp1);
5002     lsrv(tmp2, tmp2, rscratch2);
5003     (this->*ext_chr)(tmp2, tmp2);
5004     subw(result, tmp1, tmp2);
5005     b(DONE);
5006   }
5007 
5008   bind(STUB);
5009     RuntimeAddress stub = NULL;
5010     switch(ae) {
5011       case StrIntrinsicNode::LL:
5012         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
5013         break;
5014       case StrIntrinsicNode::UU:
5015         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
5016         break;
5017       case StrIntrinsicNode::LU:
5018         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
5019         break;
5020       case StrIntrinsicNode::UL:
5021         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
5022         break;
5023       default:
5024         ShouldNotReachHere();
5025      }
5026     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
5027     trampoline_call(stub);
5028     b(DONE);
5029 
5030   bind(SHORT_STRING);
5031   // Is the minimum length zero?
5032   cbz(cnt2, DONE);
  // Arrange the code to do most branches while loading, and to load the
  // next characters while comparing the previous ones.
5035   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5036   subs(cnt2, cnt2, 1);
5037   br(EQ, SHORT_LAST_INIT);
5038   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5039   b(SHORT_LOOP_START);
5040   bind(SHORT_LOOP);
5041   subs(cnt2, cnt2, 1);
5042   br(EQ, SHORT_LAST);
5043   bind(SHORT_LOOP_START);
5044   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
5045   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
5046   cmp(tmp1, cnt1);
5047   br(NE, SHORT_LOOP_TAIL);
5048   subs(cnt2, cnt2, 1);
5049   br(EQ, SHORT_LAST2);
5050   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5051   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5052   cmp(tmp2, rscratch1);
5053   br(EQ, SHORT_LOOP);
5054   sub(result, tmp2, rscratch1);
5055   b(DONE);
5056   bind(SHORT_LOOP_TAIL);
5057   sub(result, tmp1, cnt1);
5058   b(DONE);
5059   bind(SHORT_LAST2);
5060   cmp(tmp2, rscratch1);
5061   br(EQ, DONE);
5062   sub(result, tmp2, rscratch1);
5063 
5064   b(DONE);
5065   bind(SHORT_LAST_INIT);
5066   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5067   bind(SHORT_LAST);
5068   cmp(tmp1, cnt1);
5069   br(EQ, DONE);
5070   sub(result, tmp1, cnt1);
5071 
5072   bind(DONE);
5073 
5074   BLOCK_COMMENT("} string_compare");
5075 }
5076 
// This method checks whether the provided byte array contains a byte with
// the highest bit set.
5078 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
    // The simple and most common case, a small aligned array that does not
    // reach the end of a memory page, is handled here. All other cases go
    // to the stub.
5081     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
    const uint64_t UPPER_BIT_MASK = 0x8080808080808080;
5083     assert_different_registers(ary1, len, result);
5084 
5085     cmpw(len, 0);
5086     br(LE, SET_RESULT);
5087     cmpw(len, 4 * wordSize);
5088     br(GE, STUB_LONG); // size > 32 then go to stub
5089 
5090     int shift = 64 - exact_log2(os::vm_page_size());
5091     lsl(rscratch1, ary1, shift);
5092     mov(rscratch2, (size_t)(4 * wordSize) << shift);
5093     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
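    // The lsl above moved ary1's offset within its page into the top bits,
    // so the add carries out (CS) exactly when reading 4 * wordSize bytes
    // from ary1 would run to or past the end of the page.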
5094     br(CS, STUB); // at the end of page then go to stub
5095     subs(len, len, wordSize);
5096     br(LT, END);
5097 
5098   BIND(LOOP);
5099     ldr(rscratch1, Address(post(ary1, wordSize)));
5100     tst(rscratch1, UPPER_BIT_MASK);
5101     br(NE, SET_RESULT);
5102     subs(len, len, wordSize);
5103     br(GE, LOOP);
5104     cmpw(len, -wordSize);
5105     br(EQ, SET_RESULT);
5106 
5107   BIND(END);
5108     ldr(result, Address(ary1));
5109     sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5110     lslv(result, result, len);
5111     tst(result, UPPER_BIT_MASK);
5112     b(SET_RESULT);
5113 
5114   BIND(STUB);
5115     RuntimeAddress has_neg =  RuntimeAddress(StubRoutines::aarch64::has_negatives());
5116     assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5117     trampoline_call(has_neg);
5118     b(DONE);
5119 
5120   BIND(STUB_LONG);
5121     RuntimeAddress has_neg_long =  RuntimeAddress(
5122             StubRoutines::aarch64::has_negatives_long());
5123     assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
5124     trampoline_call(has_neg_long);
5125     b(DONE);
5126 
5127   BIND(SET_RESULT);
5128     cset(result, NE); // set true or false
5129 
5130   BIND(DONE);
5131 }
5132 
5133 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5134                                    Register tmp4, Register tmp5, Register result,
5135                                    Register cnt1, int elem_size) {
5136   Label DONE, SAME;
5137   Register tmp1 = rscratch1;
5138   Register tmp2 = rscratch2;
5139   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5140   int elem_per_word = wordSize/elem_size;
5141   int log_elem_size = exact_log2(elem_size);
5142   int length_offset = arrayOopDesc::length_offset_in_bytes();
5143   int base_offset
5144     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5145   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5146 
5147   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5148   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5149 
5150 #ifndef PRODUCT
5151   {
5152     const char kind = (elem_size == 2) ? 'U' : 'L';
5153     char comment[64];
5154     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5155     BLOCK_COMMENT(comment);
5156   }
5157 #endif
5158 
5159   // if (a1 == a2)
5160   //     return true;
5161   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5162   br(EQ, SAME);
5163 
5164   if (UseSimpleArrayEquals) {
5165     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5166     // if (a1 == null || a2 == null)
5167     //     return false;
    // (a1 & a2) == 0 means that some pointer is null, or (very rarely, or
    // even probably impossibly) that two non-null pointers share no set
    // bits, so we can save one branch in most cases
5171     tst(a1, a2);
5172     mov(result, false);
5173     br(EQ, A_MIGHT_BE_NULL);
5174     // if (a1.length != a2.length)
5175     //      return false;
5176     bind(A_IS_NOT_NULL);
5177     ldrw(cnt1, Address(a1, length_offset));
5178     ldrw(cnt2, Address(a2, length_offset));
5179     eorw(tmp5, cnt1, cnt2);
5180     cbnzw(tmp5, DONE);
5181     lea(a1, Address(a1, base_offset));
5182     lea(a2, Address(a2, base_offset));
5183     // Check for short strings, i.e. smaller than wordSize.
5184     subs(cnt1, cnt1, elem_per_word);
5185     br(Assembler::LT, SHORT);
5186     // Main 8 byte comparison loop.
5187     bind(NEXT_WORD); {
5188       ldr(tmp1, Address(post(a1, wordSize)));
5189       ldr(tmp2, Address(post(a2, wordSize)));
5190       subs(cnt1, cnt1, elem_per_word);
5191       eor(tmp5, tmp1, tmp2);
5192       cbnz(tmp5, DONE);
5193     } br(GT, NEXT_WORD);
5194     // Last longword.  In the case where length == 4 we compare the
5195     // same longword twice, but that's still faster than another
5196     // conditional branch.
5197     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5198     // length == 4.
5199     if (log_elem_size > 0)
5200       lsl(cnt1, cnt1, log_elem_size);
5201     ldr(tmp3, Address(a1, cnt1));
5202     ldr(tmp4, Address(a2, cnt1));
5203     eor(tmp5, tmp3, tmp4);
5204     cbnz(tmp5, DONE);
5205     b(SAME);
5206     bind(A_MIGHT_BE_NULL);
    // in case both a1 and a2 are non-null, proceed with the loads
5208     cbz(a1, DONE);
5209     cbz(a2, DONE);
5210     b(A_IS_NOT_NULL);
5211     bind(SHORT);
5212 
5213     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5214     {
5215       ldrw(tmp1, Address(post(a1, 4)));
5216       ldrw(tmp2, Address(post(a2, 4)));
5217       eorw(tmp5, tmp1, tmp2);
5218       cbnzw(tmp5, DONE);
5219     }
5220     bind(TAIL03);
5221     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5222     {
5223       ldrh(tmp3, Address(post(a1, 2)));
5224       ldrh(tmp4, Address(post(a2, 2)));
5225       eorw(tmp5, tmp3, tmp4);
5226       cbnzw(tmp5, DONE);
5227     }
5228     bind(TAIL01);
5229     if (elem_size == 1) { // Only needed when comparing byte arrays.
5230       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5231       {
5232         ldrb(tmp1, a1);
5233         ldrb(tmp2, a2);
5234         eorw(tmp5, tmp1, tmp2);
5235         cbnzw(tmp5, DONE);
5236       }
5237     }
5238   } else {
5239     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
5240         CSET_EQ, LAST_CHECK;
5241     mov(result, false);
5242     cbz(a1, DONE);
5243     ldrw(cnt1, Address(a1, length_offset));
5244     cbz(a2, DONE);
5245     ldrw(cnt2, Address(a2, length_offset));
    // on most CPUs a2 is, surprisingly, still "locked" by the ldrw above,
    // so it's faster to perform another branch before comparing a1 and a2
5248     cmp(cnt1, (u1)elem_per_word);
5249     br(LE, SHORT); // short or same
5250     ldr(tmp3, Address(pre(a1, base_offset)));
5251     subs(zr, cnt1, stubBytesThreshold);
5252     br(GE, STUB);
5253     ldr(tmp4, Address(pre(a2, base_offset)));
5254     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
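    // tmp5 = -(array length in bits); used by the TAIL/LAST_CHECK code to
    // shift out the garbage bytes that the final, possibly overlapping,
    // word comparison may load from beyond the array.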
5255     cmp(cnt2, cnt1);
5256     br(NE, DONE);
5257 
5258     // Main 16 byte comparison loop with 2 exits
5259     bind(NEXT_DWORD); {
5260       ldr(tmp1, Address(pre(a1, wordSize)));
5261       ldr(tmp2, Address(pre(a2, wordSize)));
5262       subs(cnt1, cnt1, 2 * elem_per_word);
5263       br(LE, TAIL);
5264       eor(tmp4, tmp3, tmp4);
5265       cbnz(tmp4, DONE);
5266       ldr(tmp3, Address(pre(a1, wordSize)));
5267       ldr(tmp4, Address(pre(a2, wordSize)));
5268       cmp(cnt1, (u1)elem_per_word);
5269       br(LE, TAIL2);
5270       cmp(tmp1, tmp2);
5271     } br(EQ, NEXT_DWORD);
5272     b(DONE);
5273 
5274     bind(TAIL);
5275     eor(tmp4, tmp3, tmp4);
5276     eor(tmp2, tmp1, tmp2);
5277     lslv(tmp2, tmp2, tmp5);
5278     orr(tmp5, tmp4, tmp2);
5279     cmp(tmp5, zr);
5280     b(CSET_EQ);
5281 
5282     bind(TAIL2);
5283     eor(tmp2, tmp1, tmp2);
5284     cbnz(tmp2, DONE);
5285     b(LAST_CHECK);
5286 
5287     bind(STUB);
5288     ldr(tmp4, Address(pre(a2, base_offset)));
5289     cmp(cnt2, cnt1);
5290     br(NE, DONE);
5291     if (elem_size == 2) { // convert to byte counter
5292       lsl(cnt1, cnt1, 1);
5293     }
5294     eor(tmp5, tmp3, tmp4);
5295     cbnz(tmp5, DONE);
5296     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5297     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5298     trampoline_call(stub);
5299     b(DONE);
5300 
5301     bind(EARLY_OUT);
    // Here (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2),
    // so if a2 == null we must return false (0), otherwise true; a2 itself
    // holds exactly those values, so we can return a2.
5304     mov(result, a2);
5305     b(DONE);
5306     bind(SHORT);
5307     cmp(cnt2, cnt1);
5308     br(NE, DONE);
5309     cbz(cnt1, SAME);
5310     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5311     ldr(tmp3, Address(a1, base_offset));
5312     ldr(tmp4, Address(a2, base_offset));
5313     bind(LAST_CHECK);
5314     eor(tmp4, tmp3, tmp4);
5315     lslv(tmp5, tmp4, tmp5);
5316     cmp(tmp5, zr);
5317     bind(CSET_EQ);
5318     cset(result, EQ);
5319     b(DONE);
5320   }
5321 
5322   bind(SAME);
5323   mov(result, true);
5324   // That's it.
5325   bind(DONE);
5326 
5327   BLOCK_COMMENT("} array_equals");
5328 }
5329 
5330 // Compare Strings
5331 
5332 // For Strings we're passed the address of the first characters in a1
5333 // and a2 and the length in cnt1.
5334 // elem_size is the element size in bytes: either 1 or 2.
5335 // There are two implementations.  For arrays >= 8 bytes, all
5336 // comparisons (including the final one, which may overlap) are
5337 // performed 8 bytes at a time.  For strings < 8 bytes, we compare a
// word, then a halfword, and then a byte.
5339 
5340 void MacroAssembler::string_equals(Register a1, Register a2,
5341                                    Register result, Register cnt1, int elem_size)
5342 {
5343   Label SAME, DONE, SHORT, NEXT_WORD;
5344   Register tmp1 = rscratch1;
5345   Register tmp2 = rscratch2;
5346   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5347 
5348   assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
5349   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5350 
5351 #ifndef PRODUCT
5352   {
5353     const char kind = (elem_size == 2) ? 'U' : 'L';
5354     char comment[64];
5355     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5356     BLOCK_COMMENT(comment);
5357   }
5358 #endif
5359 
5360   mov(result, false);
5361 
5362   // Check for short strings, i.e. smaller than wordSize.
5363   subs(cnt1, cnt1, wordSize);
5364   br(Assembler::LT, SHORT);
5365   // Main 8 byte comparison loop.
5366   bind(NEXT_WORD); {
5367     ldr(tmp1, Address(post(a1, wordSize)));
5368     ldr(tmp2, Address(post(a2, wordSize)));
5369     subs(cnt1, cnt1, wordSize);
5370     eor(tmp1, tmp1, tmp2);
5371     cbnz(tmp1, DONE);
5372   } br(GT, NEXT_WORD);
5373   // Last longword.  In the case where length == 4 we compare the
5374   // same longword twice, but that's still faster than another
5375   // conditional branch.
5376   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5377   // length == 4.
5378   ldr(tmp1, Address(a1, cnt1));
5379   ldr(tmp2, Address(a2, cnt1));
5380   eor(tmp2, tmp1, tmp2);
5381   cbnz(tmp2, DONE);
5382   b(SAME);
5383 
5384   bind(SHORT);
5385   Label TAIL03, TAIL01;
5386 
5387   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5388   {
5389     ldrw(tmp1, Address(post(a1, 4)));
5390     ldrw(tmp2, Address(post(a2, 4)));
5391     eorw(tmp1, tmp1, tmp2);
5392     cbnzw(tmp1, DONE);
5393   }
5394   bind(TAIL03);
5395   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5396   {
5397     ldrh(tmp1, Address(post(a1, 2)));
5398     ldrh(tmp2, Address(post(a2, 2)));
5399     eorw(tmp1, tmp1, tmp2);
5400     cbnzw(tmp1, DONE);
5401   }
5402   bind(TAIL01);
5403   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5404     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5405     {
5406       ldrb(tmp1, a1);
5407       ldrb(tmp2, a2);
5408       eorw(tmp1, tmp1, tmp2);
5409       cbnzw(tmp1, DONE);
5410     }
5411   }
5412   // Arrays are equal.
5413   bind(SAME);
5414   mov(result, true);
5415 
5416   // That's it.
5417   bind(DONE);
5418   BLOCK_COMMENT("} string_equals");
5419 }
5420 
5421 
5422 // The size of the blocks erased by the zero_blocks stub.  We must
5423 // handle anything smaller than this ourselves in zero_words().
5424 const int MacroAssembler::zero_words_block_size = 8;
5425 
5426 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5427 // possible, handling small word counts locally and delegating
5428 // anything larger to the zero_blocks stub.  It is expanded many times
5429 // in compiled code, so it is important to keep it short.
5430 
5431 // ptr:   Address of a buffer to be zeroed.
5432 // cnt:   Count in HeapWords.
5433 //
5434 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
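//
// A rough sketch of the expansion (zero_words_block_size == 8; the
// zero_blocks stub zeroes whole blocks and leaves the remainder in cnt):
//
//   if (cnt >= 8) cnt = zero_blocks(ptr, cnt);
//   if (cnt & 4) { store 4 zero words; ptr += 4; }
//   if (cnt & 2) { store 2 zero words; ptr += 2; }
//   if (cnt & 1) store 1 zero word;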
5435 void MacroAssembler::zero_words(Register ptr, Register cnt)
5436 {
5437   assert(is_power_of_2(zero_words_block_size), "adjust this");
5438   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5439 
5440   BLOCK_COMMENT("zero_words {");
5441   cmp(cnt, (u1)zero_words_block_size);
5442   Label around, done, done16;
5443   br(LO, around);
5444   {
5445     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5446     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5447     if (StubRoutines::aarch64::complete()) {
5448       trampoline_call(zero_blocks);
5449     } else {
5450       bl(zero_blocks);
5451     }
5452   }
5453   bind(around);
5454   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5455     Label l;
5456     tbz(cnt, exact_log2(i), l);
5457     for (int j = 0; j < i; j += 2) {
5458       stp(zr, zr, post(ptr, 16));
5459     }
5460     bind(l);
5461   }
5462   {
5463     Label l;
5464     tbz(cnt, 0, l);
5465     str(zr, Address(ptr));
5466     bind(l);
5467   }
5468   BLOCK_COMMENT("} zero_words");
5469 }
5470 
5471 // base:         Address of a buffer to be zeroed, 8 bytes aligned.
5472 // cnt:          Immediate count in HeapWords.
5473 #define SmallArraySize (18 * BytesPerLong)
5474 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
5475 {
5476   BLOCK_COMMENT("zero_words {");
5477   int i = cnt & 1;  // store any odd word to start
5478   if (i) str(zr, Address(base));
5479 
5480   if (cnt <= SmallArraySize / BytesPerLong) {
5481     for (; i < (int)cnt; i += 2)
5482       stp(zr, zr, Address(base, i * wordSize));
5483   } else {
5484     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
5485     int remainder = cnt % (2 * unroll);
5486     for (; i < remainder; i += 2)
5487       stp(zr, zr, Address(base, i * wordSize));
5488 
5489     Label loop;
5490     Register cnt_reg = rscratch1;
5491     Register loop_base = rscratch2;
5492     cnt = cnt - remainder;
5493     mov(cnt_reg, cnt);
5494     // adjust base and prebias by -2 * wordSize so we can pre-increment
5495     add(loop_base, base, (remainder - 2) * wordSize);
5496     bind(loop);
5497     sub(cnt_reg, cnt_reg, 2 * unroll);
5498     for (i = 1; i < unroll; i++)
5499       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
5500     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
5501     cbnz(cnt_reg, loop);
5502   }
5503   BLOCK_COMMENT("} zero_words");
5504 }
5505 
5506 // Zero blocks of memory by using DC ZVA.
5507 //
// Aligns the base address first sufficiently for DC ZVA, then uses
5509 // DC ZVA repeatedly for every full block.  cnt is the size to be
5510 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5511 // in cnt.
5512 //
5513 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5514 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
5515 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5516   Register tmp = rscratch1;
5517   Register tmp2 = rscratch2;
5518   int zva_length = VM_Version::zva_length();
5519   Label initial_table_end, loop_zva;
5520   Label fini;
5521 
  // Base must be 16-byte aligned. If not, just return and let the caller handle it.
5523   tst(base, 0x0f);
5524   br(Assembler::NE, fini);
5525   // Align base with ZVA length.
5526   neg(tmp, base);
5527   andr(tmp, tmp, zva_length - 1);
5528 
5529   // tmp: the number of bytes to be filled to align the base with ZVA length.
5530   add(base, base, tmp);
5531   sub(cnt, cnt, tmp, Assembler::ASR, 3);
5532   adr(tmp2, initial_table_end);
5533   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5534   br(tmp2);
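  // Computed branch into the stp table below, Duff's-device style: each
  // stp zeroes 16 bytes and occupies 4 bytes of code, so branching
  // (tmp / 4) code bytes back from initial_table_end executes exactly
  // tmp / 16 stores, filling the bytes needed to reach ZVA alignment.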
5535 
5536   for (int i = -zva_length + 16; i < 0; i += 16)
5537     stp(zr, zr, Address(base, i));
5538   bind(initial_table_end);
5539 
5540   sub(cnt, cnt, zva_length >> 3);
5541   bind(loop_zva);
5542   dc(Assembler::ZVA, base);
5543   subs(cnt, cnt, zva_length >> 3);
5544   add(base, base, zva_length);
5545   br(Assembler::GE, loop_zva);
5546   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5547   bind(fini);
5548 }
5549 
5550 // base:   Address of a buffer to be filled, 8 bytes aligned.
5551 // cnt:    Count in 8-byte unit.
5552 // value:  Value to be filled with.
5553 // base will point to the end of the buffer after filling.
5554 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5555 {
5556 //  Algorithm:
5557 //
5558 //    scratch1 = cnt & 7;
5559 //    cnt -= scratch1;
5560 //    p += scratch1;
5561 //    switch (scratch1) {
5562 //      do {
5563 //        cnt -= 8;
5564 //          p[-8] = v;
5565 //        case 7:
5566 //          p[-7] = v;
5567 //        case 6:
5568 //          p[-6] = v;
5569 //          // ...
5570 //        case 1:
5571 //          p[-1] = v;
5572 //        case 0:
5573 //          p += 8;
5574 //      } while (cnt);
5575 //    }
5576 
5577   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5578 
5579   Label fini, skip, entry, loop;
5580   const int unroll = 8; // Number of stp instructions we'll unroll
5581 
5582   cbz(cnt, fini);
5583   tbz(base, 3, skip);
5584   str(value, Address(post(base, 8)));
5585   sub(cnt, cnt, 1);
5586   bind(skip);
5587 
5588   andr(rscratch1, cnt, (unroll-1) * 2);
5589   sub(cnt, cnt, rscratch1);
5590   add(base, base, rscratch1, Assembler::LSL, 3);
5591   adr(rscratch2, entry);
5592   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5593   br(rscratch2);
5594 
5595   bind(loop);
5596   add(base, base, unroll * 16);
5597   for (int i = -unroll; i < 0; i++)
5598     stp(value, value, Address(base, i * 16));
5599   bind(entry);
5600   subs(cnt, cnt, unroll * 2);
5601   br(Assembler::GE, loop);
5602 
5603   tbz(cnt, 0, fini);
5604   str(value, Address(post(base, 8)));
5605   bind(fini);
5606 }
5607 
5608 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5609 // java/lang/StringUTF16.compress.
5610 void MacroAssembler::encode_iso_array(Register src, Register dst,
5611                       Register len, Register result,
5612                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5613                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5614 {
5615     Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
5616         NEXT_32_START, NEXT_32_PRFM_START;
5617     Register tmp1 = rscratch1, tmp2 = rscratch2;
5618 
5619       mov(result, len); // Save initial len
5620 
5621 #ifndef BUILTIN_SIM
5622       cmp(len, (u1)8); // handle shortest strings first
5623       br(LT, LOOP_1);
5624       cmp(len, (u1)32);
5625       br(LT, NEXT_8);
5626       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
5627       // to convert chars to bytes
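      // uzp1 keeps the even (low, little-endian) byte of each halfword and
      // uzp2 the odd (high) byte: the low bytes are the Latin1 result, and
      // any nonzero high byte means a non-Latin1 char, in which case we
      // bail out to the slower loops.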
5628       if (SoftwarePrefetchHintDistance >= 0) {
5629         ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5630         subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5631         br(LE, NEXT_32_START);
5632         b(NEXT_32_PRFM_START);
5633         BIND(NEXT_32_PRFM);
5634           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5635         BIND(NEXT_32_PRFM_START);
5636           prfm(Address(src, SoftwarePrefetchHintDistance));
5637           orr(v4, T16B, Vtmp1, Vtmp2);
5638           orr(v5, T16B, Vtmp3, Vtmp4);
5639           uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
5640           uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
5641           stpq(Vtmp1, Vtmp3, dst);
5642           uzp2(v5, T16B, v4, v5); // high bytes
5643           umov(tmp2, v5, D, 1);
5644           fmovd(tmp1, v5);
5645           orr(tmp1, tmp1, tmp2);
5646           cbnz(tmp1, LOOP_8);
5647           sub(len, len, 32);
5648           add(dst, dst, 32);
5649           add(src, src, 64);
5650           subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5651           br(GE, NEXT_32_PRFM);
5652           cmp(len, (u1)32);
5653           br(LT, LOOP_8);
5654         BIND(NEXT_32);
5655           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5656         BIND(NEXT_32_START);
5657       } else {
5658         BIND(NEXT_32);
5659           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5660       }
5661       prfm(Address(src, SoftwarePrefetchHintDistance));
5662       uzp1(v4, T16B, Vtmp1, Vtmp2);
5663       uzp1(v5, T16B, Vtmp3, Vtmp4);
5664       stpq(v4, v5, dst);
5665       orr(Vtmp1, T16B, Vtmp1, Vtmp2);
5666       orr(Vtmp3, T16B, Vtmp3, Vtmp4);
5667       uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
5668       umov(tmp2, Vtmp1, D, 1);
5669       fmovd(tmp1, Vtmp1);
5670       orr(tmp1, tmp1, tmp2);
5671       cbnz(tmp1, LOOP_8);
5672       sub(len, len, 32);
5673       add(dst, dst, 32);
5674       add(src, src, 64);
5675       cmp(len, (u1)32);
5676       br(GE, NEXT_32);
5677       cbz(len, DONE);
5678 
5679     BIND(LOOP_8);
5680       cmp(len, (u1)8);
5681       br(LT, LOOP_1);
5682     BIND(NEXT_8);
5683       ld1(Vtmp1, T8H, src);
5684       uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
5685       uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
5686       strd(Vtmp2, dst);
5687       fmovd(tmp1, Vtmp3);
5688       cbnz(tmp1, NEXT_1);
5689 
5690       sub(len, len, 8);
5691       add(dst, dst, 8);
5692       add(src, src, 16);
5693       cmp(len, (u1)8);
5694       br(GE, NEXT_8);
5695 
5696     BIND(LOOP_1);
5697 #endif
5698     cbz(len, DONE);
5699     BIND(NEXT_1);
5700       ldrh(tmp1, Address(post(src, 2)));
5701       strb(tmp1, Address(post(dst, 1)));
5702       tst(tmp1, 0xff00);
5703       br(NE, SET_RESULT);
5704       subs(len, len, 1);
5705       br(GT, NEXT_1);
5706 
5707     BIND(SET_RESULT);
5708       sub(result, result, len); // Return index where we stopped
5709                                 // Return len == 0 if we processed all
5710                                 // characters
5711     BIND(DONE);
5712 }
5713 
5714 
5715 // Inflate byte[] array to char[].
5716 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5717                                         FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
5718                                         Register tmp4) {
5719   Label big, done, after_init, to_stub;
5720 
5721   assert_different_registers(src, dst, len, tmp4, rscratch1);
5722 
5723   fmovd(vtmp1, zr);
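  // vtmp1 is kept all-zero: zip1 with it interleaves each Latin1 byte with
  // a zero byte, widening bytes to UTF-16 halfwords (little-endian).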
5724   lsrw(tmp4, len, 3);
5725   bind(after_init);
5726   cbnzw(tmp4, big);
5727   // Short string: less than 8 bytes.
5728   {
5729     Label loop, tiny;
5730 
5731     cmpw(len, 4);
5732     br(LT, tiny);
5733     // Use SIMD to do 4 bytes.
5734     ldrs(vtmp2, post(src, 4));
5735     zip1(vtmp3, T8B, vtmp2, vtmp1);
5736     subw(len, len, 4);
5737     strd(vtmp3, post(dst, 8));
5738 
5739     cbzw(len, done);
5740 
    // Do the remaining bytes one at a time.
5742     bind(loop);
5743     ldrb(tmp4, post(src, 1));
5744     strh(tmp4, post(dst, 2));
5745     subw(len, len, 1);
5746 
5747     bind(tiny);
5748     cbnz(len, loop);
5749 
5750     b(done);
5751   }
5752 
5753   if (SoftwarePrefetchHintDistance >= 0) {
5754     bind(to_stub);
5755       RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5756       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5757       trampoline_call(stub);
5758       b(after_init);
5759   }
5760 
5761   // Unpack the bytes 8 at a time.
5762   bind(big);
5763   {
5764     Label loop, around, loop_last, loop_start;
5765 
5766     if (SoftwarePrefetchHintDistance >= 0) {
5767       const int large_loop_threshold = (64 + 16)/8;
5768       ldrd(vtmp2, post(src, 8));
5769       andw(len, len, 7);
5770       cmp(tmp4, (u1)large_loop_threshold);
5771       br(GE, to_stub);
5772       b(loop_start);
5773 
5774       bind(loop);
5775       ldrd(vtmp2, post(src, 8));
5776       bind(loop_start);
5777       subs(tmp4, tmp4, 1);
5778       br(EQ, loop_last);
5779       zip1(vtmp2, T16B, vtmp2, vtmp1);
5780       ldrd(vtmp3, post(src, 8));
5781       st1(vtmp2, T8H, post(dst, 16));
5782       subs(tmp4, tmp4, 1);
5783       zip1(vtmp3, T16B, vtmp3, vtmp1);
5784       st1(vtmp3, T8H, post(dst, 16));
5785       br(NE, loop);
5786       b(around);
5787       bind(loop_last);
5788       zip1(vtmp2, T16B, vtmp2, vtmp1);
5789       st1(vtmp2, T8H, post(dst, 16));
5790       bind(around);
5791       cbz(len, done);
5792     } else {
5793       andw(len, len, 7);
5794       bind(loop);
5795       ldrd(vtmp2, post(src, 8));
5796       sub(tmp4, tmp4, 1);
5797       zip1(vtmp3, T16B, vtmp2, vtmp1);
5798       st1(vtmp3, T8H, post(dst, 16));
5799       cbnz(tmp4, loop);
5800     }
5801   }
5802 
5803   // Do the tail of up to 8 bytes.
5804   add(src, src, len);
5805   ldrd(vtmp3, Address(src, -8));
5806   add(dst, dst, len, ext::uxtw, 1);
5807   zip1(vtmp3, T16B, vtmp3, vtmp1);
5808   strq(vtmp3, Address(dst, -16));
5809 
5810   bind(done);
5811 }
5812 
5813 // Compress char[] array to byte[].
5814 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5815                                          FloatRegister tmp1Reg, FloatRegister tmp2Reg,
5816                                          FloatRegister tmp3Reg, FloatRegister tmp4Reg,
5817                                          Register result) {
5818   encode_iso_array(src, dst, len, result,
5819                    tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
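  // encode_iso_array leaves the number of unprocessed characters in len.
  // The compress contract returns 0 on failure, so keep result only when
  // every character was Latin1 (len == 0).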
5820   cmp(len, zr);
5821   csel(result, result, zr, EQ);
5822 }
5823 
5824 // get_thread() can be called anywhere inside generated code so we
5825 // need to save whatever non-callee save context might get clobbered
5826 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5827 // the call setup code.
5828 //
5829 // aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5830 //
5831 void MacroAssembler::get_thread(Register dst) {
5832   RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
5833   push(saved_regs, sp);
5834 
5835   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5836   blrt(lr, 1, 0, 1);
5837   if (dst != c_rarg0) {
5838     mov(dst, c_rarg0);
5839   }
5840 
5841   pop(saved_regs, sp);
5842 }