1 /*
   2  * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include <sys/types.h>
  27 
  28 #include "precompiled.hpp"
  29 #include "jvm.h"
  30 #include "asm/assembler.hpp"
  31 #include "asm/assembler.inline.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/cardTable.hpp"
  34 #include "gc/shared/barrierSetAssembler.hpp"
  35 #include "gc/shared/cardTableBarrierSet.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "compiler/disassembler.hpp"
  38 #include "memory/resourceArea.hpp"
  39 #include "memory/universe.hpp"
  40 #include "nativeInst_aarch64.hpp"
  41 #include "oops/accessDecorators.hpp"
  42 #include "oops/compressedOops.inline.hpp"
  43 #include "oops/klass.inline.hpp"
  44 #include "runtime/biasedLocking.hpp"
  45 #include "runtime/icache.hpp"
  46 #include "runtime/interfaceSupport.inline.hpp"
  47 #include "runtime/jniHandles.inline.hpp"
  48 #include "runtime/sharedRuntime.hpp"
  49 #include "runtime/signature_cc.hpp"
  50 #include "runtime/thread.hpp"
  51 #ifdef COMPILER1
  52 #include "c1/c1_LIRAssembler.hpp"
  53 #endif
  54 #ifdef COMPILER2
  55 #include "oops/oop.hpp"
  56 #include "opto/compile.hpp"
  57 #include "opto/intrinsicnode.hpp"
  58 #include "opto/node.hpp"
  59 #endif
  60 
  61 #ifdef PRODUCT
  62 #define BLOCK_COMMENT(str) /* nothing */
  63 #define STOP(error) stop(error)
  64 #else
  65 #define BLOCK_COMMENT(str) block_comment(str)
  66 #define STOP(error) block_comment(error); stop(error)
  67 #endif
  68 
  69 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  70 
  71 // Patch any kind of instruction; there may be several instructions.
  72 // Return the total length (in bytes) of the instructions.
  73 int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  74   int instructions = 1;
  75   assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  76   long offset = (target - branch) >> 2;
  77   unsigned insn = *(unsigned*)branch;
  78   if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
  79     // Load register (literal)
  80     Instruction_aarch64::spatch(branch, 23, 5, offset);
  81   } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
  82     // Unconditional branch (immediate)
  83     Instruction_aarch64::spatch(branch, 25, 0, offset);
  84   } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
  85     // Conditional branch (immediate)
  86     Instruction_aarch64::spatch(branch, 23, 5, offset);
  87   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
  88     // Compare & branch (immediate)
  89     Instruction_aarch64::spatch(branch, 23, 5, offset);
  90   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
  91     // Test & branch (immediate)
  92     Instruction_aarch64::spatch(branch, 18, 5, offset);
  93   } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
  94     // PC-rel. addressing
  95     offset = target-branch;
  96     int shift = Instruction_aarch64::extract(insn, 31, 31);
  97     if (shift) {
  98       u_int64_t dest = (u_int64_t)target;
  99       uint64_t pc_page = (uint64_t)branch >> 12;
 100       uint64_t adr_page = (uint64_t)target >> 12;
 101       unsigned offset_lo = dest & 0xfff;
 102       offset = adr_page - pc_page;
 103 
 104       // We handle 4 types of PC relative addressing
 105       //   1 - adrp    Rx, target_page
 106       //       ldr/str Ry, [Rx, #offset_in_page]
 107       //   2 - adrp    Rx, target_page
 108       //       add     Ry, Rx, #offset_in_page
 109       //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 110       //       movk    Rx, #imm16<<32
 111       //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 112       // In the first 3 cases we must check that Rx is the same in the adrp and the
 113       // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
 114       // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
 115       // to be followed by a random unrelated ldr/str, add or movk instruction.
 116       //
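      // Illustrative example (hypothetical addresses): with branch at
      // 0x1000_1000 and target at 0x1000_3008, adr_page - pc_page == 2, so
      // the adrp immediate is patched to a delta of 2 pages, while the
      // companion ldr/str, add or movk of cases 1-3 receives the low 12 bits
      // of the target (0x008), scaled by the access size for ldr/str.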
 117       unsigned insn2 = ((unsigned*)branch)[1];
 118       if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
 119                 Instruction_aarch64::extract(insn, 4, 0) ==
 120                         Instruction_aarch64::extract(insn2, 9, 5)) {
 121         // Load/store register (unsigned immediate)
 122         unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
 123         Instruction_aarch64::patch(branch + sizeof (unsigned),
 124                                     21, 10, offset_lo >> size);
 125         guarantee(((dest >> size) << size) == dest, "misaligned target");
 126         instructions = 2;
 127       } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
 128                 Instruction_aarch64::extract(insn, 4, 0) ==
 129                         Instruction_aarch64::extract(insn2, 4, 0)) {
 130         // add (immediate)
 131         Instruction_aarch64::patch(branch + sizeof (unsigned),
 132                                    21, 10, offset_lo);
 133         instructions = 2;
 134       } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
 135                    Instruction_aarch64::extract(insn, 4, 0) ==
 136                      Instruction_aarch64::extract(insn2, 4, 0)) {
 137         // movk #imm16<<32
 138         Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
 139         long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
 140         long pc_page = (long)branch >> 12;
 141         long adr_page = (long)dest >> 12;
 142         offset = adr_page - pc_page;
 143         instructions = 2;
 144       }
 145     }
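    // ADR/ADRP split the immediate across the instruction: the low two bits
    // go into immlo (bits 30:29) and the remaining bits into immhi
    // (bits 23:5), which is what the two patches below fill in.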
 146     int offset_lo = offset & 3;
 147     offset >>= 2;
 148     Instruction_aarch64::spatch(branch, 23, 5, offset);
 149     Instruction_aarch64::patch(branch, 30, 29, offset_lo);
 150   } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
 151     u_int64_t dest = (u_int64_t)target;
 152     // Move wide constant
 153     assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
 154     assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
 155     Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
 156     Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
 157     Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
 158     assert(target_addr_for_insn(branch) == target, "should be");
 159     instructions = 3;
 160   } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
 161              Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
 162     // nothing to do
 163     assert(target == 0, "did not expect to relocate target for polling page load");
 164   } else {
 165     ShouldNotReachHere();
 166   }
 167   return instructions * NativeInstruction::instruction_size;
 168 }
 169 
 170 int MacroAssembler::patch_oop(address insn_addr, address o) {
 171   int instructions;
 172   unsigned insn = *(unsigned*)insn_addr;
 173   assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
 174 
 175   // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
 176   // narrow OOPs by setting the upper 16 bits in the first
 177   // instruction.
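  // For example (illustrative value): a narrow oop whose encoding is
  // 0x12345678 is patched as 0x1234 into the first instruction and 0x5678
  // into the following movk; a wide oop is spread over three instructions,
  // 16 bits at a time, lowest half-word first.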
 178   if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
 179     // Move narrow OOP
 180     narrowOop n = CompressedOops::encode((oop)o);
 181     Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
 182     Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
 183     instructions = 2;
 184   } else {
 185     // Move wide OOP
 186     assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
 187     uintptr_t dest = (uintptr_t)o;
 188     Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
 189     Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
 190     Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
 191     instructions = 3;
 192   }
 193   return instructions * NativeInstruction::instruction_size;
 194 }
 195 
 196 int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
 198   // We encode narrow ones by setting the upper 16 bits in the first
 199   // instruction.
 200   NativeInstruction *insn = nativeInstruction_at(insn_addr);
 201   assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
 202          nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
 203 
 204   Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
 205   Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
 206   return 2 * NativeInstruction::instruction_size;
 207 }
 208 
 209 address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
 210   long offset = 0;
 211   if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
 212     // Load register (literal)
 213     offset = Instruction_aarch64::sextract(insn, 23, 5);
 214     return address(((uint64_t)insn_addr + (offset << 2)));
 215   } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
 216     // Unconditional branch (immediate)
 217     offset = Instruction_aarch64::sextract(insn, 25, 0);
 218   } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
 219     // Conditional branch (immediate)
 220     offset = Instruction_aarch64::sextract(insn, 23, 5);
 221   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
 222     // Compare & branch (immediate)
 223     offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
 225     // Test & branch (immediate)
 226     offset = Instruction_aarch64::sextract(insn, 18, 5);
 227   } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
 228     // PC-rel. addressing
 229     offset = Instruction_aarch64::extract(insn, 30, 29);
 230     offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
 231     int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
 232     if (shift) {
 233       offset <<= shift;
 234       uint64_t target_page = ((uint64_t)insn_addr) + offset;
 235       target_page &= ((uint64_t)-1) << shift;
 236       // Return the target address for the following sequences
 237       //   1 - adrp    Rx, target_page
 238       //       ldr/str Ry, [Rx, #offset_in_page]
 239       //   2 - adrp    Rx, target_page
 240       //       add     Ry, Rx, #offset_in_page
 241       //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
 243       //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 244       //
 245       // In the first two cases  we check that the register is the same and
 246       // return the target_page + the offset within the page.
 247       // Otherwise we assume it is a page aligned relocation and return
 248       // the target page only.
 249       //
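      // In case 3 the movk supplies bits 47:32 of the destination: the code
      // below keeps the low 32 bits of the page-aligned result from the adrp
      // and takes the upper 16 bits from the movk immediate.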
 250       unsigned insn2 = ((unsigned*)insn_addr)[1];
 251       if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
 252                 Instruction_aarch64::extract(insn, 4, 0) ==
 253                         Instruction_aarch64::extract(insn2, 9, 5)) {
 254         // Load/store register (unsigned immediate)
 255         unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
 256         unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
 257         return address(target_page + (byte_offset << size));
 258       } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
 259                 Instruction_aarch64::extract(insn, 4, 0) ==
 260                         Instruction_aarch64::extract(insn2, 4, 0)) {
 261         // add (immediate)
 262         unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
 263         return address(target_page + byte_offset);
 264       } else {
 265         if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110  &&
 266                Instruction_aarch64::extract(insn, 4, 0) ==
 267                  Instruction_aarch64::extract(insn2, 4, 0)) {
 268           target_page = (target_page & 0xffffffff) |
 269                          ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
 270         }
 271         return (address)target_page;
 272       }
 273     } else {
 274       ShouldNotReachHere();
 275     }
 276   } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
 277     u_int32_t *insns = (u_int32_t *)insn_addr;
 278     // Move wide constant: movz, movk, movk.  See movptr().
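    // For example (illustrative value): a 48-bit constant 0x1234_5678_9abc is
    // emitted as movz #0x9abc, movk #0x5678 lsl 16, movk #0x1234 lsl 32, so
    // the address is reassembled below from the three 16-bit immediates.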
 279     assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
 280     assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
 281     return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
 282                    + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
 283                    + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
 284   } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
 285              Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
 286     return 0;
 287   } else {
 288     ShouldNotReachHere();
 289   }
 290   return address(((uint64_t)insn_addr + (offset << 2)));
 291 }
 292 
 293 void MacroAssembler::safepoint_poll(Label& slow_path) {
 294   if (SafepointMechanism::uses_thread_local_poll()) {
 295     ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
 296     tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
 297   } else {
 298     unsigned long offset;
 299     adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
 300     ldrw(rscratch1, Address(rscratch1, offset));
 301     assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
 302     cbnz(rscratch1, slow_path);
 303   }
 304 }
 305 
 306 // Just like safepoint_poll, but use an acquiring load for thread-
 307 // local polling.
 308 //
 309 // We need an acquire here to ensure that any subsequent load of the
 310 // global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling_page.  We don't want this poll to
 312 // return false (i.e. not safepointing) and a later poll of the global
 313 // SafepointSynchronize::_state spuriously to return true.
 314 //
 315 // This is to avoid a race when we're in a native->Java transition
 316 // racing the code which wakes up from a safepoint.
 317 //
 318 void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
 319   if (SafepointMechanism::uses_thread_local_poll()) {
 320     lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
 321     ldar(rscratch1, rscratch1);
 322     tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
 323   } else {
 324     safepoint_poll(slow_path);
 325   }
 326 }
 327 
 328 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 329   // we must set sp to zero to clear frame
 330   str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));
 331 
 332   // must clear fp, so that compiled frames are not confused; it is
 333   // possible that we need it only for debugging
 334   if (clear_fp) {
 335     str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
 336   }
 337 
 338   // Always clear the pc because it could have been set by make_walkable()
 339   str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
 340 }
 341 
 342 // Calls to C land
 343 //
// When entering C land, the rfp & sp of the last Java frame have to be recorded
 345 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
 346 // has to be reset to 0. This is required to allow proper stack traversal.
 347 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 348                                          Register last_java_fp,
 349                                          Register last_java_pc,
 350                                          Register scratch) {
 351 
  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }
 357 
 358   // determine last_java_sp register
 359   if (last_java_sp == sp) {
 360     mov(scratch, sp);
 361     last_java_sp = scratch;
 362   } else if (!last_java_sp->is_valid()) {
 363     last_java_sp = esp;
 364   }
 365 
 366   str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));
 367 
 368   // last_java_fp is optional
 369   if (last_java_fp->is_valid()) {
 370     str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
 371   }
 372 }
 373 
 374 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 375                                          Register last_java_fp,
 376                                          address  last_java_pc,
 377                                          Register scratch) {
 378   assert(last_java_pc != NULL, "must provide a valid PC");
 379 
 380   adr(scratch, last_java_pc);
 381   str(scratch, Address(rthread,
 382                        JavaThread::frame_anchor_offset()
 383                        + JavaFrameAnchor::last_Java_pc_offset()));
 384 
 385   set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
 386 }
 387 
 388 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 389                                          Register last_java_fp,
 390                                          Label &L,
 391                                          Register scratch) {
 392   if (L.is_bound()) {
 393     set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
 394   } else {
 395     InstructionMark im(this);
 396     L.add_patch_at(code(), locator());
 397     set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
 398   }
 399 }
 400 
 401 void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
 402   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
 403   assert(CodeCache::find_blob(entry.target()) != NULL,
 404          "destination of far call not found in code cache");
 405   if (far_branches()) {
 406     unsigned long offset;
 407     // We can use ADRP here because we know that the total size of
 408     // the code cache cannot exceed 2Gb.
 409     adrp(tmp, entry, offset);
 410     add(tmp, tmp, offset);
 411     if (cbuf) cbuf->set_insts_mark();
 412     blr(tmp);
 413   } else {
 414     if (cbuf) cbuf->set_insts_mark();
 415     bl(entry);
 416   }
 417 }
 418 
 419 void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
 420   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
 423   if (far_branches()) {
 424     unsigned long offset;
 425     // We can use ADRP here because we know that the total size of
 426     // the code cache cannot exceed 2Gb.
 427     adrp(tmp, entry, offset);
 428     add(tmp, tmp, offset);
 429     if (cbuf) cbuf->set_insts_mark();
 430     br(tmp);
 431   } else {
 432     if (cbuf) cbuf->set_insts_mark();
 433     b(entry);
 434   }
 435 }
 436 
 437 void MacroAssembler::reserved_stack_check() {
 438     // testing if reserved zone needs to be enabled
 439     Label no_reserved_zone_enabling;
 440 
 441     ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
 442     cmp(sp, rscratch1);
 443     br(Assembler::LO, no_reserved_zone_enabling);
 444 
 445     enter();   // LR and FP are live.
 446     lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
 447     mov(c_rarg0, rthread);
 448     blr(rscratch1);
 449     leave();
 450 
 451     // We have already removed our own frame.
 452     // throw_delayed_StackOverflowError will think that it's been
 453     // called by our caller.
 454     lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
 455     br(rscratch1);
 456     should_not_reach_here();
 457 
 458     bind(no_reserved_zone_enabling);
 459 }
 460 
 461 int MacroAssembler::biased_locking_enter(Register lock_reg,
 462                                          Register obj_reg,
 463                                          Register swap_reg,
 464                                          Register tmp_reg,
 465                                          bool swap_reg_contains_mark,
 466                                          Label& done,
 467                                          Label* slow_case,
 468                                          BiasedLockingCounters* counters) {
 469   assert(UseBiasedLocking, "why call this otherwise?");
 470   assert_different_registers(lock_reg, obj_reg, swap_reg);
 471 
 472   if (PrintBiasedLockingStatistics && counters == NULL)
 473     counters = BiasedLocking::counters();
 474 
 475   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
 476   assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, "biased locking makes assumptions about bit layout");
 477   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
 478   Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
 479   Address saved_mark_addr(lock_reg, 0);
 480 
 481   // Biased locking
 482   // See whether the lock is currently biased toward our thread and
 483   // whether the epoch is still valid
 484   // Note that the runtime guarantees sufficient alignment of JavaThread
 485   // pointers to allow age to be placed into low bits
 486   // First check to see whether biasing is even enabled for this object
 487   Label cas_label;
 488   int null_check_offset = -1;
 489   if (!swap_reg_contains_mark) {
 490     null_check_offset = offset();
 491     ldr(swap_reg, mark_addr);
 492   }
 493   andr(tmp_reg, swap_reg, markWord::biased_lock_mask_in_place);
 494   cmp(tmp_reg, (u1)markWord::biased_lock_pattern);
 495   br(Assembler::NE, cas_label);
 496   // The bias pattern is present in the object's header. Need to check
 497   // whether the bias owner and the epoch are both still current.
 498   load_prototype_header(tmp_reg, obj_reg);
 499   orr(tmp_reg, tmp_reg, rthread);
 500   eor(tmp_reg, swap_reg, tmp_reg);
 501   andr(tmp_reg, tmp_reg, ~((int) markWord::age_mask_in_place));
 502   if (counters != NULL) {
 503     Label around;
 504     cbnz(tmp_reg, around);
 505     atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
 506     b(done);
 507     bind(around);
 508   } else {
 509     cbz(tmp_reg, done);
 510   }
 511 
 512   Label try_revoke_bias;
 513   Label try_rebias;
 514 
 515   // At this point we know that the header has the bias pattern and
 516   // that we are not the bias owner in the current epoch. We need to
 517   // figure out more details about the state of the header in order to
 518   // know what operations can be legally performed on the object's
 519   // header.
 520 
 521   // If the low three bits in the xor result aren't clear, that means
 522   // the prototype header is no longer biased and we have to revoke
 523   // the bias on this object.
 524   andr(rscratch1, tmp_reg, markWord::biased_lock_mask_in_place);
 525   cbnz(rscratch1, try_revoke_bias);
 526 
 527   // Biasing is still enabled for this data type. See whether the
 528   // epoch of the current bias is still valid, meaning that the epoch
 529   // bits of the mark word are equal to the epoch bits of the
 530   // prototype header. (Note that the prototype header's epoch bits
 531   // only change at a safepoint.) If not, attempt to rebias the object
 532   // toward the current thread. Note that we must be absolutely sure
 533   // that the current epoch is invalid in order to do this because
 534   // otherwise the manipulations it performs on the mark word are
 535   // illegal.
 536   andr(rscratch1, tmp_reg, markWord::epoch_mask_in_place);
 537   cbnz(rscratch1, try_rebias);
 538 
 539   // The epoch of the current bias is still valid but we know nothing
 540   // about the owner; it might be set or it might be clear. Try to
 541   // acquire the bias of the object using an atomic operation. If this
 542   // fails we will go in to the runtime to revoke the object's bias.
 543   // Note that we first construct the presumed unbiased header so we
 544   // don't accidentally blow away another thread's valid bias.
 545   {
 546     Label here;
 547     mov(rscratch1, markWord::biased_lock_mask_in_place | markWord::age_mask_in_place | markWord::epoch_mask_in_place);
 548     andr(swap_reg, swap_reg, rscratch1);
 549     orr(tmp_reg, swap_reg, rthread);
 550     cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
 551     // If the biasing toward our thread failed, this means that
 552     // another thread succeeded in biasing it toward itself and we
 553     // need to revoke that bias. The revocation will occur in the
 554     // interpreter runtime in the slow case.
 555     bind(here);
 556     if (counters != NULL) {
 557       atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
 558                   tmp_reg, rscratch1, rscratch2);
 559     }
 560   }
 561   b(done);
 562 
 563   bind(try_rebias);
 564   // At this point we know the epoch has expired, meaning that the
 565   // current "bias owner", if any, is actually invalid. Under these
 566   // circumstances _only_, we are allowed to use the current header's
 567   // value as the comparison value when doing the cas to acquire the
 568   // bias in the current epoch. In other words, we allow transfer of
 569   // the bias from one thread to another directly in this situation.
 570   //
 571   // FIXME: due to a lack of registers we currently blow away the age
 572   // bits in this situation. Should attempt to preserve them.
 573   {
 574     Label here;
 575     load_prototype_header(tmp_reg, obj_reg);
 576     orr(tmp_reg, rthread, tmp_reg);
 577     cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
 578     // If the biasing toward our thread failed, then another thread
 579     // succeeded in biasing it toward itself and we need to revoke that
 580     // bias. The revocation will occur in the runtime in the slow case.
 581     bind(here);
 582     if (counters != NULL) {
 583       atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
 584                   tmp_reg, rscratch1, rscratch2);
 585     }
 586   }
 587   b(done);
 588 
 589   bind(try_revoke_bias);
 590   // The prototype mark in the klass doesn't have the bias bit set any
 591   // more, indicating that objects of this data type are not supposed
 592   // to be biased any more. We are going to try to reset the mark of
 593   // this object to the prototype value and fall through to the
 594   // CAS-based locking scheme. Note that if our CAS fails, it means
 595   // that another thread raced us for the privilege of revoking the
 596   // bias of this particular object, so it's okay to continue in the
 597   // normal locking code.
 598   //
 599   // FIXME: due to a lack of registers we currently blow away the age
 600   // bits in this situation. Should attempt to preserve them.
 601   {
 602     Label here, nope;
 603     load_prototype_header(tmp_reg, obj_reg);
 604     cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
 605     bind(here);
 606 
 607     // Fall through to the normal CAS-based lock, because no matter what
 608     // the result of the above CAS, some thread must have succeeded in
 609     // removing the bias bit from the object's header.
 610     if (counters != NULL) {
 611       atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
 612                   rscratch1, rscratch2);
 613     }
 614     bind(nope);
 615   }
 616 
 617   bind(cas_label);
 618 
 619   return null_check_offset;
 620 }
 621 
 622 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
 623   assert(UseBiasedLocking, "why call this otherwise?");
 624 
 625   // Check for biased locking unlock case, which is a no-op
 626   // Note: we do not have to check the thread ID for two reasons.
 627   // First, the interpreter checks for IllegalMonitorStateException at
 628   // a higher level. Second, if the bias was revoked while we held the
 629   // lock, the object could not be rebiased toward another thread, so
 630   // the bias bit would be clear.
 631   ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
 632   andr(temp_reg, temp_reg, markWord::biased_lock_mask_in_place);
 633   cmp(temp_reg, (u1)markWord::biased_lock_pattern);
 634   br(Assembler::EQ, done);
 635 }
 636 
 637 static void pass_arg0(MacroAssembler* masm, Register arg) {
 638   if (c_rarg0 != arg ) {
 639     masm->mov(c_rarg0, arg);
 640   }
 641 }
 642 
 643 static void pass_arg1(MacroAssembler* masm, Register arg) {
 644   if (c_rarg1 != arg ) {
 645     masm->mov(c_rarg1, arg);
 646   }
 647 }
 648 
 649 static void pass_arg2(MacroAssembler* masm, Register arg) {
 650   if (c_rarg2 != arg ) {
 651     masm->mov(c_rarg2, arg);
 652   }
 653 }
 654 
 655 static void pass_arg3(MacroAssembler* masm, Register arg) {
 656   if (c_rarg3 != arg ) {
 657     masm->mov(c_rarg3, arg);
 658   }
 659 }
 660 
 661 void MacroAssembler::call_VM_base(Register oop_result,
 662                                   Register java_thread,
 663                                   Register last_java_sp,
 664                                   address  entry_point,
 665                                   int      number_of_arguments,
 666                                   bool     check_exceptions) {
 667    // determine java_thread register
 668   if (!java_thread->is_valid()) {
 669     java_thread = rthread;
 670   }
 671 
 672   // determine last_java_sp register
 673   if (!last_java_sp->is_valid()) {
 674     last_java_sp = esp;
 675   }
 676 
 677   // debugging support
 678   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
 679   assert(java_thread == rthread, "unexpected register");
 680 #ifdef ASSERT
 681   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
 682   // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
 683 #endif // ASSERT
 684 
 685   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
 686   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
 687 
 688   // push java thread (becomes first argument of C function)
 689 
 690   mov(c_rarg0, java_thread);
 691 
 692   // set last Java frame before call
 693   assert(last_java_sp != rfp, "can't use rfp");
 694 
 695   Label l;
 696   set_last_Java_frame(last_java_sp, rfp, l, rscratch1);
 697 
 698   // do the call, remove parameters
 699   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
 700 
 701   // reset last Java frame
 702   // Only interpreter should have to clear fp
 703   reset_last_Java_frame(true);
 704 
 705    // C++ interp handles this in the interpreter
 706   check_and_handle_popframe(java_thread);
 707   check_and_handle_earlyret(java_thread);
 708 
 709   if (check_exceptions) {
 710     // check for pending exceptions (java_thread is set upon return)
 711     ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
 712     Label ok;
 713     cbz(rscratch1, ok);
 714     lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
 715     br(rscratch1);
 716     bind(ok);
 717   }
 718 
 719   // get oop result if there is one and reset the value in the thread
 720   if (oop_result->is_valid()) {
 721     get_vm_result(oop_result, java_thread);
 722   }
 723 }
 724 
 725 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
 726   call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
 727 }
 728 
 729 // Maybe emit a call via a trampoline.  If the code cache is small
 730 // trampolines won't be emitted.
 731 
 732 address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
 733   assert(JavaThread::current()->is_Compiler_thread(), "just checking");
 734   assert(entry.rspec().type() == relocInfo::runtime_call_type
 735          || entry.rspec().type() == relocInfo::opt_virtual_call_type
 736          || entry.rspec().type() == relocInfo::static_call_type
 737          || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
 738 
 739   // We need a trampoline if branches are far.
 740   if (far_branches()) {
 741     bool in_scratch_emit_size = false;
 742 #ifdef COMPILER2
 743     // We don't want to emit a trampoline if C2 is generating dummy
 744     // code during its branch shortening phase.
 745     CompileTask* task = ciEnv::current()->task();
 746     in_scratch_emit_size =
 747       (task != NULL && is_c2_compile(task->comp_level()) &&
 748        Compile::current()->in_scratch_emit_size());
 749 #endif
 750     if (!in_scratch_emit_size) {
 751       address stub = emit_trampoline_stub(offset(), entry.target());
 752       if (stub == NULL) {
 753         return NULL; // CodeCache is full
 754       }
 755     }
 756   }
 757 
 758   if (cbuf) cbuf->set_insts_mark();
 759   relocate(entry.rspec());
 760   if (!far_branches()) {
 761     bl(entry.target());
 762   } else {
 763     bl(pc());
 764   }
 765   // just need to return a non-null address
 766   return pc();
 767 }
 768 
 769 
 770 // Emit a trampoline stub for a call to a target which is too far away.
 771 //
 772 // code sequences:
 773 //
 774 // call-site:
 775 //   branch-and-link to <destination> or <trampoline stub>
 776 //
 777 // Related trampoline stub for this call site in the stub section:
 778 //   load the call target from the constant pool
 779 //   branch (LR still points to the call site above)
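//
// A sketch of the emitted stub (layout only, not additional code):
//   ldr  rscratch1, <data word below>   ; load the destination address
//   br   rscratch1                      ; LR still points to the call site
//   .8byte <destination>                ; 64-bit literal read by the ldr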
 780 
 781 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
 782                                              address dest) {
 783   // Max stub size: alignment nop, TrampolineStub.
 784   address stub = start_a_stub(NativeInstruction::instruction_size
 785                    + NativeCallTrampolineStub::instruction_size);
 786   if (stub == NULL) {
 787     return NULL;  // CodeBuffer::expand failed
 788   }
 789 
 790   // Create a trampoline stub relocation which relates this trampoline stub
 791   // with the call instruction at insts_call_instruction_offset in the
 792   // instructions code-section.
 793   align(wordSize);
 794   relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
 795                                             + insts_call_instruction_offset));
 796   const int stub_start_offset = offset();
 797 
 798   // Now, create the trampoline stub's code:
  // - load the destination address from the data word that follows
  // - branch to it
 801   Label target;
 802   ldr(rscratch1, target);
 803   br(rscratch1);
 804   bind(target);
 805   assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
 806          "should be");
 807   emit_int64((int64_t)dest);
 808 
 809   const address stub_start_addr = addr_at(stub_start_offset);
 810 
 811   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
 812 
 813   end_a_stub();
 814   return stub_start_addr;
 815 }
 816 
 817 void MacroAssembler::emit_static_call_stub() {
 818   // CompiledDirectStaticCall::set_to_interpreted knows the
 819   // exact layout of this stub.
 820 
 821   isb();
 822   mov_metadata(rmethod, (Metadata*)NULL);
 823 
 824   // Jump to the entry point of the i2c stub.
 825   movptr(rscratch1, 0);
 826   br(rscratch1);
 827 }
 828 
 829 void MacroAssembler::c2bool(Register x) {
 830   // implements x == 0 ? 0 : 1
  // note: must only look at the least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
 834   tst(x, 0xff);
 835   cset(x, Assembler::NE);
 836 }
 837 
 838 address MacroAssembler::ic_call(address entry, jint method_index) {
 839   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
 840   // address const_ptr = long_constant((jlong)Universe::non_oop_word());
 841   // unsigned long offset;
 842   // ldr_constant(rscratch2, const_ptr);
 843   movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
 844   return trampoline_call(Address(entry, rh));
 845 }
 846 
 847 // Implementation of call_VM versions
 848 
 849 void MacroAssembler::call_VM(Register oop_result,
 850                              address entry_point,
 851                              bool check_exceptions) {
 852   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
 853 }
 854 
 855 void MacroAssembler::call_VM(Register oop_result,
 856                              address entry_point,
 857                              Register arg_1,
 858                              bool check_exceptions) {
 859   pass_arg1(this, arg_1);
 860   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
 861 }
 862 
 863 void MacroAssembler::call_VM(Register oop_result,
 864                              address entry_point,
 865                              Register arg_1,
 866                              Register arg_2,
 867                              bool check_exceptions) {
 868   assert(arg_1 != c_rarg2, "smashed arg");
 869   pass_arg2(this, arg_2);
 870   pass_arg1(this, arg_1);
 871   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
 872 }
 873 
 874 void MacroAssembler::call_VM(Register oop_result,
 875                              address entry_point,
 876                              Register arg_1,
 877                              Register arg_2,
 878                              Register arg_3,
 879                              bool check_exceptions) {
 880   assert(arg_1 != c_rarg3, "smashed arg");
 881   assert(arg_2 != c_rarg3, "smashed arg");
 882   pass_arg3(this, arg_3);
 883 
 884   assert(arg_1 != c_rarg2, "smashed arg");
 885   pass_arg2(this, arg_2);
 886 
 887   pass_arg1(this, arg_1);
 888   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
 889 }
 890 
 891 void MacroAssembler::call_VM(Register oop_result,
 892                              Register last_java_sp,
 893                              address entry_point,
 894                              int number_of_arguments,
 895                              bool check_exceptions) {
 896   call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
 897 }
 898 
 899 void MacroAssembler::call_VM(Register oop_result,
 900                              Register last_java_sp,
 901                              address entry_point,
 902                              Register arg_1,
 903                              bool check_exceptions) {
 904   pass_arg1(this, arg_1);
 905   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
 906 }
 907 
 908 void MacroAssembler::call_VM(Register oop_result,
 909                              Register last_java_sp,
 910                              address entry_point,
 911                              Register arg_1,
 912                              Register arg_2,
 913                              bool check_exceptions) {
 914 
 915   assert(arg_1 != c_rarg2, "smashed arg");
 916   pass_arg2(this, arg_2);
 917   pass_arg1(this, arg_1);
 918   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
 919 }
 920 
 921 void MacroAssembler::call_VM(Register oop_result,
 922                              Register last_java_sp,
 923                              address entry_point,
 924                              Register arg_1,
 925                              Register arg_2,
 926                              Register arg_3,
 927                              bool check_exceptions) {
 928   assert(arg_1 != c_rarg3, "smashed arg");
 929   assert(arg_2 != c_rarg3, "smashed arg");
 930   pass_arg3(this, arg_3);
 931   assert(arg_1 != c_rarg2, "smashed arg");
 932   pass_arg2(this, arg_2);
 933   pass_arg1(this, arg_1);
 934   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
 935 }
 936 
 937 
 938 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
 939   ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
 940   str(zr, Address(java_thread, JavaThread::vm_result_offset()));
 941   verify_oop(oop_result, "broken oop in call_VM_base");
 942 }
 943 
 944 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
 945   ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
 946   str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
 947 }
 948 
 949 void MacroAssembler::align(int modulus) {
 950   while (offset() % modulus != 0) nop();
 951 }
 952 
 953 // these are no-ops overridden by InterpreterMacroAssembler
 954 
 955 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }
 956 
 957 void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
 958 
 959 
 960 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 961                                                       Register tmp,
 962                                                       int offset) {
 963   intptr_t value = *delayed_value_addr;
 964   if (value != 0)
 965     return RegisterOrConstant(value + offset);
 966 
 967   // load indirectly to solve generation ordering problem
 968   ldr(tmp, ExternalAddress((address) delayed_value_addr));
 969 
 970   if (offset != 0)
 971     add(tmp, tmp, offset);
 972 
 973   return RegisterOrConstant(tmp);
 974 }
 975 
 976 // Look up the method for a megamorphic invokeinterface call.
 977 // The target method is determined by <intf_klass, itable_index>.
 978 // The receiver klass is in recv_klass.
 979 // On success, the result will be in method_result, and execution falls through.
 980 // On failure, execution transfers to the given label.
 981 void MacroAssembler::lookup_interface_method(Register recv_klass,
 982                                              Register intf_klass,
 983                                              RegisterOrConstant itable_index,
 984                                              Register method_result,
 985                                              Register scan_temp,
 986                                              Label& L_no_such_interface,
 987                          bool return_method) {
 988   assert_different_registers(recv_klass, intf_klass, scan_temp);
 989   assert_different_registers(method_result, intf_klass, scan_temp);
 990   assert(recv_klass != method_result || !return_method,
 991      "recv_klass can be destroyed when method isn't needed");
 992   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
 993          "caller must use same register for non-constant itable index as for method");
 994 
 995   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
 996   int vtable_base = in_bytes(Klass::vtable_start_offset());
 997   int itentry_off = itableMethodEntry::method_offset_in_bytes();
 998   int scan_step   = itableOffsetEntry::size() * wordSize;
 999   int vte_size    = vtableEntry::size_in_bytes();
1000   assert(vte_size == wordSize, "else adjust times_vte_scale");
1001 
1002   ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
1003 
1004   // %%% Could store the aligned, prescaled offset in the klassoop.
1005   // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
1006   lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
1007   add(scan_temp, scan_temp, vtable_base);
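  // scan_temp now points at the first itableOffsetEntry, i.e. at
  // recv_klass + vtable_start_offset + vtable_length * wordSize.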
1008 
1009   if (return_method) {
1010     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1011     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1012     // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
1013     lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
1014     if (itentry_off)
1015       add(recv_klass, recv_klass, itentry_off);
1016   }
1017 
1018   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1019   //   if (scan->interface() == intf) {
1020   //     result = (klass + scan->offset() + itable_index);
1021   //   }
1022   // }
1023   Label search, found_method;
1024 
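  // The scan loop is emitted twice ("peeled"): the first copy branches to
  // found_method on an immediate hit, while the second copy forms the loop
  // proper, branching back to search on a miss and falling through to
  // found_method on a hit.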
1025   for (int peel = 1; peel >= 0; peel--) {
1026     ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
1027     cmp(intf_klass, method_result);
1028 
1029     if (peel) {
1030       br(Assembler::EQ, found_method);
1031     } else {
1032       br(Assembler::NE, search);
1033       // (invert the test to fall through to found_method...)
1034     }
1035 
1036     if (!peel)  break;
1037 
1038     bind(search);
1039 
1040     // Check that the previous entry is non-null.  A null entry means that
1041     // the receiver class doesn't implement the interface, and wasn't the
1042     // same as when the caller was compiled.
1043     cbz(method_result, L_no_such_interface);
1044     add(scan_temp, scan_temp, scan_step);
1045   }
1046 
1047   bind(found_method);
1048 
1049   // Got a hit.
1050   if (return_method) {
1051     ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
1052     ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
1053   }
1054 }
1055 
1056 // virtual method calling
1057 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1058                                            RegisterOrConstant vtable_index,
1059                                            Register method_result) {
1060   const int base = in_bytes(Klass::vtable_start_offset());
1061   assert(vtableEntry::size() * wordSize == 8,
1062          "adjust the scaling in the code below");
1063   int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();
1064 
1065   if (vtable_index.is_register()) {
1066     lea(method_result, Address(recv_klass,
1067                                vtable_index.as_register(),
1068                                Address::lsl(LogBytesPerWord)));
1069     ldr(method_result, Address(method_result, vtable_offset_in_bytes));
1070   } else {
1071     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
1072     ldr(method_result,
1073         form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
1074   }
1075 }
1076 
1077 void MacroAssembler::check_klass_subtype(Register sub_klass,
1078                            Register super_klass,
1079                            Register temp_reg,
1080                            Label& L_success) {
1081   Label L_failure;
1082   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
1083   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
1084   bind(L_failure);
1085 }
1086 
1087 
1088 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1089                                                    Register super_klass,
1090                                                    Register temp_reg,
1091                                                    Label* L_success,
1092                                                    Label* L_failure,
1093                                                    Label* L_slow_path,
1094                                         RegisterOrConstant super_check_offset) {
1095   assert_different_registers(sub_klass, super_klass, temp_reg);
1096   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1097   if (super_check_offset.is_register()) {
1098     assert_different_registers(sub_klass, super_klass,
1099                                super_check_offset.as_register());
1100   } else if (must_load_sco) {
1101     assert(temp_reg != noreg, "supply either a temp or a register offset");
1102   }
1103 
1104   Label L_fallthrough;
1105   int label_nulls = 0;
1106   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1107   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1108   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1109   assert(label_nulls <= 1, "at most one NULL in the batch");
1110 
1111   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1112   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1113   Address super_check_offset_addr(super_klass, sco_offset);
1114 
1115   // Hacked jmp, which may only be used just before L_fallthrough.
1116 #define final_jmp(label)                                                \
1117   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
1118   else                            b(label)                /*omit semi*/
1119 
1120   // If the pointers are equal, we are done (e.g., String[] elements).
1121   // This self-check enables sharing of secondary supertype arrays among
1122   // non-primary types such as array-of-interface.  Otherwise, each such
1123   // type would need its own customized SSA.
1124   // We move this check to the front of the fast path because many
1125   // type checks are in fact trivially successful in this manner,
1126   // so we get a nicely predicted branch right at the start of the check.
1127   cmp(sub_klass, super_klass);
1128   br(Assembler::EQ, *L_success);
1129 
1130   // Check the supertype display:
1131   if (must_load_sco) {
1132     ldrw(temp_reg, super_check_offset_addr);
1133     super_check_offset = RegisterOrConstant(temp_reg);
1134   }
1135   Address super_check_addr(sub_klass, super_check_offset);
1136   ldr(rscratch1, super_check_addr);
1137   cmp(super_klass, rscratch1); // load displayed supertype
1138 
1139   // This check has worked decisively for primary supers.
1140   // Secondary supers are sought in the super_cache ('super_cache_addr').
1141   // (Secondary supers are interfaces and very deeply nested subtypes.)
1142   // This works in the same check above because of a tricky aliasing
1143   // between the super_cache and the primary super display elements.
1144   // (The 'super_check_addr' can address either, as the case requires.)
1145   // Note that the cache is updated below if it does not help us find
1146   // what we need immediately.
1147   // So if it was a primary super, we can just fail immediately.
1148   // Otherwise, it's the slow path for us (no success at this point).
1149 
1150   if (super_check_offset.is_register()) {
1151     br(Assembler::EQ, *L_success);
1152     subs(zr, super_check_offset.as_register(), sc_offset);
1153     if (L_failure == &L_fallthrough) {
1154       br(Assembler::EQ, *L_slow_path);
1155     } else {
1156       br(Assembler::NE, *L_failure);
1157       final_jmp(*L_slow_path);
1158     }
1159   } else if (super_check_offset.as_constant() == sc_offset) {
1160     // Need a slow path; fast failure is impossible.
1161     if (L_slow_path == &L_fallthrough) {
1162       br(Assembler::EQ, *L_success);
1163     } else {
1164       br(Assembler::NE, *L_slow_path);
1165       final_jmp(*L_success);
1166     }
1167   } else {
1168     // No slow path; it's a fast decision.
1169     if (L_failure == &L_fallthrough) {
1170       br(Assembler::EQ, *L_success);
1171     } else {
1172       br(Assembler::NE, *L_failure);
1173       final_jmp(*L_success);
1174     }
1175   }
1176 
1177   bind(L_fallthrough);
1178 
1179 #undef final_jmp
1180 }
1181 
1182 // These two are taken from x86, but they look generally useful
1183 
// scans count pointer-sized words at [addr] for an occurrence of value,
// generic
1186 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
1187                                 Register scratch) {
1188   Label Lloop, Lexit;
1189   cbz(count, Lexit);
1190   bind(Lloop);
1191   ldr(scratch, post(addr, wordSize));
1192   cmp(value, scratch);
1193   br(EQ, Lexit);
1194   sub(count, count, 1);
1195   cbnz(count, Lloop);
1196   bind(Lexit);
1197 }
1198 
// scans count 4-byte words at [addr] for an occurrence of value,
// generic
1201 void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
1202                                 Register scratch) {
1203   Label Lloop, Lexit;
1204   cbz(count, Lexit);
1205   bind(Lloop);
1206   ldrw(scratch, post(addr, wordSize));
1207   cmpw(value, scratch);
1208   br(EQ, Lexit);
1209   sub(count, count, 1);
1210   cbnz(count, Lloop);
1211   bind(Lexit);
1212 }
1213 
1214 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1215                                                    Register super_klass,
1216                                                    Register temp_reg,
1217                                                    Register temp2_reg,
1218                                                    Label* L_success,
1219                                                    Label* L_failure,
1220                                                    bool set_cond_codes) {
1221   assert_different_registers(sub_klass, super_klass, temp_reg);
1222   if (temp2_reg != noreg)
1223     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1224 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1225 
1226   Label L_fallthrough;
1227   int label_nulls = 0;
1228   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1229   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1230   assert(label_nulls <= 1, "at most one NULL in the batch");
1231 
1232   // a couple of useful fields in sub_klass:
1233   int ss_offset = in_bytes(Klass::secondary_supers_offset());
1234   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1235   Address secondary_supers_addr(sub_klass, ss_offset);
1236   Address super_cache_addr(     sub_klass, sc_offset);
1237 
1238   BLOCK_COMMENT("check_klass_subtype_slow_path");
1239 
1240   // Do a linear scan of the secondary super-klass chain.
1241   // This code is rarely used, so simplicity is a virtue here.
1242   // The repne_scan instruction uses fixed registers, which we must spill.
1243   // Don't worry too much about pre-existing connections with the input regs.
1244 
1245   assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
1246   assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)
1247 
1248   RegSet pushed_registers;
1249   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1250   if (!IS_A_TEMP(r5))    pushed_registers += r5;
1251 
1252   if (super_klass != r0 || UseCompressedOops) {
1253     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1254   }
1255 
1256   push(pushed_registers, sp);
1257 
1258   // Get super_klass value into r0 (even if it was in r5 or r2).
1259   if (super_klass != r0) {
1260     mov(r0, super_klass);
1261   }
1262 
1263 #ifndef PRODUCT
1264   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1265   Address pst_counter_addr(rscratch2);
1266   ldr(rscratch1, pst_counter_addr);
1267   add(rscratch1, rscratch1, 1);
1268   str(rscratch1, pst_counter_addr);
1269 #endif //PRODUCT
1270 
1271   // We will consult the secondary-super array.
1272   ldr(r5, secondary_supers_addr);
1273   // Load the array length.
1274   ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1275   // Skip to start of data.
1276   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1277 
1278   cmp(sp, zr); // Clear Z flag; SP is never zero
1279   // Scan R2 words at [R5] for an occurrence of R0.
1280   // Set NZ/Z based on last compare.
1281   repne_scan(r5, r0, r2, rscratch1);
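  // NE at this point means super_klass was not found; the register pops
  // below do not disturb the flags.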
1282 
1283   // Unspill the temp. registers:
1284   pop(pushed_registers, sp);
1285 
1286   br(Assembler::NE, *L_failure);
1287 
1288   // Success.  Cache the super we found and proceed in triumph.
1289   str(super_klass, super_cache_addr);
1290 
1291   if (L_success != &L_fallthrough) {
1292     b(*L_success);
1293   }
1294 
1295 #undef IS_A_TEMP
1296 
1297   bind(L_fallthrough);
1298 }
1299 
1300 void MacroAssembler::clinit_barrier(Register klass, Register scratch, Label* L_fast_path, Label* L_slow_path) {
1301   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
1302   assert_different_registers(klass, rthread, scratch);
1303 
1304   Label L_fallthrough, L_tmp;
1305   if (L_fast_path == NULL) {
1306     L_fast_path = &L_fallthrough;
1307   } else if (L_slow_path == NULL) {
1308     L_slow_path = &L_fallthrough;
1309   }
1310   // Fast path check: class is fully initialized
1311   ldrb(scratch, Address(klass, InstanceKlass::init_state_offset()));
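  // subs against the zero register is just a compare with fully_initialized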
1312   subs(zr, scratch, InstanceKlass::fully_initialized);
1313   br(Assembler::EQ, *L_fast_path);
1314 
1315   // Fast path check: current thread is initializer thread
1316   ldr(scratch, Address(klass, InstanceKlass::init_thread_offset()));
1317   cmp(rthread, scratch);
1318 
1319   if (L_slow_path == &L_fallthrough) {
1320     br(Assembler::EQ, *L_fast_path);
1321     bind(*L_slow_path);
1322   } else if (L_fast_path == &L_fallthrough) {
1323     br(Assembler::NE, *L_slow_path);
1324     bind(*L_fast_path);
1325   } else {
1326     Unimplemented();
1327   }
1328 }
1329 
1330 void MacroAssembler::verify_oop(Register reg, const char* s) {
1331   if (!VerifyOops || VerifyAdapterSharing) {
    // The address of the code string (set up below) confuses VerifyAdapterSharing
    // because it may differ between otherwise equivalent adapters.
1334     return;
1335   }
1336 
1337   // Pass register number to verify_oop_subroutine
1338   const char* b = NULL;
1339   {
1340     ResourceMark rm;
1341     stringStream ss;
1342     ss.print("verify_oop: %s: %s", reg->name(), s);
1343     b = code_string(ss.as_string());
1344   }
1345   BLOCK_COMMENT("verify_oop {");
1346 
1347   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1348   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1349 
1350   mov(r0, reg);
1351   mov(rscratch1, (address)b);
1352 
1353   // call indirectly to solve generation ordering problem
1354   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1355   ldr(rscratch2, Address(rscratch2));
1356   blr(rscratch2);
1357 
1358   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1359   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1360 
1361   BLOCK_COMMENT("} verify_oop");
1362 }
1363 
1364 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1365   if (!VerifyOops || VerifyAdapterSharing) {
    // The address of the code string (set up below) confuses VerifyAdapterSharing
    // because it may differ between otherwise equivalent adapters.
1368     return;
1369   }
1370 
1371   const char* b = NULL;
1372   {
1373     ResourceMark rm;
1374     stringStream ss;
1375     ss.print("verify_oop_addr: %s", s);
1376     b = code_string(ss.as_string());
1377   }
1378   BLOCK_COMMENT("verify_oop_addr {");
1379 
1380   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1381   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1382 
1383   // addr may contain sp so we will have to adjust it based on the
1384   // pushes that we just did.
1385   if (addr.uses(sp)) {
1386     lea(r0, addr);
1387     ldr(r0, Address(r0, 4 * wordSize));
1388   } else {
1389     ldr(r0, addr);
1390   }
1391   mov(rscratch1, (address)b);
1392 
1393   // call indirectly to solve generation ordering problem
1394   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1395   ldr(rscratch2, Address(rscratch2));
1396   blr(rscratch2);
1397 
1398   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1399   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1400 
1401   BLOCK_COMMENT("} verify_oop_addr");
1402 }
1403 
1404 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1405                                          int extra_slot_offset) {
1406   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1407   int stackElementSize = Interpreter::stackElementSize;
1408   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1409 #ifdef ASSERT
1410   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1411   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1412 #endif
1413   if (arg_slot.is_constant()) {
1414     return Address(esp, arg_slot.as_constant() * stackElementSize
1415                    + offset);
1416   } else {
1417     add(rscratch1, esp, arg_slot.as_register(),
1418         ext::uxtx, exact_log2(stackElementSize));
1419     return Address(rscratch1, offset);
1420   }
1421 }
1422 
1423 void MacroAssembler::call_VM_leaf_base(address entry_point,
1424                                        int number_of_arguments,
1425                                        Label *retaddr) {
1426   Label E, L;
1427 
1428   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1429 
1430   mov(rscratch1, entry_point);
1431   blr(rscratch1);
1432   if (retaddr)
1433     bind(*retaddr);
1434 
1435   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1436   maybe_isb();
1437 }
1438 
1439 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1440   call_VM_leaf_base(entry_point, number_of_arguments);
1441 }
1442 
1443 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1444   pass_arg0(this, arg_0);
1445   call_VM_leaf_base(entry_point, 1);
1446 }
1447 
1448 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1449   pass_arg0(this, arg_0);
1450   pass_arg1(this, arg_1);
1451   call_VM_leaf_base(entry_point, 2);
1452 }
1453 
1454 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1455                                   Register arg_1, Register arg_2) {
1456   pass_arg0(this, arg_0);
1457   pass_arg1(this, arg_1);
1458   pass_arg2(this, arg_2);
1459   call_VM_leaf_base(entry_point, 3);
1460 }
1461 
1462 void MacroAssembler::super_call_VM_leaf(address entry_point) {
1463   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1464 }
1465 
1466 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1467   pass_arg0(this, arg_0);
1468   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1469 }
1470 
1471 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1472 
1473   assert(arg_0 != c_rarg1, "smashed arg");
1474   pass_arg1(this, arg_1);
1475   pass_arg0(this, arg_0);
1476   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1477 }
1478 
1479 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1480   assert(arg_0 != c_rarg2, "smashed arg");
1481   assert(arg_1 != c_rarg2, "smashed arg");
1482   pass_arg2(this, arg_2);
1483   assert(arg_0 != c_rarg1, "smashed arg");
1484   pass_arg1(this, arg_1);
1485   pass_arg0(this, arg_0);
1486   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1487 }
1488 
1489 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1490   assert(arg_0 != c_rarg3, "smashed arg");
1491   assert(arg_1 != c_rarg3, "smashed arg");
1492   assert(arg_2 != c_rarg3, "smashed arg");
1493   pass_arg3(this, arg_3);
1494   assert(arg_0 != c_rarg2, "smashed arg");
1495   assert(arg_1 != c_rarg2, "smashed arg");
1496   pass_arg2(this, arg_2);
1497   assert(arg_0 != c_rarg1, "smashed arg");
1498   pass_arg1(this, arg_1);
1499   pass_arg0(this, arg_0);
1500   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1501 }
1502 
1503 void MacroAssembler::null_check(Register reg, int offset) {
1504   if (needs_explicit_null_check(offset)) {
1505     // provoke OS NULL exception if reg = NULL by
1506     // accessing M[reg] w/o changing any registers
1507     // NOTE: this is plenty to provoke a segv
1508     ldr(zr, Address(reg));
1509   } else {
1510     // nothing to do, (later) access of M[reg + offset]
1511     // will provoke OS NULL exception if reg = NULL
1512   }
1513 }
1514 
1515 void MacroAssembler::test_klass_is_value(Register klass, Register temp_reg, Label& is_value) {
1516   ldrw(temp_reg, Address(klass, Klass::access_flags_offset()));
1517   andr(temp_reg, temp_reg, JVM_ACC_VALUE);
  cbnz(temp_reg, is_value);
1519 }
1520 
1521 void MacroAssembler::test_field_is_flattenable(Register flags, Register temp_reg, Label& is_flattenable) {
1522   (void) temp_reg; // keep signature uniform with x86
1523   tbnz(flags, ConstantPoolCacheEntry::is_flattenable_field_shift, is_flattenable);
1524 }
1525 
1526 void MacroAssembler::test_field_is_not_flattenable(Register flags, Register temp_reg, Label& not_flattenable) {
1527   (void) temp_reg; // keep signature uniform with x86
1528   tbz(flags, ConstantPoolCacheEntry::is_flattenable_field_shift, not_flattenable);
1529 }
1530 
1531 void MacroAssembler::test_field_is_flattened(Register flags, Register temp_reg, Label& is_flattened) {
1532   (void) temp_reg; // keep signature uniform with x86
1533   tbnz(flags, ConstantPoolCacheEntry::is_flattened_field_shift, is_flattened);
1534 }
1535 
1536 void MacroAssembler::test_flattened_array_oop(Register oop, Register temp_reg, Label& is_flattened_array) {
1537   load_storage_props(temp_reg, oop);
1538   andr(temp_reg, temp_reg, ArrayStorageProperties::flattened_value);
1539   cbnz(temp_reg, is_flattened_array);
1540 }
1541 
1542 void MacroAssembler::test_null_free_array_oop(Register oop, Register temp_reg, Label& is_null_free_array) {
1543   load_storage_props(temp_reg, oop);
1544   andr(temp_reg, temp_reg, ArrayStorageProperties::null_free_value);
1545   cbnz(temp_reg, is_null_free_array);
1546 }
1547 
1548 // MacroAssembler protected routines needed to implement
1549 // public methods
1550 
1551 void MacroAssembler::mov(Register r, Address dest) {
1552   code_section()->relocate(pc(), dest.rspec());
1553   u_int64_t imm64 = (u_int64_t)dest.target();
1554   movptr(r, imm64);
1555 }
1556 
1557 // Move a constant pointer into r.  In AArch64 mode the virtual
1558 // address space is 48 bits in size, so we only need three
1559 // instructions to create a patchable instruction sequence that can
1560 // reach anywhere.
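// For example, movptr(r, 0x123456789abc) expands to
//   movz r, #0x9abc
//   movk r, #0x5678, lsl #16
//   movk r, #0x1234, lsl #32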
1561 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1562 #ifndef PRODUCT
1563   {
1564     char buffer[64];
1565     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1566     block_comment(buffer);
1567   }
1568 #endif
1569   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1570   movz(r, imm64 & 0xffff);
1571   imm64 >>= 16;
1572   movk(r, imm64 & 0xffff, 16);
1573   imm64 >>= 16;
1574   movk(r, imm64 & 0xffff, 32);
1575 }
1576 
1577 // Macro to mov replicated immediate to vector register.
1578 //  Vd will get the following values for different arrangements in T
1579 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1580 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1581 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1582 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1583 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1584 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1585 //   T1D/T2D: invalid
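// The value is materialized with the cheaper of MOVI or MVNI: we count the
// non-zero bytes of imm32 and of its complement, start from whichever needs
// fewer, and then merge any remaining bytes in with ORR (immediate) or clear
// them with BIC (immediate) respectively.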
1586 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1587   assert(T != T1D && T != T2D, "invalid arrangement");
1588   if (T == T8B || T == T16B) {
1589     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1590     movi(Vd, T, imm32 & 0xff, 0);
1591     return;
1592   }
1593   u_int32_t nimm32 = ~imm32;
1594   if (T == T4H || T == T8H) {
1595     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1596     imm32 &= 0xffff;
1597     nimm32 &= 0xffff;
1598   }
1599   u_int32_t x = imm32;
1600   int movi_cnt = 0;
1601   int movn_cnt = 0;
1602   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1603   x = nimm32;
1604   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1605   if (movn_cnt < movi_cnt) imm32 = nimm32;
1606   unsigned lsl = 0;
1607   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1608   if (movn_cnt < movi_cnt)
1609     mvni(Vd, T, imm32 & 0xff, lsl);
1610   else
1611     movi(Vd, T, imm32 & 0xff, lsl);
1612   imm32 >>= 8; lsl += 8;
1613   while (imm32) {
1614     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1615     if (movn_cnt < movi_cnt)
1616       bici(Vd, T, imm32 & 0xff, lsl);
1617     else
1618       orri(Vd, T, imm32 & 0xff, lsl);
1619     lsl += 8; imm32 >>= 8;
1620   }
1621 }
1622 
1623 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1624 {
1625 #ifndef PRODUCT
1626   {
1627     char buffer[64];
1628     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1629     block_comment(buffer);
1630   }
1631 #endif
1632   if (operand_valid_for_logical_immediate(false, imm64)) {
1633     orr(dst, zr, imm64);
1634   } else {
1635     // we can use a combination of MOVZ or MOVN with
1636     // MOVK to build up the constant
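    // The constant is split into four 16-bit halfwords; counting how many are
    // 0x0000 (MOVZ-friendly) or 0xffff (MOVN-friendly) picks the cheaper base
    // instruction, and the remaining halfwords are patched in with MOVK.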
1637     u_int64_t imm_h[4];
1638     int zero_count = 0;
1639     int neg_count = 0;
1640     int i;
1641     for (i = 0; i < 4; i++) {
1642       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1643       if (imm_h[i] == 0) {
1644         zero_count++;
1645       } else if (imm_h[i] == 0xffffL) {
1646         neg_count++;
1647       }
1648     }
1649     if (zero_count == 4) {
1650       // one MOVZ will do
1651       movz(dst, 0);
1652     } else if (neg_count == 4) {
1653       // one MOVN will do
1654       movn(dst, 0);
1655     } else if (zero_count == 3) {
1656       for (i = 0; i < 4; i++) {
1657         if (imm_h[i] != 0L) {
1658           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1659           break;
1660         }
1661       }
1662     } else if (neg_count == 3) {
1663       // one MOVN will do
1664       for (int i = 0; i < 4; i++) {
1665         if (imm_h[i] != 0xffffL) {
1666           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1667           break;
1668         }
1669       }
1670     } else if (zero_count == 2) {
1671       // one MOVZ and one MOVK will do
1672       for (i = 0; i < 3; i++) {
1673         if (imm_h[i] != 0L) {
1674           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1675           i++;
1676           break;
1677         }
1678       }
1679       for (;i < 4; i++) {
1680         if (imm_h[i] != 0L) {
1681           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1682         }
1683       }
1684     } else if (neg_count == 2) {
1685       // one MOVN and one MOVK will do
1686       for (i = 0; i < 4; i++) {
1687         if (imm_h[i] != 0xffffL) {
1688           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1689           i++;
1690           break;
1691         }
1692       }
1693       for (;i < 4; i++) {
1694         if (imm_h[i] != 0xffffL) {
1695           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1696         }
1697       }
1698     } else if (zero_count == 1) {
1699       // one MOVZ and two MOVKs will do
1700       for (i = 0; i < 4; i++) {
1701         if (imm_h[i] != 0L) {
1702           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1703           i++;
1704           break;
1705         }
1706       }
1707       for (;i < 4; i++) {
1708         if (imm_h[i] != 0x0L) {
1709           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1710         }
1711       }
1712     } else if (neg_count == 1) {
1713       // one MOVN and two MOVKs will do
1714       for (i = 0; i < 4; i++) {
1715         if (imm_h[i] != 0xffffL) {
1716           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1717           i++;
1718           break;
1719         }
1720       }
1721       for (;i < 4; i++) {
1722         if (imm_h[i] != 0xffffL) {
1723           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1724         }
1725       }
1726     } else {
1727       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1728       movz(dst, (u_int32_t)imm_h[0], 0);
1729       for (i = 1; i < 4; i++) {
1730         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1731       }
1732     }
1733   }
1734 }
1735 
1736 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1737 {
1738 #ifndef PRODUCT
1739     {
1740       char buffer[64];
1741       snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1742       block_comment(buffer);
1743     }
1744 #endif
1745   if (operand_valid_for_logical_immediate(true, imm32)) {
1746     orrw(dst, zr, imm32);
1747   } else {
1748     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1749     // constant
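    // e.g. for imm32 == 0xffff1234 the high halfword is all-ones, so a single
    // movnw(dst, 0x1234 ^ 0xffff, 0) (MOVN of 0xedcb) materializes the value.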
1750     u_int32_t imm_h[2];
1751     imm_h[0] = imm32 & 0xffff;
1752     imm_h[1] = ((imm32 >> 16) & 0xffff);
1753     if (imm_h[0] == 0) {
1754       movzw(dst, imm_h[1], 16);
1755     } else if (imm_h[0] == 0xffff) {
1756       movnw(dst, imm_h[1] ^ 0xffff, 16);
1757     } else if (imm_h[1] == 0) {
1758       movzw(dst, imm_h[0], 0);
1759     } else if (imm_h[1] == 0xffff) {
1760       movnw(dst, imm_h[0] ^ 0xffff, 0);
1761     } else {
1762       // use a MOVZ and MOVK (makes it easier to debug)
1763       movzw(dst, imm_h[0], 0);
1764       movkw(dst, imm_h[1], 16);
1765     }
1766   }
1767 }
1768 
1769 // Form an address from base + offset in Rd.  Rd may or may
1770 // not actually be used: you must use the Address that is returned.
1771 // It is up to you to ensure that the shift provided matches the size
1772 // of your data.
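// For example, with shift == 3 a byte_offset of 0x123450 is emitted as
// add(Rd, base, 0x120000) followed by a returned Address(Rd, 0x3450), both of
// which encode directly.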
1773 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1774   if (Address::offset_ok_for_immed(byte_offset, shift))
1775     // It fits; no need for any heroics
1776     return Address(base, byte_offset);
1777 
1778   // Don't do anything clever with negative or misaligned offsets
1779   unsigned mask = (1 << shift) - 1;
1780   if (byte_offset < 0 || byte_offset & mask) {
1781     mov(Rd, byte_offset);
1782     add(Rd, base, Rd);
1783     return Address(Rd);
1784   }
1785 
1786   // See if we can do this with two 12-bit offsets
1787   {
1788     unsigned long word_offset = byte_offset >> shift;
1789     unsigned long masked_offset = word_offset & 0xfff000;
1790     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1791         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1792       add(Rd, base, masked_offset << shift);
1793       word_offset -= masked_offset;
1794       return Address(Rd, word_offset << shift);
1795     }
1796   }
1797 
1798   // Do it the hard way
1799   mov(Rd, byte_offset);
1800   add(Rd, base, Rd);
1801   return Address(Rd);
1802 }
1803 
1804 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1805   if (UseLSE) {
1806     mov(tmp, 1);
1807     ldadd(Assembler::word, tmp, zr, counter_addr);
1808     return;
1809   }
1810   Label retry_load;
1811   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1812     prfm(Address(counter_addr), PSTL1STRM);
1813   bind(retry_load);
1814   // flush and load exclusive from the memory location
1815   ldxrw(tmp, counter_addr);
1816   addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp will be zero
1818   stxrw(tmp2, tmp, counter_addr);
1819   cbnzw(tmp2, retry_load);
1820 }
1821 
1822 
1823 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1824                                     bool want_remainder, Register scratch)
1825 {
1826   // Full implementation of Java idiv and irem.  The function
1827   // returns the (pc) offset of the div instruction - may be needed
1828   // for implicit exceptions.
1829   //
1830   // constraint : ra/rb =/= scratch
1831   //         normal case
1832   //
1833   // input : ra: dividend
1834   //         rb: divisor
1835   //
1836   // result: either
1837   //         quotient  (= ra idiv rb)
1838   //         remainder (= ra irem rb)
1839 
1840   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1841 
1842   int idivl_offset = offset();
1843   if (! want_remainder) {
1844     sdivw(result, ra, rb);
1845   } else {
1846     sdivw(scratch, ra, rb);
1847     Assembler::msubw(result, scratch, rb, ra);
1848   }
1849 
1850   return idivl_offset;
1851 }
1852 
1853 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1854                                     bool want_remainder, Register scratch)
1855 {
1856   // Full implementation of Java ldiv and lrem.  The function
1857   // returns the (pc) offset of the div instruction - may be needed
1858   // for implicit exceptions.
1859   //
1860   // constraint : ra/rb =/= scratch
1861   //         normal case
1862   //
1863   // input : ra: dividend
1864   //         rb: divisor
1865   //
1866   // result: either
1867   //         quotient  (= ra idiv rb)
1868   //         remainder (= ra irem rb)
1869 
1870   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1871 
1872   int idivq_offset = offset();
1873   if (! want_remainder) {
1874     sdiv(result, ra, rb);
1875   } else {
1876     sdiv(scratch, ra, rb);
1877     Assembler::msub(result, scratch, rb, ra);
1878   }
1879 
1880   return idivq_offset;
1881 }
1882 
1883 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1884   address prev = pc() - NativeMembar::instruction_size;
1885   address last = code()->last_insn();
1886   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1887     NativeMembar *bar = NativeMembar_at(prev);
1888     // We are merging two memory barrier instructions.  On AArch64 we
1889     // can do this simply by ORing them together.
1890     bar->set_kind(bar->get_kind() | order_constraint);
1891     BLOCK_COMMENT("merged membar");
1892   } else {
1893     code()->set_last_insn(pc());
1894     dmb(Assembler::barrier(order_constraint));
1895   }
1896 }
1897 
1898 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1899   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1900     merge_ldst(rt, adr, size_in_bytes, is_store);
1901     code()->clear_last_insn();
1902     return true;
1903   } else {
1904     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
1905     const unsigned mask = size_in_bytes - 1;
1906     if (adr.getMode() == Address::base_plus_offset &&
1907         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1908       code()->set_last_insn(pc());
1909     }
1910     return false;
1911   }
1912 }
1913 
1914 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1915   // We always try to merge two adjacent loads into one ldp.
1916   if (!try_merge_ldst(Rx, adr, 8, false)) {
1917     Assembler::ldr(Rx, adr);
1918   }
1919 }
1920 
1921 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1922   // We always try to merge two adjacent loads into one ldp.
1923   if (!try_merge_ldst(Rw, adr, 4, false)) {
1924     Assembler::ldrw(Rw, adr);
1925   }
1926 }
1927 
1928 void MacroAssembler::str(Register Rx, const Address &adr) {
1929   // We always try to merge two adjacent stores into one stp.
1930   if (!try_merge_ldst(Rx, adr, 8, true)) {
1931     Assembler::str(Rx, adr);
1932   }
1933 }
1934 
1935 void MacroAssembler::strw(Register Rw, const Address &adr) {
1936   // We always try to merge two adjacent stores into one stp.
1937   if (!try_merge_ldst(Rw, adr, 4, true)) {
1938     Assembler::strw(Rw, adr);
1939   }
1940 }
1941 
1942 // MacroAssembler routines found actually to be needed
1943 
1944 void MacroAssembler::push(Register src)
1945 {
1946   str(src, Address(pre(esp, -1 * wordSize)));
1947 }
1948 
1949 void MacroAssembler::pop(Register dst)
1950 {
1951   ldr(dst, Address(post(esp, 1 * wordSize)));
1952 }
1953 
1954 // Note: load_unsigned_short used to be called load_unsigned_word.
1955 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1956   int off = offset();
1957   ldrh(dst, src);
1958   return off;
1959 }
1960 
1961 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1962   int off = offset();
1963   ldrb(dst, src);
1964   return off;
1965 }
1966 
1967 int MacroAssembler::load_signed_short(Register dst, Address src) {
1968   int off = offset();
1969   ldrsh(dst, src);
1970   return off;
1971 }
1972 
1973 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1974   int off = offset();
1975   ldrsb(dst, src);
1976   return off;
1977 }
1978 
1979 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1980   int off = offset();
1981   ldrshw(dst, src);
1982   return off;
1983 }
1984 
1985 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1986   int off = offset();
1987   ldrsbw(dst, src);
1988   return off;
1989 }
1990 
1991 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1992   switch (size_in_bytes) {
1993   case  8:  ldr(dst, src); break;
1994   case  4:  ldrw(dst, src); break;
1995   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1996   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1997   default:  ShouldNotReachHere();
1998   }
1999 }
2000 
2001 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
2002   switch (size_in_bytes) {
2003   case  8:  str(src, dst); break;
2004   case  4:  strw(src, dst); break;
2005   case  2:  strh(src, dst); break;
2006   case  1:  strb(src, dst); break;
2007   default:  ShouldNotReachHere();
2008   }
2009 }
2010 
2011 void MacroAssembler::decrementw(Register reg, int value)
2012 {
2013   if (value < 0)  { incrementw(reg, -value);      return; }
2014   if (value == 0) {                               return; }
2015   if (value < (1 << 12)) { subw(reg, reg, value); return; }
2016   /* else */ {
2017     guarantee(reg != rscratch2, "invalid dst for register decrement");
2018     movw(rscratch2, (unsigned)value);
2019     subw(reg, reg, rscratch2);
2020   }
2021 }
2022 
2023 void MacroAssembler::decrement(Register reg, int value)
2024 {
2025   if (value < 0)  { increment(reg, -value);      return; }
2026   if (value == 0) {                              return; }
2027   if (value < (1 << 12)) { sub(reg, reg, value); return; }
2028   /* else */ {
2029     assert(reg != rscratch2, "invalid dst for register decrement");
2030     mov(rscratch2, (unsigned long)value);
2031     sub(reg, reg, rscratch2);
2032   }
2033 }
2034 
2035 void MacroAssembler::decrementw(Address dst, int value)
2036 {
2037   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
2038   if (dst.getMode() == Address::literal) {
2039     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2040     lea(rscratch2, dst);
2041     dst = Address(rscratch2);
2042   }
2043   ldrw(rscratch1, dst);
2044   decrementw(rscratch1, value);
2045   strw(rscratch1, dst);
2046 }
2047 
2048 void MacroAssembler::decrement(Address dst, int value)
2049 {
2050   assert(!dst.uses(rscratch1), "invalid address for decrement");
2051   if (dst.getMode() == Address::literal) {
2052     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2053     lea(rscratch2, dst);
2054     dst = Address(rscratch2);
2055   }
2056   ldr(rscratch1, dst);
2057   decrement(rscratch1, value);
2058   str(rscratch1, dst);
2059 }
2060 
2061 void MacroAssembler::incrementw(Register reg, int value)
2062 {
2063   if (value < 0)  { decrementw(reg, -value);      return; }
2064   if (value == 0) {                               return; }
2065   if (value < (1 << 12)) { addw(reg, reg, value); return; }
2066   /* else */ {
2067     assert(reg != rscratch2, "invalid dst for register increment");
2068     movw(rscratch2, (unsigned)value);
2069     addw(reg, reg, rscratch2);
2070   }
2071 }
2072 
2073 void MacroAssembler::increment(Register reg, int value)
2074 {
2075   if (value < 0)  { decrement(reg, -value);      return; }
2076   if (value == 0) {                              return; }
2077   if (value < (1 << 12)) { add(reg, reg, value); return; }
2078   /* else */ {
2079     assert(reg != rscratch2, "invalid dst for register increment");
2080     movw(rscratch2, (unsigned)value);
2081     add(reg, reg, rscratch2);
2082   }
2083 }
2084 
2085 void MacroAssembler::incrementw(Address dst, int value)
2086 {
2087   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2088   if (dst.getMode() == Address::literal) {
2089     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2090     lea(rscratch2, dst);
2091     dst = Address(rscratch2);
2092   }
2093   ldrw(rscratch1, dst);
2094   incrementw(rscratch1, value);
2095   strw(rscratch1, dst);
2096 }
2097 
2098 void MacroAssembler::increment(Address dst, int value)
2099 {
2100   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2101   if (dst.getMode() == Address::literal) {
2102     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2103     lea(rscratch2, dst);
2104     dst = Address(rscratch2);
2105   }
2106   ldr(rscratch1, dst);
2107   increment(rscratch1, value);
2108   str(rscratch1, dst);
2109 }
2110 
2111 
2112 void MacroAssembler::pusha() {
2113   push(0x7fffffff, sp);
2114 }
2115 
2116 void MacroAssembler::popa() {
2117   pop(0x7fffffff, sp);
2118 }
2119 
2120 // Push lots of registers in the bit set supplied.  Don't push sp.
2121 // Return the number of words pushed
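// For example, a bitset naming r0, r1 and r3 is padded with zr to an even
// count and emits stp(r0, r1, [stack, #-4*wordSize]!) followed by
// stp(r3, zr, [stack, #2*wordSize]).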
2122 int MacroAssembler::push(unsigned int bitset, Register stack) {
2123   int words_pushed = 0;
2124 
2125   // Scan bitset to accumulate register pairs
2126   unsigned char regs[32];
2127   int count = 0;
2128   for (int reg = 0; reg <= 30; reg++) {
2129     if (1 & bitset)
2130       regs[count++] = reg;
2131     bitset >>= 1;
2132   }
2133   regs[count++] = zr->encoding_nocheck();
  count &= ~1;  // Only push an even number of regs
2135 
2136   if (count) {
2137     stp(as_Register(regs[0]), as_Register(regs[1]),
2138        Address(pre(stack, -count * wordSize)));
2139     words_pushed += 2;
2140   }
2141   for (int i = 2; i < count; i += 2) {
2142     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2143        Address(stack, i * wordSize));
2144     words_pushed += 2;
2145   }
2146 
2147   assert(words_pushed == count, "oops, pushed != count");
2148 
2149   return count;
2150 }
2151 
2152 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2153   int words_pushed = 0;
2154 
2155   // Scan bitset to accumulate register pairs
2156   unsigned char regs[32];
2157   int count = 0;
2158   for (int reg = 0; reg <= 30; reg++) {
2159     if (1 & bitset)
2160       regs[count++] = reg;
2161     bitset >>= 1;
2162   }
2163   regs[count++] = zr->encoding_nocheck();
2164   count &= ~1;
2165 
2166   for (int i = 2; i < count; i += 2) {
2167     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2168        Address(stack, i * wordSize));
2169     words_pushed += 2;
2170   }
2171   if (count) {
2172     ldp(as_Register(regs[0]), as_Register(regs[1]),
2173        Address(post(stack, count * wordSize)));
2174     words_pushed += 2;
2175   }
2176 
2177   assert(words_pushed == count, "oops, pushed != count");
2178 
2179   return count;
2180 }
2181 #ifdef ASSERT
2182 void MacroAssembler::verify_heapbase(const char* msg) {
2183 #if 0
2184   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2185   assert (Universe::heap() != NULL, "java heap should be initialized");
2186   if (CheckCompressedOops) {
2187     Label ok;
2188     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2189     cmpptr(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
2190     br(Assembler::EQ, ok);
2191     stop(msg);
2192     bind(ok);
2193     pop(1 << rscratch1->encoding(), sp);
2194   }
2195 #endif
2196 }
2197 #endif
2198 
2199 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2200   Label done, not_weak;
2201   cbz(value, done);           // Use NULL as-is.
2202 
2203   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
2204   tbz(r0, 0, not_weak);    // Test for jweak tag.
2205 
2206   // Resolve jweak.
2207   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2208                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2209   verify_oop(value);
2210   b(done);
2211 
2212   bind(not_weak);
2213   // Resolve (untagged) jobject.
2214   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2215   verify_oop(value);
2216   bind(done);
2217 }
2218 
2219 void MacroAssembler::stop(const char* msg) {
2220   address ip = pc();
2221   pusha();
2222   mov(c_rarg0, (address)msg);
2223   mov(c_rarg1, (address)ip);
2224   mov(c_rarg2, sp);
2225   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2226   blr(c_rarg3);
2227   hlt(0);
2228 }
2229 
2230 void MacroAssembler::warn(const char* msg) {
2231   pusha();
2232   mov(c_rarg0, (address)msg);
2233   mov(lr, CAST_FROM_FN_PTR(address, warning));
2234   blr(lr);
2235   popa();
2236 }
2237 
2238 void MacroAssembler::unimplemented(const char* what) {
2239   const char* buf = NULL;
2240   {
2241     ResourceMark rm;
2242     stringStream ss;
2243     ss.print("unimplemented: %s", what);
2244     buf = code_string(ss.as_string());
2245   }
2246   stop(buf);
2247 }
2248 
2249 // If a constant does not fit in an immediate field, generate some
2250 // number of MOV instructions and then perform the operation.
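// Immediates below 1 << 24 are split into a shifted-by-12 chunk and a low
// 12-bit chunk, each of which is a legal add/sub immediate; larger values are
// materialized in Rd and combined with a register-register instruction.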
2251 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2252                                            add_sub_imm_insn insn1,
2253                                            add_sub_reg_insn insn2) {
2254   assert(Rd != zr, "Rd = zr and not setting flags?");
2255   if (operand_valid_for_add_sub_immediate((int)imm)) {
2256     (this->*insn1)(Rd, Rn, imm);
2257   } else {
2258     if (uabs(imm) < (1 << 24)) {
2259        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2260        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2261     } else {
2262        assert_different_registers(Rd, Rn);
2263        mov(Rd, (uint64_t)imm);
2264        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2265     }
2266   }
2267 }
2268 
// Separate version which sets the flags. Optimisations are more restricted
2270 // because we must set the flags correctly.
2271 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2272                                            add_sub_imm_insn insn1,
2273                                            add_sub_reg_insn insn2) {
2274   if (operand_valid_for_add_sub_immediate((int)imm)) {
2275     (this->*insn1)(Rd, Rn, imm);
2276   } else {
2277     assert_different_registers(Rd, Rn);
2278     assert(Rd != zr, "overflow in immediate operand");
2279     mov(Rd, (uint64_t)imm);
2280     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2281   }
2282 }
2283 
2284 
2285 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2286   if (increment.is_register()) {
2287     add(Rd, Rn, increment.as_register());
2288   } else {
2289     add(Rd, Rn, increment.as_constant());
2290   }
2291 }
2292 
2293 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2294   if (increment.is_register()) {
2295     addw(Rd, Rn, increment.as_register());
2296   } else {
2297     addw(Rd, Rn, increment.as_constant());
2298   }
2299 }
2300 
2301 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2302   if (decrement.is_register()) {
2303     sub(Rd, Rn, decrement.as_register());
2304   } else {
2305     sub(Rd, Rn, decrement.as_constant());
2306   }
2307 }
2308 
2309 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2310   if (decrement.is_register()) {
2311     subw(Rd, Rn, decrement.as_register());
2312   } else {
2313     subw(Rd, Rn, decrement.as_constant());
2314   }
2315 }
2316 
2317 void MacroAssembler::reinit_heapbase()
2318 {
2319   if (UseCompressedOops) {
2320     if (Universe::is_fully_initialized()) {
2321       mov(rheapbase, CompressedOops::ptrs_base());
2322     } else {
2323       lea(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
2324       ldr(rheapbase, Address(rheapbase));
2325     }
2326   }
2327 }
2328 
2329 // this simulates the behaviour of the x86 cmpxchg instruction using a
2330 // load linked/store conditional pair. we use the acquire/release
2331 // versions of these instructions so that we flush pending writes as
2332 // per Java semantics.
2333 
2334 // n.b the x86 version assumes the old value to be compared against is
2335 // in rax and updates rax with the value located in memory if the
2336 // cmpxchg fails. we supply a register for the old value explicitly
2337 
2338 // the aarch64 load linked/store conditional instructions do not
2339 // accept an offset. so, unlike x86, we must provide a plain register
2340 // to identify the memory word to be compared/exchanged rather than a
2341 // register+offset Address.
2342 
2343 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2344                                 Label &succeed, Label *fail) {
2345   // oldv holds comparison value
2346   // newv holds value to write in exchange
2347   // addr identifies memory word to compare against/update
2348   if (UseLSE) {
2349     mov(tmp, oldv);
2350     casal(Assembler::xword, oldv, newv, addr);
2351     cmp(tmp, oldv);
2352     br(Assembler::EQ, succeed);
2353     membar(AnyAny);
2354   } else {
2355     Label retry_load, nope;
2356     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2357       prfm(Address(addr), PSTL1STRM);
2358     bind(retry_load);
2359     // flush and load exclusive from the memory location
2360     // and fail if it is not what we expect
2361     ldaxr(tmp, addr);
2362     cmp(tmp, oldv);
2363     br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
2365     stlxr(tmp, newv, addr);
2366     cbzw(tmp, succeed);
2367     // retry so we only ever return after a load fails to compare
2368     // ensures we don't return a stale value after a failed write.
2369     b(retry_load);
2370     // if the memory word differs we return it in oldv and signal a fail
2371     bind(nope);
2372     membar(AnyAny);
2373     mov(oldv, tmp);
2374   }
2375   if (fail)
2376     b(*fail);
2377 }
2378 
2379 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2380                                         Label &succeed, Label *fail) {
2381   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2382   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2383 }
2384 
2385 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2386                                 Label &succeed, Label *fail) {
2387   // oldv holds comparison value
2388   // newv holds value to write in exchange
2389   // addr identifies memory word to compare against/update
2390   // tmp returns 0/1 for success/failure
2391   if (UseLSE) {
2392     mov(tmp, oldv);
2393     casal(Assembler::word, oldv, newv, addr);
2394     cmp(tmp, oldv);
2395     br(Assembler::EQ, succeed);
2396     membar(AnyAny);
2397   } else {
2398     Label retry_load, nope;
2399     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2400       prfm(Address(addr), PSTL1STRM);
2401     bind(retry_load);
2402     // flush and load exclusive from the memory location
2403     // and fail if it is not what we expect
2404     ldaxrw(tmp, addr);
2405     cmp(tmp, oldv);
2406     br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
2408     stlxrw(tmp, newv, addr);
2409     cbzw(tmp, succeed);
2410     // retry so we only ever return after a load fails to compare
2411     // ensures we don't return a stale value after a failed write.
2412     b(retry_load);
2413     // if the memory word differs we return it in oldv and signal a fail
2414     bind(nope);
2415     membar(AnyAny);
2416     mov(oldv, tmp);
2417   }
2418   if (fail)
2419     b(*fail);
2420 }
2421 
2422 // A generic CAS; success or failure is in the EQ flag.  A weak CAS
2423 // doesn't retry and may fail spuriously.  If the oldval is wanted,
// pass a register for the result, otherwise pass noreg.
2425 
2426 // Clobbers rscratch1
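// e.g. cmpxchg(addr, expected, newv, Assembler::xword, /*acquire*/ true,
// /*release*/ true, /*weak*/ false, noreg) followed by br(Assembler::EQ, ok).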
2427 void MacroAssembler::cmpxchg(Register addr, Register expected,
2428                              Register new_val,
2429                              enum operand_size size,
2430                              bool acquire, bool release,
2431                              bool weak,
2432                              Register result) {
2433   if (result == noreg)  result = rscratch1;
2434   BLOCK_COMMENT("cmpxchg {");
2435   if (UseLSE) {
2436     mov(result, expected);
2437     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2438     compare_eq(result, expected, size);
2439   } else {
2440     Label retry_load, done;
2441     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2442       prfm(Address(addr), PSTL1STRM);
2443     bind(retry_load);
2444     load_exclusive(result, addr, size, acquire);
2445     compare_eq(result, expected, size);
2446     br(Assembler::NE, done);
2447     store_exclusive(rscratch1, new_val, addr, size, release);
2448     if (weak) {
2449       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2450     } else {
2451       cbnzw(rscratch1, retry_load);
2452     }
2453     bind(done);
2454   }
2455   BLOCK_COMMENT("} cmpxchg");
2456 }
2457 
2458 // A generic comparison. Only compares for equality, clobbers rscratch1.
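// Halfword and byte operands have no native compare, so they are XOR-ed and
// the low 16 or 8 bits tested; Z is set iff the operands are equal.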
2459 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2460   if (size == xword) {
2461     cmp(rm, rn);
2462   } else if (size == word) {
2463     cmpw(rm, rn);
2464   } else if (size == halfword) {
2465     eorw(rscratch1, rm, rn);
2466     ands(zr, rscratch1, 0xffff);
2467   } else if (size == byte) {
2468     eorw(rscratch1, rm, rn);
2469     ands(zr, rscratch1, 0xff);
2470   } else {
2471     ShouldNotReachHere();
2472   }
2473 }
2474 
2475 
2476 static bool different(Register a, RegisterOrConstant b, Register c) {
2477   if (b.is_constant())
2478     return a != c;
2479   else
2480     return a != b.as_register() && a != c && b.as_register() != c;
2481 }
2482 
2483 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2484 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2485   if (UseLSE) {                                                         \
2486     prev = prev->is_valid() ? prev : zr;                                \
2487     if (incr.is_register()) {                                           \
2488       AOP(sz, incr.as_register(), prev, addr);                          \
2489     } else {                                                            \
2490       mov(rscratch2, incr.as_constant());                               \
2491       AOP(sz, rscratch2, prev, addr);                                   \
2492     }                                                                   \
2493     return;                                                             \
2494   }                                                                     \
2495   Register result = rscratch2;                                          \
2496   if (prev->is_valid())                                                 \
2497     result = different(prev, incr, addr) ? prev : rscratch2;            \
2498                                                                         \
2499   Label retry_load;                                                     \
2500   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2501     prfm(Address(addr), PSTL1STRM);                                     \
2502   bind(retry_load);                                                     \
2503   LDXR(result, addr);                                                   \
2504   OP(rscratch1, result, incr);                                          \
2505   STXR(rscratch2, rscratch1, addr);                                     \
2506   cbnzw(rscratch2, retry_load);                                         \
2507   if (prev->is_valid() && prev != result) {                             \
2508     IOP(prev, rscratch1, incr);                                         \
2509   }                                                                     \
2510 }
2511 
2512 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2513 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2514 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2515 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2516 
2517 #undef ATOMIC_OP
2518 
2519 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2520 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2521   if (UseLSE) {                                                         \
2522     prev = prev->is_valid() ? prev : zr;                                \
2523     AOP(sz, newv, prev, addr);                                          \
2524     return;                                                             \
2525   }                                                                     \
2526   Register result = rscratch2;                                          \
2527   if (prev->is_valid())                                                 \
2528     result = different(prev, newv, addr) ? prev : rscratch2;            \
2529                                                                         \
2530   Label retry_load;                                                     \
2531   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2532     prfm(Address(addr), PSTL1STRM);                                     \
2533   bind(retry_load);                                                     \
2534   LDXR(result, addr);                                                   \
2535   STXR(rscratch1, newv, addr);                                          \
2536   cbnzw(rscratch1, retry_load);                                         \
2537   if (prev->is_valid() && prev != result)                               \
2538     mov(prev, result);                                                  \
2539 }
2540 
2541 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2542 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2543 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2544 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2545 
2546 #undef ATOMIC_XCHG
2547 
2548 #ifndef PRODUCT
2549 extern "C" void findpc(intptr_t x);
2550 #endif
2551 
2552 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2553 {
  // In order to get locks to work, we need to fake an in_VM state
  if (ShowMessageBoxOnError) {
2556     JavaThread* thread = JavaThread::current();
2557     JavaThreadState saved_state = thread->thread_state();
2558     thread->set_thread_state(_thread_in_vm);
2559 #ifndef PRODUCT
2560     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2561       ttyLocker ttyl;
2562       BytecodeCounter::print();
2563     }
2564 #endif
2565     if (os::message_box(msg, "Execution stopped, print registers?")) {
2566       ttyLocker ttyl;
2567       tty->print_cr(" pc = 0x%016lx", pc);
2568 #ifndef PRODUCT
2569       tty->cr();
2570       findpc(pc);
2571       tty->cr();
2572 #endif
2573       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2574       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2575       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2576       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2577       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2578       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2579       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2580       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2581       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2582       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2583       tty->print_cr("r10 = 0x%016lx", regs[10]);
2584       tty->print_cr("r11 = 0x%016lx", regs[11]);
2585       tty->print_cr("r12 = 0x%016lx", regs[12]);
2586       tty->print_cr("r13 = 0x%016lx", regs[13]);
2587       tty->print_cr("r14 = 0x%016lx", regs[14]);
2588       tty->print_cr("r15 = 0x%016lx", regs[15]);
2589       tty->print_cr("r16 = 0x%016lx", regs[16]);
2590       tty->print_cr("r17 = 0x%016lx", regs[17]);
2591       tty->print_cr("r18 = 0x%016lx", regs[18]);
2592       tty->print_cr("r19 = 0x%016lx", regs[19]);
2593       tty->print_cr("r20 = 0x%016lx", regs[20]);
2594       tty->print_cr("r21 = 0x%016lx", regs[21]);
2595       tty->print_cr("r22 = 0x%016lx", regs[22]);
2596       tty->print_cr("r23 = 0x%016lx", regs[23]);
2597       tty->print_cr("r24 = 0x%016lx", regs[24]);
2598       tty->print_cr("r25 = 0x%016lx", regs[25]);
2599       tty->print_cr("r26 = 0x%016lx", regs[26]);
2600       tty->print_cr("r27 = 0x%016lx", regs[27]);
2601       tty->print_cr("r28 = 0x%016lx", regs[28]);
2602       tty->print_cr("r30 = 0x%016lx", regs[30]);
2603       tty->print_cr("r31 = 0x%016lx", regs[31]);
2604       BREAKPOINT;
2605     }
2606   }
2607   fatal("DEBUG MESSAGE: %s", msg);
2608 }
2609 
2610 void MacroAssembler::push_call_clobbered_registers() {
2611   int step = 4 * wordSize;
2612   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2613   sub(sp, sp, step);
2614   mov(rscratch1, -step);
2615   // Push v0-v7, v16-v31.
2616   for (int i = 31; i>= 4; i -= 4) {
2617     if (i <= v7->encoding() || i >= v16->encoding())
2618       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2619           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2620   }
2621   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2622       as_FloatRegister(3), T1D, Address(sp));
2623 }
2624 
2625 void MacroAssembler::pop_call_clobbered_registers() {
2626   for (int i = 0; i < 32; i += 4) {
2627     if (i <= v7->encoding() || i >= v16->encoding())
2628       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2629           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2630   }
2631 
2632   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2633 }
2634 
2635 void MacroAssembler::push_CPU_state(bool save_vectors) {
2636   int step = (save_vectors ? 8 : 4) * wordSize;
2637   push(0x3fffffff, sp);         // integer registers except lr & sp
2638   mov(rscratch1, -step);
2639   sub(sp, sp, step);
2640   for (int i = 28; i >= 4; i -= 4) {
2641     st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2642         as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2643   }
2644   st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2645 }
2646 
2647 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2648   int step = (restore_vectors ? 8 : 4) * wordSize;
2649   for (int i = 0; i <= 28; i += 4)
2650     ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2651         as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2652   pop(0x3fffffff, sp);         // integer registers except lr & sp
2653 }
2654 
2655 /**
2656  * Helpers for multiply_to_len().
2657  */
2658 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2659                                      Register src1, Register src2) {
2660   adds(dest_lo, dest_lo, src1);
2661   adc(dest_hi, dest_hi, zr);
2662   adds(dest_lo, dest_lo, src2);
2663   adc(final_dest_hi, dest_hi, zr);
2664 }
2665 
2666 // Generate an address from (r + r1 extend offset).  "size" is the
2667 // size of the operand.  The result may be in rscratch2.
2668 Address MacroAssembler::offsetted_address(Register r, Register r1,
2669                                           Address::extend ext, int offset, int size) {
2670   if (offset || (ext.shift() % size != 0)) {
2671     lea(rscratch2, Address(r, r1, ext));
2672     return Address(rscratch2, offset);
2673   } else {
2674     return Address(r, r1, ext);
2675   }
2676 }
2677 
2678 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2679 {
2680   assert(offset >= 0, "spill to negative address?");
2681   // Offset reachable ?
2682   //   Not aligned - 9 bits signed offset
2683   //   Aligned - 12 bits unsigned offset shifted
2684   Register base = sp;
2685   if ((offset & (size-1)) && offset >= (1<<8)) {
2686     add(tmp, base, offset & ((1<<12)-1));
2687     base = tmp;
2688     offset &= -1u<<12;
2689   }
2690 
2691   if (offset >= (1<<12) * size) {
2692     add(tmp, base, offset & (((1<<12)-1)<<12));
2693     base = tmp;
2694     offset &= ~(((1<<12)-1)<<12);
2695   }
2696 
2697   return Address(base, offset);
2698 }
2699 
2700 // Checks whether offset is aligned.
2701 // Returns true if it is, else false.
2702 bool MacroAssembler::merge_alignment_check(Register base,
2703                                            size_t size,
2704                                            long cur_offset,
2705                                            long prev_offset) const {
2706   if (AvoidUnalignedAccesses) {
2707     if (base == sp) {
      // Check whether the lower offset is aligned to a register pair.
2709       long pair_mask = size * 2 - 1;
2710       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2711       return (offset & pair_mask) == 0;
2712     } else { // If base is not sp, we can't guarantee the access is aligned.
2713       return false;
2714     }
2715   } else {
2716     long mask = size - 1;
2717     // Load/store pair instruction only supports element size aligned offset.
2718     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2719   }
2720 }
2721 
2722 // Checks whether current and previous loads/stores can be merged.
2723 // Returns true if it can be merged, else false.
2724 bool MacroAssembler::ldst_can_merge(Register rt,
2725                                     const Address &adr,
2726                                     size_t cur_size_in_bytes,
2727                                     bool is_store) const {
2728   address prev = pc() - NativeInstruction::instruction_size;
2729   address last = code()->last_insn();
2730 
2731   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2732     return false;
2733   }
2734 
2735   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2736     return false;
2737   }
2738 
2739   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2740   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2741 
2742   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2743   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2744 
2745   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2746     return false;
2747   }
2748 
2749   long max_offset = 63 * prev_size_in_bytes;
2750   long min_offset = -64 * prev_size_in_bytes;
2751 
2752   assert(prev_ldst->is_not_pre_post_index(), "merging pre-indexed or post-indexed accesses is not supported.");
2753 
2754   // Only same base can be merged.
2755   if (adr.base() != prev_ldst->base()) {
2756     return false;
2757   }
2758 
2759   long cur_offset = adr.offset();
2760   long prev_offset = prev_ldst->offset();
2761   size_t diff = abs(cur_offset - prev_offset);
2762   if (diff != prev_size_in_bytes) {
2763     return false;
2764   }
2765 
2766   // The following cases cannot be merged:
2767   // ldr x2, [x2, #8]
2768   // ldr x3, [x2, #16]
2769   // or:
2770   // ldr x2, [x3, #8]
2771   // ldr x2, [x3, #16]
2772   // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2773   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2774     return false;
2775   }
2776 
2777   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2778   // Offset range must be in ldp/stp instruction's range.
2779   if (low_offset > max_offset || low_offset < min_offset) {
2780     return false;
2781   }
2782 
2783   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2784     return true;
2785   }
2786 
2787   return false;
2788 }
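
// Illustration (a sketch, not code emitted here): when the checks above pass,
// a sequence such as
//   ldr x1, [sp, #16]
//   ldr x2, [sp, #24]
// is rewritten by merge_ldst() below into the single instruction
//   ldp x1, x2, [sp, #16]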
2789 
2790 // Merge current load/store with previous load/store into ldp/stp.
2791 void MacroAssembler::merge_ldst(Register rt,
2792                                 const Address &adr,
2793                                 size_t cur_size_in_bytes,
2794                                 bool is_store) {
2795 
2796   assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store) == true, "cur and prev must be able to be merged.");
2797 
2798   Register rt_low, rt_high;
2799   address prev = pc() - NativeInstruction::instruction_size;
2800   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2801 
2802   long offset;
2803 
2804   if (adr.offset() < prev_ldst->offset()) {
2805     offset = adr.offset();
2806     rt_low = rt;
2807     rt_high = prev_ldst->target();
2808   } else {
2809     offset = prev_ldst->offset();
2810     rt_low = prev_ldst->target();
2811     rt_high = rt;
2812   }
2813 
2814   Address adr_p = Address(prev_ldst->base(), offset);
2815   // Overwrite previous generated binary.
2816   code_section()->set_end(prev);
2817 
2818   const int sz = prev_ldst->size_in_bytes();
2819   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
2820   if (!is_store) {
2821     BLOCK_COMMENT("merged ldr pair");
2822     if (sz == 8) {
2823       ldp(rt_low, rt_high, adr_p);
2824     } else {
2825       ldpw(rt_low, rt_high, adr_p);
2826     }
2827   } else {
2828     BLOCK_COMMENT("merged str pair");
2829     if (sz == 8) {
2830       stp(rt_low, rt_high, adr_p);
2831     } else {
2832       stpw(rt_low, rt_high, adr_p);
2833     }
2834   }
2835 }
2836 
2837 /**
2838  * Multiply 64 bit by 64 bit first loop.
2839  */
2840 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2841                                            Register y, Register y_idx, Register z,
2842                                            Register carry, Register product,
2843                                            Register idx, Register kdx) {
2844   //
2845   //  jlong carry, x[], y[], z[];
2846   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2847   //    huge_128 product = y[idx] * x[xstart] + carry;
2848   //    z[kdx] = (jlong)product;
2849   //    carry  = (jlong)(product >>> 64);
2850   //  }
2851   //  z[xstart] = carry;
2852   //
2853 
2854   Label L_first_loop, L_first_loop_exit;
2855   Label L_one_x, L_one_y, L_multiply;
2856 
2857   subsw(xstart, xstart, 1);
2858   br(Assembler::MI, L_one_x);
2859 
2860   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2861   ldr(x_xstart, Address(rscratch1));
2862   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2863 
2864   bind(L_first_loop);
2865   subsw(idx, idx, 1);
2866   br(Assembler::MI, L_first_loop_exit);
2867   subsw(idx, idx, 1);
2868   br(Assembler::MI, L_one_y);
2869   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2870   ldr(y_idx, Address(rscratch1));
2871   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2872   bind(L_multiply);
2873 
2874   // AArch64 has a multiply-accumulate instruction that we can't use
2875   // here because it has no way to process carries, so we have to use
2876   // separate add and adc instructions.  Bah.
2877   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2878   mul(product, x_xstart, y_idx);
2879   adds(product, product, carry);
2880   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2881 
2882   subw(kdx, kdx, 2);
2883   ror(product, product, 32); // back to big-endian
2884   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2885 
2886   b(L_first_loop);
2887 
2888   bind(L_one_y);
2889   ldrw(y_idx, Address(y,  0));
2890   b(L_multiply);
2891 
2892   bind(L_one_x);
2893   ldrw(x_xstart, Address(x,  0));
2894   b(L_first_loop);
2895 
2896   bind(L_first_loop_exit);
2897 }
2898 
2899 /**
2900  * Multiply 128 bit by 128 bit. Unrolled inner loop.
2901  *
2902  */
2903 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2904                                              Register carry, Register carry2,
2905                                              Register idx, Register jdx,
2906                                              Register yz_idx1, Register yz_idx2,
2907                                              Register tmp, Register tmp3, Register tmp4,
2908                                              Register tmp6, Register product_hi) {
2909 
2910   //   jlong carry, x[], y[], z[];
2911   //   int kdx = ystart+1;
2912   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2913   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2914   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2915   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2916   //     carry  = (jlong)(tmp4 >>> 64);
2917   //     z[kdx+idx+1] = (jlong)tmp3;
2918   //     z[kdx+idx] = (jlong)tmp4;
2919   //   }
2920   //   idx += 2;
2921   //   if (idx > 0) {
2922   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2923   //     z[kdx+idx] = (jlong)yz_idx1;
2924   //     carry  = (jlong)(yz_idx1 >>> 64);
2925   //   }
2926   //
2927 
2928   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2929 
2930   lsrw(jdx, idx, 2);
2931 
2932   bind(L_third_loop);
2933 
2934   subsw(jdx, jdx, 1);
2935   br(Assembler::MI, L_third_loop_exit);
2936   subw(idx, idx, 4);
2937 
2938   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2939 
2940   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2941 
2942   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2943 
2944   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2945   ror(yz_idx2, yz_idx2, 32);
2946 
2947   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2948 
2949   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2950   umulh(tmp4, product_hi, yz_idx1);
2951 
2952   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2953   ror(rscratch2, rscratch2, 32);
2954 
2955   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2956   umulh(carry2, product_hi, yz_idx2);
2957 
2958   // propagate sum of both multiplications into carry:tmp4:tmp3
2959   adds(tmp3, tmp3, carry);
2960   adc(tmp4, tmp4, zr);
2961   adds(tmp3, tmp3, rscratch1);
2962   adcs(tmp4, tmp4, tmp);
2963   adc(carry, carry2, zr);
2964   adds(tmp4, tmp4, rscratch2);
2965   adc(carry, carry, zr);
2966 
2967   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2968   ror(tmp4, tmp4, 32);
2969   stp(tmp4, tmp3, Address(tmp6, 0));
2970 
2971   b(L_third_loop);
2972   bind (L_third_loop_exit);
2973 
2974   andw (idx, idx, 0x3);
2975   cbz(idx, L_post_third_loop_done);
2976 
2977   Label L_check_1;
2978   subsw(idx, idx, 2);
2979   br(Assembler::MI, L_check_1);
2980 
2981   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2982   ldr(yz_idx1, Address(rscratch1, 0));
2983   ror(yz_idx1, yz_idx1, 32);
2984   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2985   umulh(tmp4, product_hi, yz_idx1);
2986   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2987   ldr(yz_idx2, Address(rscratch1, 0));
2988   ror(yz_idx2, yz_idx2, 32);
2989 
2990   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2991 
2992   ror(tmp3, tmp3, 32);
2993   str(tmp3, Address(rscratch1, 0));
2994 
2995   bind (L_check_1);
2996 
2997   andw (idx, idx, 0x1);
2998   subsw(idx, idx, 1);
2999   br(Assembler::MI, L_post_third_loop_done);
3000   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3001   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
3002   umulh(carry2, tmp4, product_hi);
3003   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3004 
3005   add2_with_carry(carry2, tmp3, tmp4, carry);
3006 
3007   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3008   extr(carry, carry2, tmp3, 32);
3009 
3010   bind(L_post_third_loop_done);
3011 }
3012 
3013 /**
3014  * Code for BigInteger::multiplyToLen() intrinsic.
3015  *
3016  * r0: x
3017  * r1: xlen
3018  * r2: y
3019  * r3: ylen
3020  * r4: z
3021  * r5: zlen
3022  * r10: tmp1
3023  * r11: tmp2
3024  * r12: tmp3
3025  * r13: tmp4
3026  * r14: tmp5
3027  * r15: tmp6
3028  * r16: tmp7
3029  *
3030  */
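// The Java-side counterpart is roughly (the exact signature may differ between
// JDK versions):
//   private static int[] multiplyToLen(int[] x, int xlen, int[] y, int ylen, int[] z)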
3031 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3032                                      Register z, Register zlen,
3033                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3034                                      Register tmp5, Register tmp6, Register product_hi) {
3035 
3036   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3037 
3038   const Register idx = tmp1;
3039   const Register kdx = tmp2;
3040   const Register xstart = tmp3;
3041 
3042   const Register y_idx = tmp4;
3043   const Register carry = tmp5;
3044   const Register product  = xlen;
3045   const Register x_xstart = zlen;  // reuse register
3046 
3047   // First Loop.
3048   //
3049   //  final static long LONG_MASK = 0xffffffffL;
3050   //  int xstart = xlen - 1;
3051   //  int ystart = ylen - 1;
3052   //  long carry = 0;
3053   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3054   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3055   //    z[kdx] = (int)product;
3056   //    carry = product >>> 32;
3057   //  }
3058   //  z[xstart] = (int)carry;
3059   //
3060 
3061   movw(idx, ylen);      // idx = ylen;
3062   movw(kdx, zlen);      // kdx = xlen+ylen;
3063   mov(carry, zr);       // carry = 0;
3064 
3065   Label L_done;
3066 
3067   movw(xstart, xlen);
3068   subsw(xstart, xstart, 1);
3069   br(Assembler::MI, L_done);
3070 
3071   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3072 
3073   Label L_second_loop;
3074   cbzw(kdx, L_second_loop);
3075 
3076   Label L_carry;
3077   subw(kdx, kdx, 1);
3078   cbzw(kdx, L_carry);
3079 
3080   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3081   lsr(carry, carry, 32);
3082   subw(kdx, kdx, 1);
3083 
3084   bind(L_carry);
3085   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3086 
3087   // Second and third (nested) loops.
3088   //
3089   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3090   //   carry = 0;
3091   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3092   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3093   //                    (z[k] & LONG_MASK) + carry;
3094   //     z[k] = (int)product;
3095   //     carry = product >>> 32;
3096   //   }
3097   //   z[i] = (int)carry;
3098   // }
3099   //
3100   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3101 
3102   const Register jdx = tmp1;
3103 
3104   bind(L_second_loop);
3105   mov(carry, zr);                // carry = 0;
3106   movw(jdx, ylen);               // j = ystart+1
3107 
3108   subsw(xstart, xstart, 1);      // i = xstart-1;
3109   br(Assembler::MI, L_done);
3110 
3111   str(z, Address(pre(sp, -4 * wordSize)));
3112 
3113   Label L_last_x;
3114   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3115   subsw(xstart, xstart, 1);       // i = xstart-1;
3116   br(Assembler::MI, L_last_x);
3117 
3118   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3119   ldr(product_hi, Address(rscratch1));
3120   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3121 
3122   Label L_third_loop_prologue;
3123   bind(L_third_loop_prologue);
3124 
3125   str(ylen, Address(sp, wordSize));
3126   stp(x, xstart, Address(sp, 2 * wordSize));
3127   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3128                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3129   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3130   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3131 
3132   addw(tmp3, xlen, 1);
3133   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3134   subsw(tmp3, tmp3, 1);
3135   br(Assembler::MI, L_done);
3136 
3137   lsr(carry, carry, 32);
3138   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3139   b(L_second_loop);
3140 
3141   // The following infrequently executed code is placed outside the loops.
3142   bind(L_last_x);
3143   ldrw(product_hi, Address(x,  0));
3144   b(L_third_loop_prologue);
3145 
3146   bind(L_done);
3147 }
3148 
3149 // Code for BigInteger::mulAdd intrinsic
3150 // out     = r0
3151 // in      = r1
3152 // offset  = r2  (already out.length-offset)
3153 // len     = r3
3154 // k       = r4
3155 //
3156 // pseudo code from java implementation:
3157 // carry = 0;
3158 // offset = out.length-offset - 1;
3159 // for (int j=len-1; j >= 0; j--) {
3160 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3161 //     out[offset--] = (int)product;
3162 //     carry = product >>> 32;
3163 // }
3164 // return (int)carry;
3165 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3166       Register len, Register k) {
3167     Label LOOP, END;
3168     // pre-loop
3169     cmp(len, zr); // cmp, not cbz/cbnz: we use the condition twice => fewer branches
3170     csel(out, zr, out, Assembler::EQ);
3171     br(Assembler::EQ, END);
3172     add(in, in, len, LSL, 2); // in[j+1] address
3173     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3174     mov(out, zr); // used to keep carry now
3175     BIND(LOOP);
3176     ldrw(rscratch1, Address(pre(in, -4)));
3177     madd(rscratch1, rscratch1, k, out);
3178     ldrw(rscratch2, Address(pre(offset, -4)));
3179     add(rscratch1, rscratch1, rscratch2);
3180     strw(rscratch1, Address(offset));
3181     lsr(out, rscratch1, 32);
3182     subs(len, len, 1);
3183     br(Assembler::NE, LOOP);
3184     BIND(END);
3185 }
3186 
3187 /**
3188  * Emits code to update CRC-32 with a byte value according to constants in table
3189  *
3190  * @param [in,out]crc   Register containing the crc.
3191  * @param [in]val       Register containing the byte to fold into the CRC.
3192  * @param [in]table     Register containing the table of crc constants.
3193  *
3194  * uint32_t crc;
3195  * val = crc_table[(val ^ crc) & 0xFF];
3196  * crc = val ^ (crc >> 8);
3197  *
3198  */
3199 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3200   eor(val, val, crc);
3201   andr(val, val, 0xff);
3202   ldrw(val, Address(table, val, Address::lsl(2)));
3203   eor(crc, val, crc, Assembler::LSR, 8);
3204 }
3205 
3206 /**
3207  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3208  *
3209  * @param [in,out]crc   Register containing the crc.
3210  * @param [in]v         Register containing the 32-bit to fold into the CRC.
3211  * @param [in]table0    Register containing table 0 of crc constants.
3212  * @param [in]table1    Register containing table 1 of crc constants.
3213  * @param [in]table2    Register containing table 2 of crc constants.
3214  * @param [in]table3    Register containing table 3 of crc constants.
3215  *
3216  * uint32_t crc;
3217  *   v = crc ^ v
3218  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3219  *
3220  */
3221 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3222         Register table0, Register table1, Register table2, Register table3,
3223         bool upper) {
3224   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3225   uxtb(tmp, v);
3226   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3227   ubfx(tmp, v, 8, 8);
3228   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3229   eor(crc, crc, tmp);
3230   ubfx(tmp, v, 16, 8);
3231   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3232   eor(crc, crc, tmp);
3233   ubfx(tmp, v, 24, 8);
3234   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3235   eor(crc, crc, tmp);
3236 }
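
// For reference, a rough C sketch of how the table-driven kernel below consumes
// its input one 32-bit word at a time (illustration only; load_le32 is a
// hypothetical little-endian 4-byte load and t0..t3 are the four 256-entry tables):
//
//   while (len >= 4) {
//     uint32_t v = crc ^ load_le32(buf); buf += 4; len -= 4;
//     crc = t3[v & 0xff] ^ t2[(v >> 8) & 0xff]
//         ^ t1[(v >> 16) & 0xff] ^ t0[v >> 24];
//   }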
3237 
3238 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3239         Register len, Register tmp0, Register tmp1, Register tmp2,
3240         Register tmp3) {
3241     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3242     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3243 
3244     mvnw(crc, crc);
3245 
3246     subs(len, len, 128);
3247     br(Assembler::GE, CRC_by64_pre);
3248   BIND(CRC_less64);
3249     adds(len, len, 128-32);
3250     br(Assembler::GE, CRC_by32_loop);
3251   BIND(CRC_less32);
3252     adds(len, len, 32-4);
3253     br(Assembler::GE, CRC_by4_loop);
3254     adds(len, len, 4);
3255     br(Assembler::GT, CRC_by1_loop);
3256     b(L_exit);
3257 
3258   BIND(CRC_by32_loop);
3259     ldp(tmp0, tmp1, Address(post(buf, 16)));
3260     subs(len, len, 32);
3261     crc32x(crc, crc, tmp0);
3262     ldr(tmp2, Address(post(buf, 8)));
3263     crc32x(crc, crc, tmp1);
3264     ldr(tmp3, Address(post(buf, 8)));
3265     crc32x(crc, crc, tmp2);
3266     crc32x(crc, crc, tmp3);
3267     br(Assembler::GE, CRC_by32_loop);
3268     cmn(len, 32);
3269     br(Assembler::NE, CRC_less32);
3270     b(L_exit);
3271 
3272   BIND(CRC_by4_loop);
3273     ldrw(tmp0, Address(post(buf, 4)));
3274     subs(len, len, 4);
3275     crc32w(crc, crc, tmp0);
3276     br(Assembler::GE, CRC_by4_loop);
3277     adds(len, len, 4);
3278     br(Assembler::LE, L_exit);
3279   BIND(CRC_by1_loop);
3280     ldrb(tmp0, Address(post(buf, 1)));
3281     subs(len, len, 1);
3282     crc32b(crc, crc, tmp0);
3283     br(Assembler::GT, CRC_by1_loop);
3284     b(L_exit);
3285 
3286   BIND(CRC_by64_pre);
3287     sub(buf, buf, 8);
3288     ldp(tmp0, tmp1, Address(buf, 8));
3289     crc32x(crc, crc, tmp0);
3290     ldr(tmp2, Address(buf, 24));
3291     crc32x(crc, crc, tmp1);
3292     ldr(tmp3, Address(buf, 32));
3293     crc32x(crc, crc, tmp2);
3294     ldr(tmp0, Address(buf, 40));
3295     crc32x(crc, crc, tmp3);
3296     ldr(tmp1, Address(buf, 48));
3297     crc32x(crc, crc, tmp0);
3298     ldr(tmp2, Address(buf, 56));
3299     crc32x(crc, crc, tmp1);
3300     ldr(tmp3, Address(pre(buf, 64)));
3301 
3302     b(CRC_by64_loop);
3303 
3304     align(CodeEntryAlignment);
3305   BIND(CRC_by64_loop);
3306     subs(len, len, 64);
3307     crc32x(crc, crc, tmp2);
3308     ldr(tmp0, Address(buf, 8));
3309     crc32x(crc, crc, tmp3);
3310     ldr(tmp1, Address(buf, 16));
3311     crc32x(crc, crc, tmp0);
3312     ldr(tmp2, Address(buf, 24));
3313     crc32x(crc, crc, tmp1);
3314     ldr(tmp3, Address(buf, 32));
3315     crc32x(crc, crc, tmp2);
3316     ldr(tmp0, Address(buf, 40));
3317     crc32x(crc, crc, tmp3);
3318     ldr(tmp1, Address(buf, 48));
3319     crc32x(crc, crc, tmp0);
3320     ldr(tmp2, Address(buf, 56));
3321     crc32x(crc, crc, tmp1);
3322     ldr(tmp3, Address(pre(buf, 64)));
3323     br(Assembler::GE, CRC_by64_loop);
3324 
3325     // post-loop
3326     crc32x(crc, crc, tmp2);
3327     crc32x(crc, crc, tmp3);
3328 
3329     sub(len, len, 64);
3330     add(buf, buf, 8);
3331     cmn(len, 128);
3332     br(Assembler::NE, CRC_less64);
3333   BIND(L_exit);
3334     mvnw(crc, crc);
3335 }
3336 
3337 /**
3338  * @param crc   register containing existing CRC (32-bit)
3339  * @param buf   register pointing to input byte buffer (byte*)
3340  * @param len   register containing number of bytes
3341  * @param table0 .. table3  registers that will contain the addresses of the CRC tables
3342  * @param tmp, tmp2, tmp3   scratch registers
3343  */
3344 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3345         Register table0, Register table1, Register table2, Register table3,
3346         Register tmp, Register tmp2, Register tmp3) {
3347   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3348   unsigned long offset;
3349 
3350   if (UseCRC32) {
3351       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3352       return;
3353   }
3354 
3355     mvnw(crc, crc);
3356 
3357     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3358     if (offset) add(table0, table0, offset);
3359     add(table1, table0, 1*256*sizeof(juint));
3360     add(table2, table0, 2*256*sizeof(juint));
3361     add(table3, table0, 3*256*sizeof(juint));
3362 
3363   if (UseNeon) {
3364       cmp(len, (u1)64);
3365       br(Assembler::LT, L_by16);
3366       eor(v16, T16B, v16, v16);
3367 
3368     Label L_fold;
3369 
3370       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3371 
3372       ld1(v0, v1, T2D, post(buf, 32));
3373       ld1r(v4, T2D, post(tmp, 8));
3374       ld1r(v5, T2D, post(tmp, 8));
3375       ld1r(v6, T2D, post(tmp, 8));
3376       ld1r(v7, T2D, post(tmp, 8));
3377       mov(v16, T4S, 0, crc);
3378 
3379       eor(v0, T16B, v0, v16);
3380       sub(len, len, 64);
3381 
3382     BIND(L_fold);
3383       pmull(v22, T8H, v0, v5, T8B);
3384       pmull(v20, T8H, v0, v7, T8B);
3385       pmull(v23, T8H, v0, v4, T8B);
3386       pmull(v21, T8H, v0, v6, T8B);
3387 
3388       pmull2(v18, T8H, v0, v5, T16B);
3389       pmull2(v16, T8H, v0, v7, T16B);
3390       pmull2(v19, T8H, v0, v4, T16B);
3391       pmull2(v17, T8H, v0, v6, T16B);
3392 
3393       uzp1(v24, T8H, v20, v22);
3394       uzp2(v25, T8H, v20, v22);
3395       eor(v20, T16B, v24, v25);
3396 
3397       uzp1(v26, T8H, v16, v18);
3398       uzp2(v27, T8H, v16, v18);
3399       eor(v16, T16B, v26, v27);
3400 
3401       ushll2(v22, T4S, v20, T8H, 8);
3402       ushll(v20, T4S, v20, T4H, 8);
3403 
3404       ushll2(v18, T4S, v16, T8H, 8);
3405       ushll(v16, T4S, v16, T4H, 8);
3406 
3407       eor(v22, T16B, v23, v22);
3408       eor(v18, T16B, v19, v18);
3409       eor(v20, T16B, v21, v20);
3410       eor(v16, T16B, v17, v16);
3411 
3412       uzp1(v17, T2D, v16, v20);
3413       uzp2(v21, T2D, v16, v20);
3414       eor(v17, T16B, v17, v21);
3415 
3416       ushll2(v20, T2D, v17, T4S, 16);
3417       ushll(v16, T2D, v17, T2S, 16);
3418 
3419       eor(v20, T16B, v20, v22);
3420       eor(v16, T16B, v16, v18);
3421 
3422       uzp1(v17, T2D, v20, v16);
3423       uzp2(v21, T2D, v20, v16);
3424       eor(v28, T16B, v17, v21);
3425 
3426       pmull(v22, T8H, v1, v5, T8B);
3427       pmull(v20, T8H, v1, v7, T8B);
3428       pmull(v23, T8H, v1, v4, T8B);
3429       pmull(v21, T8H, v1, v6, T8B);
3430 
3431       pmull2(v18, T8H, v1, v5, T16B);
3432       pmull2(v16, T8H, v1, v7, T16B);
3433       pmull2(v19, T8H, v1, v4, T16B);
3434       pmull2(v17, T8H, v1, v6, T16B);
3435 
3436       ld1(v0, v1, T2D, post(buf, 32));
3437 
3438       uzp1(v24, T8H, v20, v22);
3439       uzp2(v25, T8H, v20, v22);
3440       eor(v20, T16B, v24, v25);
3441 
3442       uzp1(v26, T8H, v16, v18);
3443       uzp2(v27, T8H, v16, v18);
3444       eor(v16, T16B, v26, v27);
3445 
3446       ushll2(v22, T4S, v20, T8H, 8);
3447       ushll(v20, T4S, v20, T4H, 8);
3448 
3449       ushll2(v18, T4S, v16, T8H, 8);
3450       ushll(v16, T4S, v16, T4H, 8);
3451 
3452       eor(v22, T16B, v23, v22);
3453       eor(v18, T16B, v19, v18);
3454       eor(v20, T16B, v21, v20);
3455       eor(v16, T16B, v17, v16);
3456 
3457       uzp1(v17, T2D, v16, v20);
3458       uzp2(v21, T2D, v16, v20);
3459       eor(v16, T16B, v17, v21);
3460 
3461       ushll2(v20, T2D, v16, T4S, 16);
3462       ushll(v16, T2D, v16, T2S, 16);
3463 
3464       eor(v20, T16B, v22, v20);
3465       eor(v16, T16B, v16, v18);
3466 
3467       uzp1(v17, T2D, v20, v16);
3468       uzp2(v21, T2D, v20, v16);
3469       eor(v20, T16B, v17, v21);
3470 
3471       shl(v16, T2D, v28, 1);
3472       shl(v17, T2D, v20, 1);
3473 
3474       eor(v0, T16B, v0, v16);
3475       eor(v1, T16B, v1, v17);
3476 
3477       subs(len, len, 32);
3478       br(Assembler::GE, L_fold);
3479 
3480       mov(crc, 0);
3481       mov(tmp, v0, T1D, 0);
3482       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3483       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3484       mov(tmp, v0, T1D, 1);
3485       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3486       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3487       mov(tmp, v1, T1D, 0);
3488       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3489       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3490       mov(tmp, v1, T1D, 1);
3491       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3492       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3493 
3494       add(len, len, 32);
3495   }
3496 
3497   BIND(L_by16);
3498     subs(len, len, 16);
3499     br(Assembler::GE, L_by16_loop);
3500     adds(len, len, 16-4);
3501     br(Assembler::GE, L_by4_loop);
3502     adds(len, len, 4);
3503     br(Assembler::GT, L_by1_loop);
3504     b(L_exit);
3505 
3506   BIND(L_by4_loop);
3507     ldrw(tmp, Address(post(buf, 4)));
3508     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3509     subs(len, len, 4);
3510     br(Assembler::GE, L_by4_loop);
3511     adds(len, len, 4);
3512     br(Assembler::LE, L_exit);
3513   BIND(L_by1_loop);
3514     subs(len, len, 1);
3515     ldrb(tmp, Address(post(buf, 1)));
3516     update_byte_crc32(crc, tmp, table0);
3517     br(Assembler::GT, L_by1_loop);
3518     b(L_exit);
3519 
3520     align(CodeEntryAlignment);
3521   BIND(L_by16_loop);
3522     subs(len, len, 16);
3523     ldp(tmp, tmp3, Address(post(buf, 16)));
3524     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3525     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3526     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3527     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3528     br(Assembler::GE, L_by16_loop);
3529     adds(len, len, 16-4);
3530     br(Assembler::GE, L_by4_loop);
3531     adds(len, len, 4);
3532     br(Assembler::GT, L_by1_loop);
3533   BIND(L_exit);
3534     mvnw(crc, crc);
3535 }
3536 
3537 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3538         Register len, Register tmp0, Register tmp1, Register tmp2,
3539         Register tmp3) {
3540     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3541     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3542 
3543     subs(len, len, 128);
3544     br(Assembler::GE, CRC_by64_pre);
3545   BIND(CRC_less64);
3546     adds(len, len, 128-32);
3547     br(Assembler::GE, CRC_by32_loop);
3548   BIND(CRC_less32);
3549     adds(len, len, 32-4);
3550     br(Assembler::GE, CRC_by4_loop);
3551     adds(len, len, 4);
3552     br(Assembler::GT, CRC_by1_loop);
3553     b(L_exit);
3554 
3555   BIND(CRC_by32_loop);
3556     ldp(tmp0, tmp1, Address(post(buf, 16)));
3557     subs(len, len, 32);
3558     crc32cx(crc, crc, tmp0);
3559     ldr(tmp2, Address(post(buf, 8)));
3560     crc32cx(crc, crc, tmp1);
3561     ldr(tmp3, Address(post(buf, 8)));
3562     crc32cx(crc, crc, tmp2);
3563     crc32cx(crc, crc, tmp3);
3564     br(Assembler::GE, CRC_by32_loop);
3565     cmn(len, 32);
3566     br(Assembler::NE, CRC_less32);
3567     b(L_exit);
3568 
3569   BIND(CRC_by4_loop);
3570     ldrw(tmp0, Address(post(buf, 4)));
3571     subs(len, len, 4);
3572     crc32cw(crc, crc, tmp0);
3573     br(Assembler::GE, CRC_by4_loop);
3574     adds(len, len, 4);
3575     br(Assembler::LE, L_exit);
3576   BIND(CRC_by1_loop);
3577     ldrb(tmp0, Address(post(buf, 1)));
3578     subs(len, len, 1);
3579     crc32cb(crc, crc, tmp0);
3580     br(Assembler::GT, CRC_by1_loop);
3581     b(L_exit);
3582 
3583   BIND(CRC_by64_pre);
3584     sub(buf, buf, 8);
3585     ldp(tmp0, tmp1, Address(buf, 8));
3586     crc32cx(crc, crc, tmp0);
3587     ldr(tmp2, Address(buf, 24));
3588     crc32cx(crc, crc, tmp1);
3589     ldr(tmp3, Address(buf, 32));
3590     crc32cx(crc, crc, tmp2);
3591     ldr(tmp0, Address(buf, 40));
3592     crc32cx(crc, crc, tmp3);
3593     ldr(tmp1, Address(buf, 48));
3594     crc32cx(crc, crc, tmp0);
3595     ldr(tmp2, Address(buf, 56));
3596     crc32cx(crc, crc, tmp1);
3597     ldr(tmp3, Address(pre(buf, 64)));
3598 
3599     b(CRC_by64_loop);
3600 
3601     align(CodeEntryAlignment);
3602   BIND(CRC_by64_loop);
3603     subs(len, len, 64);
3604     crc32cx(crc, crc, tmp2);
3605     ldr(tmp0, Address(buf, 8));
3606     crc32cx(crc, crc, tmp3);
3607     ldr(tmp1, Address(buf, 16));
3608     crc32cx(crc, crc, tmp0);
3609     ldr(tmp2, Address(buf, 24));
3610     crc32cx(crc, crc, tmp1);
3611     ldr(tmp3, Address(buf, 32));
3612     crc32cx(crc, crc, tmp2);
3613     ldr(tmp0, Address(buf, 40));
3614     crc32cx(crc, crc, tmp3);
3615     ldr(tmp1, Address(buf, 48));
3616     crc32cx(crc, crc, tmp0);
3617     ldr(tmp2, Address(buf, 56));
3618     crc32cx(crc, crc, tmp1);
3619     ldr(tmp3, Address(pre(buf, 64)));
3620     br(Assembler::GE, CRC_by64_loop);
3621 
3622     // post-loop
3623     crc32cx(crc, crc, tmp2);
3624     crc32cx(crc, crc, tmp3);
3625 
3626     sub(len, len, 64);
3627     add(buf, buf, 8);
3628     cmn(len, 128);
3629     br(Assembler::NE, CRC_less64);
3630   BIND(L_exit);
3631 }
3632 
3633 /**
3634  * @param crc   register containing existing CRC (32-bit)
3635  * @param buf   register pointing to input byte buffer (byte*)
3636  * @param len   register containing number of bytes
3637  * @param table0 .. table3  registers that will contain the addresses of the CRC tables
3638  * @param tmp, tmp2, tmp3   scratch registers
3639  */
3640 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3641         Register table0, Register table1, Register table2, Register table3,
3642         Register tmp, Register tmp2, Register tmp3) {
3643   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3644 }
3645 
3646 
3647 SkipIfEqual::SkipIfEqual(
3648     MacroAssembler* masm, const bool* flag_addr, bool value) {
3649   _masm = masm;
3650   unsigned long offset;
3651   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3652   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3653   _masm->cbzw(rscratch1, _label);
3654 }
3655 
3656 SkipIfEqual::~SkipIfEqual() {
3657   _masm->bind(_label);
3658 }
3659 
3660 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3661   Address adr;
3662   switch(dst.getMode()) {
3663   case Address::base_plus_offset:
3664     // This is the expected mode, although we allow all the other
3665     // forms below.
3666     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3667     break;
3668   default:
3669     lea(rscratch2, dst);
3670     adr = Address(rscratch2);
3671     break;
3672   }
3673   ldr(rscratch1, adr);
3674   add(rscratch1, rscratch1, src);
3675   str(rscratch1, adr);
3676 }
3677 
3678 void MacroAssembler::cmpptr(Register src1, Address src2) {
3679   unsigned long offset;
3680   adrp(rscratch1, src2, offset);
3681   ldr(rscratch1, Address(rscratch1, offset));
3682   cmp(src1, rscratch1);
3683 }
3684 
3685 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3686   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3687   bs->obj_equals(this, obj1, obj2);
3688 }
3689 
3690 void MacroAssembler::load_method_holder(Register holder, Register method) {
3691   ldr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
3692   ldr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
3693   ldr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
3694 }
3695 
3696 void MacroAssembler::load_metadata(Register dst, Register src) {
3697   if (UseCompressedClassPointers) {
3698     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3699   } else {
3700     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3701   }
3702 }
3703 
3704 void MacroAssembler::load_klass(Register dst, Register src) {
3705   load_metadata(dst, src);
3706   if (UseCompressedClassPointers) {
3707     andr(dst, dst, oopDesc::compressed_klass_mask());
3708     decode_klass_not_null(dst);
3709   } else {
3710     ubfm(dst, dst, 0, 63 - oopDesc::storage_props_nof_bits);
3711   }
3712 }
3713 
3714 // ((OopHandle)result).resolve();
3715 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3716   // OopHandle::resolve is an indirection.
3717   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3718 }
3719 
3720 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3721   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3722   ldr(dst, Address(rmethod, Method::const_offset()));
3723   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3724   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3725   ldr(dst, Address(dst, mirror_offset));
3726   resolve_oop_handle(dst, tmp);
3727 }
3728 
3729 void MacroAssembler::load_storage_props(Register dst, Register src) {
3730   load_metadata(dst, src);
3731   if (UseCompressedClassPointers) {
3732     asrw(dst, dst, oopDesc::narrow_storage_props_shift);
3733   } else {
3734     asr(dst, dst, oopDesc::wide_storage_props_shift);
3735   }
3736 }
3737 
3738 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3739   if (UseCompressedClassPointers) {
3740     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3741     if (CompressedKlassPointers::base() == NULL) {
3742       cmp(trial_klass, tmp, LSL, CompressedKlassPointers::shift());
3743       return;
3744     } else if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
3745                && CompressedKlassPointers::shift() == 0) {
3746       // Only the bottom 32 bits matter
3747       cmpw(trial_klass, tmp);
3748       return;
3749     }
3750     decode_klass_not_null(tmp);
3751   } else {
3752     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3753   }
3754   cmp(trial_klass, tmp);
3755 }
3756 
3757 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3758   load_klass(dst, src);
3759   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3760 }
3761 
3762 void MacroAssembler::store_klass(Register dst, Register src) {
3763   // FIXME: Should this be a store release?  Concurrent GCs assume the
3764   // klass length is valid if the klass field is not null.
3765   if (UseCompressedClassPointers) {
3766     encode_klass_not_null(src);
3767     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3768   } else {
3769     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3770   }
3771 }
3772 
3773 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3774   if (UseCompressedClassPointers) {
3775     // Store to klass gap in destination
3776     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3777   }
3778 }
3779 
3780 // Algorithm must match CompressedOops::encode.
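// Rough sketch of the base+shift scheme (illustration only; the authoritative
// versions are CompressedOops::encode and CompressedOops::decode):
//   narrow = (oop == NULL) ? 0 : (uint32_t)((oop - base) >> shift);
//   oop    = (narrow == 0) ? NULL : base + ((uint64_t)narrow << shift);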
3781 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3782 #ifdef ASSERT
3783   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3784 #endif
3785   verify_oop(s, "broken oop in encode_heap_oop");
3786   if (CompressedOops::base() == NULL) {
3787     if (CompressedOops::shift() != 0) {
3788       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3789       lsr(d, s, LogMinObjAlignmentInBytes);
3790     } else {
3791       mov(d, s);
3792     }
3793   } else {
3794     subs(d, s, rheapbase);
3795     csel(d, d, zr, Assembler::HS);
3796     lsr(d, d, LogMinObjAlignmentInBytes);
3797 
3798     /*  Old algorithm: is this any worse?
3799     Label nonnull;
3800     cbnz(r, nonnull);
3801     sub(r, r, rheapbase);
3802     bind(nonnull);
3803     lsr(r, r, LogMinObjAlignmentInBytes);
3804     */
3805   }
3806 }
3807 
3808 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3809 #ifdef ASSERT
3810   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3811   if (CheckCompressedOops) {
3812     Label ok;
3813     cbnz(r, ok);
3814     stop("null oop passed to encode_heap_oop_not_null");
3815     bind(ok);
3816   }
3817 #endif
3818   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3819   if (CompressedOops::base() != NULL) {
3820     sub(r, r, rheapbase);
3821   }
3822   if (CompressedOops::shift() != 0) {
3823     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3824     lsr(r, r, LogMinObjAlignmentInBytes);
3825   }
3826 }
3827 
3828 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3829 #ifdef ASSERT
3830   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3831   if (CheckCompressedOops) {
3832     Label ok;
3833     cbnz(src, ok);
3834     stop("null oop passed to encode_heap_oop_not_null2");
3835     bind(ok);
3836   }
3837 #endif
3838   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3839 
3840   Register data = src;
3841   if (CompressedOops::base() != NULL) {
3842     sub(dst, src, rheapbase);
3843     data = dst;
3844   }
3845   if (CompressedOops::shift() != 0) {
3846     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3847     lsr(dst, data, LogMinObjAlignmentInBytes);
3848     data = dst;
3849   }
3850   if (data == src)
3851     mov(dst, src);
3852 }
3853 
3854 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3855 #ifdef ASSERT
3856   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3857 #endif
3858   if (CompressedOops::base() == NULL) {
3859     if (CompressedOops::shift() != 0 || d != s) {
3860       lsl(d, s, CompressedOops::shift());
3861     }
3862   } else {
3863     Label done;
3864     if (d != s)
3865       mov(d, s);
3866     cbz(s, done);
3867     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3868     bind(done);
3869   }
3870   verify_oop(d, "broken oop in decode_heap_oop");
3871 }
3872 
3873 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3874   assert (UseCompressedOops, "should only be used for compressed headers");
3875   assert (Universe::heap() != NULL, "java heap should be initialized");
3876   // Cannot assert, unverified entry point counts instructions (see .ad file)
3877   // vtableStubs also counts instructions in pd_code_size_limit.
3878   // Also do not verify_oop as this is called by verify_oop.
3879   if (CompressedOops::shift() != 0) {
3880     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3881     if (CompressedOops::base() != NULL) {
3882       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3883     } else {
3884       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3885     }
3886   } else {
3887     assert (CompressedOops::base() == NULL, "sanity");
3888   }
3889 }
3890 
3891 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3892   assert (UseCompressedOops, "should only be used for compressed headers");
3893   assert (Universe::heap() != NULL, "java heap should be initialized");
3894   // Cannot assert, unverified entry point counts instructions (see .ad file)
3895   // vtableStubs also counts instructions in pd_code_size_limit.
3896   // Also do not verify_oop as this is called by verify_oop.
3897   if (CompressedOops::shift() != 0) {
3898     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3899     if (CompressedOops::base() != NULL) {
3900       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3901     } else {
3902       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3903     }
3904   } else {
3905     assert (CompressedOops::base() == NULL, "sanity");
3906     if (dst != src) {
3907       mov(dst, src);
3908     }
3909   }
3910 }
3911 
3912 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3913   if (CompressedKlassPointers::base() == NULL) {
3914     if (CompressedKlassPointers::shift() != 0) {
3915       assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
3916       lsr(dst, src, LogKlassAlignmentInBytes);
3917     } else {
3918       if (dst != src) mov(dst, src);
3919     }
3920     return;
3921   }
3922 
3923   if (use_XOR_for_compressed_class_base) {
3924     if (CompressedKlassPointers::shift() != 0) {
3925       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
3926       lsr(dst, dst, LogKlassAlignmentInBytes);
3927     } else {
3928       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
3929     }
3930     return;
3931   }
3932 
3933   if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
3934       && CompressedKlassPointers::shift() == 0) {
3935     movw(dst, src);
3936     return;
3937   }
3938 
3939 #ifdef ASSERT
3940   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3941 #endif
3942 
3943   Register rbase = dst;
3944   if (dst == src) rbase = rheapbase;
3945   mov(rbase, (uint64_t)CompressedKlassPointers::base());
3946   sub(dst, src, rbase);
3947   if (CompressedKlassPointers::shift() != 0) {
3948     assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
3949     lsr(dst, dst, LogKlassAlignmentInBytes);
3950   }
3951   if (dst == src) reinit_heapbase();
3952 }
3953 
3954 void MacroAssembler::encode_klass_not_null(Register r) {
3955   encode_klass_not_null(r, r);
3956 }
3957 
3958 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3959   Register rbase = dst;
3960   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3961 
3962   if (CompressedKlassPointers::base() == NULL) {
3963     if (CompressedKlassPointers::shift() != 0) {
3964       assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
3965       lsl(dst, src, LogKlassAlignmentInBytes);
3966     } else {
3967       if (dst != src) mov(dst, src);
3968     }
3969     return;
3970   }
3971 
3972   if (use_XOR_for_compressed_class_base) {
3973     if (CompressedKlassPointers::shift() != 0) {
3974       lsl(dst, src, LogKlassAlignmentInBytes);
3975       eor(dst, dst, (uint64_t)CompressedKlassPointers::base());
3976     } else {
3977       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
3978     }
3979     return;
3980   }
3981 
3982   if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
3983       && CompressedKlassPointers::shift() == 0) {
3984     if (dst != src)
3985       movw(dst, src);
3986     movk(dst, (uint64_t)CompressedKlassPointers::base() >> 32, 32);
3987     return;
3988   }
3989 
3990   // Cannot assert, unverified entry point counts instructions (see .ad file)
3991   // vtableStubs also counts instructions in pd_code_size_limit.
3992   // Also do not verify_oop as this is called by verify_oop.
3993   if (dst == src) rbase = rheapbase;
3994   mov(rbase, (uint64_t)CompressedKlassPointers::base());
3995   if (CompressedKlassPointers::shift() != 0) {
3996     assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
3997     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
3998   } else {
3999     add(dst, rbase, src);
4000   }
4001   if (dst == src) reinit_heapbase();
4002 }
4003 
4004 void  MacroAssembler::decode_klass_not_null(Register r) {
4005   decode_klass_not_null(r, r);
4006 }
4007 
4008 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4009 #ifdef ASSERT
4010   {
4011     ThreadInVMfromUnknown tiv;
4012     assert (UseCompressedOops, "should only be used for compressed oops");
4013     assert (Universe::heap() != NULL, "java heap should be initialized");
4014     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4015     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4016   }
4017 #endif
4018   int oop_index = oop_recorder()->find_index(obj);
4019   InstructionMark im(this);
4020   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4021   code_section()->relocate(inst_mark(), rspec);
4022   movz(dst, 0xDEAD, 16);
4023   movk(dst, 0xBEEF);
4024 }
4025 
4026 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4027   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4028   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4029   int index = oop_recorder()->find_index(k);
4030   assert(! Universe::heap()->is_in(k), "should not be an oop");
4031 
4032   InstructionMark im(this);
4033   RelocationHolder rspec = metadata_Relocation::spec(index);
4034   code_section()->relocate(inst_mark(), rspec);
4035   narrowKlass nk = CompressedKlassPointers::encode(k);
4036   movz(dst, (nk >> 16), 16);
4037   movk(dst, nk & 0xffff);
4038 }
4039 
4040 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4041                                     Register dst, Address src,
4042                                     Register tmp1, Register thread_tmp) {
4043   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4044   decorators = AccessInternal::decorator_fixup(decorators);
4045   bool as_raw = (decorators & AS_RAW) != 0;
4046   if (as_raw) {
4047     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4048   } else {
4049     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4050   }
4051 }
4052 
4053 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4054                                      Address dst, Register src,
4055                                      Register tmp1, Register thread_tmp, Register tmp3) {
4056 
4057   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4058   decorators = AccessInternal::decorator_fixup(decorators);
4059   bool as_raw = (decorators & AS_RAW) != 0;
4060   if (as_raw) {
4061     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp, tmp3);
4062   } else {
4063     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp, tmp3);
4064   }
4065 }
4066 
4067 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
4068   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
4069   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
4070     decorators |= ACCESS_READ | ACCESS_WRITE;
4071   }
4072   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4073   return bs->resolve(this, decorators, obj);
4074 }
4075 
4076 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4077                                    Register thread_tmp, DecoratorSet decorators) {
4078   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4079 }
4080 
4081 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4082                                             Register thread_tmp, DecoratorSet decorators) {
4083   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4084 }
4085 
4086 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4087                                     Register thread_tmp, Register tmp3, DecoratorSet decorators) {
4088   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp, tmp3);
4089 }
4090 
4091 // Used for storing NULLs.
4092 void MacroAssembler::store_heap_oop_null(Address dst) {
4093   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg, noreg);
4094 }
4095 
4096 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4097   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4098   int index = oop_recorder()->allocate_metadata_index(obj);
4099   RelocationHolder rspec = metadata_Relocation::spec(index);
4100   return Address((address)obj, rspec);
4101 }
4102 
4103 // Move an oop into a register.  immediate is true if we want
4104 // immediate instructions, i.e. we are not going to patch this
4105 // instruction while the code is being executed by another thread.  In
4106 // that case we can use move immediates rather than the constant pool.
4107 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4108   int oop_index;
4109   if (obj == NULL) {
4110     oop_index = oop_recorder()->allocate_oop_index(obj);
4111   } else {
4112 #ifdef ASSERT
4113     {
4114       ThreadInVMfromUnknown tiv;
4115       assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4116     }
4117 #endif
4118     oop_index = oop_recorder()->find_index(obj);
4119   }
4120   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4121   if (! immediate) {
4122     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4123     ldr_constant(dst, Address(dummy, rspec));
4124   } else
4125     mov(dst, Address((address)obj, rspec));
4126 }
4127 
4128 // Move a metadata address into a register.
4129 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4130   int oop_index;
4131   if (obj == NULL) {
4132     oop_index = oop_recorder()->allocate_metadata_index(obj);
4133   } else {
4134     oop_index = oop_recorder()->find_index(obj);
4135   }
4136   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4137   mov(dst, Address((address)obj, rspec));
4138 }
4139 
4140 Address MacroAssembler::constant_oop_address(jobject obj) {
4141 #ifdef ASSERT
4142   {
4143     ThreadInVMfromUnknown tiv;
4144     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4145     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "not an oop");
4146   }
4147 #endif
4148   int oop_index = oop_recorder()->find_index(obj);
4149   return Address((address)obj, oop_Relocation::spec(oop_index));
4150 }
4151 
4152 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4153 void MacroAssembler::tlab_allocate(Register obj,
4154                                    Register var_size_in_bytes,
4155                                    int con_size_in_bytes,
4156                                    Register t1,
4157                                    Register t2,
4158                                    Label& slow_case) {
4159   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4160   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4161 }
4162 
4163 // Defines obj, preserves var_size_in_bytes
4164 void MacroAssembler::eden_allocate(Register obj,
4165                                    Register var_size_in_bytes,
4166                                    int con_size_in_bytes,
4167                                    Register t1,
4168                                    Label& slow_case) {
4169   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4170   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4171 }
4172 
4173 // Zero words; len is in bytes
4174 // Destroys all registers except addr
4175 // len must be a nonzero multiple of wordSize
4176 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4177   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4178 
4179 #ifdef ASSERT
4180   { Label L;
4181     tst(len, BytesPerWord - 1);
4182     br(Assembler::EQ, L);
4183     stop("len is not a multiple of BytesPerWord");
4184     bind(L);
4185   }
4186 #endif
4187 
4188 #ifndef PRODUCT
4189   block_comment("zero memory");
4190 #endif
4191 
4192   Label loop;
4193   Label entry;
4194 
4195 //  Algorithm:
4196 //
4197 //    scratch1 = cnt & 7;
4198 //    cnt -= scratch1;
4199 //    p += scratch1;
4200 //    switch (scratch1) {
4201 //      do {
4202 //        cnt -= 8;
4203 //          p[-8] = 0;
4204 //        case 7:
4205 //          p[-7] = 0;
4206 //        case 6:
4207 //          p[-6] = 0;
4208 //          // ...
4209 //        case 1:
4210 //          p[-1] = 0;
4211 //        case 0:
4212 //          p += 8;
4213 //      } while (cnt);
4214 //    }
4215 
4216   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4217 
4218   lsr(len, len, LogBytesPerWord);
4219   andr(rscratch1, len, unroll - 1);  // rscratch1 = cnt % unroll
4220   sub(len, len, rscratch1);          // cnt -= (cnt % unroll)
4221   // t1 always points to the end of the region we're about to zero
4222   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
4223   adr(rscratch2, entry);
4224   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4225   br(rscratch2);
4226   bind(loop);
4227   sub(len, len, unroll);
4228   for (int i = -unroll; i < 0; i++)
4229     Assembler::str(zr, Address(t1, i * wordSize));
4230   bind(entry);
4231   add(t1, t1, unroll * wordSize);
4232   cbnz(len, loop);
4233 }
4234 
4235 void MacroAssembler::verify_tlab() {
4236 #ifdef ASSERT
4237   if (UseTLAB && VerifyOops) {
4238     Label next, ok;
4239 
4240     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4241 
4242     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4243     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4244     cmp(rscratch2, rscratch1);
4245     br(Assembler::HS, next);
4246     STOP("assert(top >= start)");
4247     should_not_reach_here();
4248 
4249     bind(next);
4250     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4251     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4252     cmp(rscratch2, rscratch1);
4253     br(Assembler::HS, ok);
4254     STOP("assert(top <= end)");
4255     should_not_reach_here();
4256 
4257     bind(ok);
4258     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4259   }
4260 #endif
4261 }
4262 
4263 // Writes to stack successive pages until offset reached to check for
4264 // stack overflow + shadow pages.  This clobbers tmp.
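// Roughly equivalent to (a sketch; size and page_size are byte counts):
//   char* p = (char*)sp;
//   do { p -= page_size; size -= page_size; *(int*)p = size; } while (size > 0);
// followed by touching StackShadowPages further pages below that point.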
4265 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4266   assert_different_registers(tmp, size, rscratch1);
4267   mov(tmp, sp);
4268   // Bang stack for total size given plus shadow page size.
4269   // Bang one page at a time because large size can bang beyond yellow and
4270   // red zones.
4271   Label loop;
4272   mov(rscratch1, os::vm_page_size());
4273   bind(loop);
4274   lea(tmp, Address(tmp, -os::vm_page_size()));
4275   subsw(size, size, rscratch1);
4276   str(size, Address(tmp));
4277   br(Assembler::GT, loop);
4278 
4279   // Bang down shadow pages too.
4280   // At this point, (tmp-0) is the last address touched, so don't
4281   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4282   // was post-decremented.)  Skip this address by starting at i=1, and
4283   // touch a few more pages below.  N.B.  It is important to touch all
4284   // the way down to and including i=StackShadowPages.
4285   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
4286     // this could be any sized move but this can be a debugging crumb
4287     // so the bigger the better.
4288     lea(tmp, Address(tmp, -os::vm_page_size()));
4289     str(size, Address(tmp));
4290   }
4291 }
4292 
4293 
4294 // Move the address of the polling page into dest.
4295 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4296   if (SafepointMechanism::uses_thread_local_poll()) {
4297     ldr(dest, Address(rthread, Thread::polling_page_offset()));
4298   } else {
4299     unsigned long off;
4300     adrp(dest, Address(page, rtype), off);
4301     assert(off == 0, "polling page must be page aligned");
4302   }
4303 }
4304 
4305 // Move the address of the polling page into r, then read the polling
4306 // page.
4307 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4308   get_polling_page(r, page, rtype);
4309   return read_polling_page(r, rtype);
4310 }
4311 
4312 // Read the polling page.  The address of the polling page must
4313 // already be in r.
4314 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4315   InstructionMark im(this);
4316   code_section()->relocate(inst_mark(), rtype);
4317   ldrw(zr, Address(r, 0));
4318   return inst_mark();
4319 }
4320 
4321 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4322   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4323   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4324   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4325   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4326   long offset_low = dest_page - low_page;
4327   long offset_high = dest_page - high_page;
4328 
4329   assert(is_valid_AArch64_address(dest.target()), "bad address");
4330   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4331 
4332   InstructionMark im(this);
4333   code_section()->relocate(inst_mark(), dest.rspec());
4334   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4335   // the code cache so that if it is relocated we know it will still reach
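       // (For reference: adrp's signed 21-bit immediate covers +/-2^20 pages of
       //  4K each, i.e. roughly +/-4GB; low_page/high_page above are in the same
       //  4K units, so the +/-(1 << 20) check below keeps the single-adrp form
       //  valid from either end of the code cache.)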
4336   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4337     _adrp(reg1, dest.target());
4338   } else {
4339     unsigned long target = (unsigned long)dest.target();
4340     unsigned long adrp_target
4341       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4342 
4343     _adrp(reg1, (address)adrp_target);
4344     movk(reg1, target >> 32, 32);
4345   }
4346   byte_offset = (unsigned long)dest.target() & 0xfff;
4347 }
4348 
4349 void MacroAssembler::load_byte_map_base(Register reg) {
4350   CardTable::CardValue* byte_map_base =
4351     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4352 
4353   if (is_valid_AArch64_address((address)byte_map_base)) {
4354     // Strictly speaking the byte_map_base isn't an address at all,
4355     // and it might even be negative.
4356     unsigned long offset;
4357     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4358     // We expect offset to be zero with most collectors.
4359     if (offset != 0) {
4360       add(reg, reg, offset);
4361     }
4362   } else {
4363     mov(reg, (uint64_t)byte_map_base);
4364   }
4365 }
4366 
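     // (A note on the thresholds in build_frame()/remove_frame() below: the
     // (1 << 9) and (1 << 12) limits are assumed to track the reach of the
     // stp/ldp scaled signed offset and of the add/sub 12-bit immediate, so
     // that each branch stays within a single instruction's encoding range.)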
4367 void MacroAssembler::build_frame(int framesize) {
4368   assert(framesize > 0, "framesize must be > 0");
4369   if (framesize < ((1 << 9) + 2 * wordSize)) {
4370     sub(sp, sp, framesize);
4371     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4372     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4373   } else {
4374     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4375     if (PreserveFramePointer) mov(rfp, sp);
4376     if (framesize < ((1 << 12) + 2 * wordSize))
4377       sub(sp, sp, framesize - 2 * wordSize);
4378     else {
4379       mov(rscratch1, framesize - 2 * wordSize);
4380       sub(sp, sp, rscratch1);
4381     }
4382   }
4383 }
4384 
4385 void MacroAssembler::remove_frame(int framesize) {
4386   assert(framesize > 0, "framesize must be > 0");
4387   if (framesize < ((1 << 9) + 2 * wordSize)) {
4388     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4389     add(sp, sp, framesize);
4390   } else {
4391     if (framesize < ((1 << 12) + 2 * wordSize))
4392       add(sp, sp, framesize - 2 * wordSize);
4393     else {
4394       mov(rscratch1, framesize - 2 * wordSize);
4395       add(sp, sp, rscratch1);
4396     }
4397     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4398   }
4399 }
4400 
4401 #ifdef COMPILER2
4402 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4403 
4404 // Search for str1 in str2 and return index or -1
4405 void MacroAssembler::string_indexof(Register str2, Register str1,
4406                                     Register cnt2, Register cnt1,
4407                                     Register tmp1, Register tmp2,
4408                                     Register tmp3, Register tmp4,
4409                                     Register tmp5, Register tmp6,
4410                                     int icnt1, Register result, int ae) {
4411   // NOTE: tmp5, tmp6 can be zr depending on specific method version
4412   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4413 
4414   Register ch1 = rscratch1;
4415   Register ch2 = rscratch2;
4416   Register cnt1tmp = tmp1;
4417   Register cnt2tmp = tmp2;
4418   Register cnt1_neg = cnt1;
4419   Register cnt2_neg = cnt2;
4420   Register result_tmp = tmp4;
4421 
4422   bool isL = ae == StrIntrinsicNode::LL;
4423 
4424   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4425   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4426   int str1_chr_shift = str1_isL ? 0:1;
4427   int str2_chr_shift = str2_isL ? 0:1;
4428   int str1_chr_size = str1_isL ? 1:2;
4429   int str2_chr_size = str2_isL ? 1:2;
4430   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4431                                       (chr_insn)&MacroAssembler::ldrh;
4432   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4433                                       (chr_insn)&MacroAssembler::ldrh;
4434   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4435   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4436 
4437   // Note, inline_string_indexOf() generates checks:
4438   // if (substr.count > string.count) return -1;
4439   // if (substr.count == 0) return 0;
4440 
4441   // We have two strings, a source string in str2, cnt2 and a pattern string
4442 // in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.
4443 
4444 // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
4445   // With a small pattern and source we use linear scan.
4446 
4447   if (icnt1 == -1) {
4448     sub(result_tmp, cnt2, cnt1);
4449     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4450     br(LT, LINEARSEARCH);
4451     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
4452     subs(zr, cnt1, 256);
4453     lsr(tmp1, cnt2, 2);
4454     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
4455     br(GE, LINEARSTUB);
4456   }
4457 
4458 // The Boyer-Moore algorithm is based on the description here:-
4459 //
4460 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4461 //
4462 // This describes an algorithm with 2 shift rules: the 'Bad Character' rule
4463 // and the 'Good Suffix' rule.
4464 //
4465 // These rules are essentially heuristics for how far we can shift the
4466 // pattern along the search string.
4467 //
4468 // The implementation here uses the 'Bad Character' rule only because of the
4469 // complexity of initialisation for the 'Good Suffix' rule.
4470 //
4471 // This is also known as the Boyer-Moore-Horspool algorithm:-
4472 //
4473 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4474 //
4475 // This particular implementation has a few java-specific optimizations.
4476 //
4477 // #define ASIZE 256
4478 //
4479 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4480 //       int i, j;
4481 //       unsigned c;
4482 //       unsigned char bc[ASIZE];
4483 //
4484 //       /* Preprocessing */
4485 //       for (i = 0; i < ASIZE; ++i)
4486 //          bc[i] = m;
4487 //       for (i = 0; i < m - 1; ) {
4488 //          c = x[i];
4489 //          ++i;
4490 //          // c < 256 for Latin1 string, so, no need for branch
4491 //          #ifdef PATTERN_STRING_IS_LATIN1
4492 //          bc[c] = m - i;
4493 //          #else
4494 //          if (c < ASIZE) bc[c] = m - i;
4495 //          #endif
4496 //       }
4497 //
4498 //       /* Searching */
4499 //       j = 0;
4500 //       while (j <= n - m) {
4501 //          c = y[j+m-1];
4502 //          if (x[m-1] == c)
4503 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
4504 //          if (i < 0) return j;
4505 //          // c < 256 for Latin1 string, so, no need for branch
4506 //          #ifdef SOURCE_STRING_IS_LATIN1
4507 //          // LL case: (c< 256) always true. Remove branch
4508 //          j += bc[y[j+m-1]];
4509 //          #endif
4510 //          #ifndef PATTERN_STRING_IS_UTF
4511 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
4512 //          if (c < ASIZE)
4513 //            j += bc[y[j+m-1]];
4514 //          else
4515 //            j += 1
4516 //          #endif
4517 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
4518 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
4519 //          if (c < ASIZE)
4520 //            j += bc[y[j+m-1]];
4521 //          else
4522 //            j += m
4523 //          #endif
4524 //       }
4525 //    }
4526 
4527   if (icnt1 == -1) {
4528     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4529         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4530     Register cnt1end = tmp2;
4531     Register str2end = cnt2;
4532     Register skipch = tmp2;
4533 
4534     // str1 length is >= 8, so we can read at least 1 register for cases when
4535     // UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a
4536     // register for the UL case. We'll re-read the last character in the inner
4537     // pre-loop code to keep a single outer pre-loop load
4538     const int firstStep = isL ? 7 : 3;
4539 
4540     const int ASIZE = 256;
4541     const int STORED_BYTES = 32; // amount of bytes stored per instruction
4542     sub(sp, sp, ASIZE);
4543     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4544     mov(ch1, sp);
4545     BIND(BM_INIT_LOOP);
4546       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4547       subs(tmp5, tmp5, 1);
4548       br(GT, BM_INIT_LOOP);
4549 
4550       sub(cnt1tmp, cnt1, 1);
4551       mov(tmp5, str2);
4552       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4553       sub(ch2, cnt1, 1);
4554       mov(tmp3, str1);
4555     BIND(BCLOOP);
4556       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4557       if (!str1_isL) {
4558         subs(zr, ch1, ASIZE);
4559         br(HS, BCSKIP);
4560       }
4561       strb(ch2, Address(sp, ch1));
4562     BIND(BCSKIP);
4563       subs(ch2, ch2, 1);
4564       br(GT, BCLOOP);
4565 
4566       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4567       if (str1_isL == str2_isL) {
4568         // load last 8 bytes (8LL/4UU symbols)
4569         ldr(tmp6, Address(tmp6, -wordSize));
4570       } else {
4571         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
4572         // convert Latin1 to UTF. We'll have to wait until load completed, but
4573         // it's still faster than per-character loads+checks
4574         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4575         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4576         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4577         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4578         orr(ch2, ch1, ch2, LSL, 16);
4579         orr(tmp6, tmp6, tmp3, LSL, 48);
4580         orr(tmp6, tmp6, ch2, LSL, 16);
4581       }
4582     BIND(BMLOOPSTR2);
4583       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4584       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4585       if (str1_isL == str2_isL) {
4586         // re-init tmp3. It's free because it's executed in parallel with the
4587         // load above. The alternative is to initialize it before the loop, but
4588         // it'll affect performance on in-order systems with 2 or more ld/st pipelines
4589         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4590       }
4591       if (!isL) { // UU/UL case
4592         lsl(ch2, cnt1tmp, 1); // offset in bytes
4593       }
4594       cmp(tmp3, skipch);
4595       br(NE, BMSKIP);
4596       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4597       mov(ch1, tmp6);
4598       if (isL) {
4599         b(BMLOOPSTR1_AFTER_LOAD);
4600       } else {
4601         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4602         b(BMLOOPSTR1_CMP);
4603       }
4604     BIND(BMLOOPSTR1);
4605       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4606       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4607     BIND(BMLOOPSTR1_AFTER_LOAD);
4608       subs(cnt1tmp, cnt1tmp, 1);
4609       br(LT, BMLOOPSTR1_LASTCMP);
4610     BIND(BMLOOPSTR1_CMP);
4611       cmp(ch1, ch2);
4612       br(EQ, BMLOOPSTR1);
4613     BIND(BMSKIP);
4614       if (!isL) {
4615       // if we've met a UTF symbol while searching a Latin1 pattern, then we can
4616         // skip cnt1 symbols
4617         if (str1_isL != str2_isL) {
4618           mov(result_tmp, cnt1);
4619         } else {
4620           mov(result_tmp, 1);
4621         }
4622         subs(zr, skipch, ASIZE);
4623         br(HS, BMADV);
4624       }
4625       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4626     BIND(BMADV);
4627       sub(cnt1tmp, cnt1, 1);
4628       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4629       cmp(str2, str2end);
4630       br(LE, BMLOOPSTR2);
4631       add(sp, sp, ASIZE);
4632       b(NOMATCH);
4633     BIND(BMLOOPSTR1_LASTCMP);
4634       cmp(ch1, ch2);
4635       br(NE, BMSKIP);
4636     BIND(BMMATCH);
4637       sub(result, str2, tmp5);
4638       if (!str2_isL) lsr(result, result, 1);
4639       add(sp, sp, ASIZE);
4640       b(DONE);
4641 
4642     BIND(LINEARSTUB);
4643     cmp(cnt1, (u1)16); // small patterns still should be handled by simple algorithm
4644     br(LT, LINEAR_MEDIUM);
4645     mov(result, zr);
4646     RuntimeAddress stub = NULL;
4647     if (isL) {
4648       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4649       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4650     } else if (str1_isL) {
4651       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
4652        assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4653     } else {
4654       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4655       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4656     }
4657     trampoline_call(stub);
4658     b(DONE);
4659   }
4660 
4661   BIND(LINEARSEARCH);
4662   {
4663     Label DO1, DO2, DO3;
4664 
4665     Register str2tmp = tmp2;
4666     Register first = tmp3;
4667 
4668     if (icnt1 == -1)
4669     {
4670         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4671 
4672         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
4673         br(LT, DOSHORT);
4674       BIND(LINEAR_MEDIUM);
4675         (this->*str1_load_1chr)(first, Address(str1));
4676         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4677         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4678         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4679         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4680 
4681       BIND(FIRST_LOOP);
4682         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4683         cmp(first, ch2);
4684         br(EQ, STR1_LOOP);
4685       BIND(STR2_NEXT);
4686         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4687         br(LE, FIRST_LOOP);
4688         b(NOMATCH);
4689 
4690       BIND(STR1_LOOP);
4691         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4692         add(cnt2tmp, cnt2_neg, str2_chr_size);
4693         br(GE, MATCH);
4694 
4695       BIND(STR1_NEXT);
4696         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4697         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4698         cmp(ch1, ch2);
4699         br(NE, STR2_NEXT);
4700         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4701         add(cnt2tmp, cnt2tmp, str2_chr_size);
4702         br(LT, STR1_NEXT);
4703         b(MATCH);
4704 
4705       BIND(DOSHORT);
4706       if (str1_isL == str2_isL) {
4707         cmp(cnt1, (u1)2);
4708         br(LT, DO1);
4709         br(GT, DO3);
4710       }
4711     }
4712 
4713     if (icnt1 == 4) {
4714       Label CH1_LOOP;
4715 
4716         (this->*load_4chr)(ch1, str1);
4717         sub(result_tmp, cnt2, 4);
4718         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4719         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4720 
4721       BIND(CH1_LOOP);
4722         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4723         cmp(ch1, ch2);
4724         br(EQ, MATCH);
4725         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4726         br(LE, CH1_LOOP);
4727         b(NOMATCH);
4728       }
4729 
4730     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4731       Label CH1_LOOP;
4732 
4733       BIND(DO2);
4734         (this->*load_2chr)(ch1, str1);
4735         if (icnt1 == 2) {
4736           sub(result_tmp, cnt2, 2);
4737         }
4738         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4739         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4740       BIND(CH1_LOOP);
4741         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4742         cmp(ch1, ch2);
4743         br(EQ, MATCH);
4744         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4745         br(LE, CH1_LOOP);
4746         b(NOMATCH);
4747     }
4748 
4749     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4750       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4751 
4752       BIND(DO3);
4753         (this->*load_2chr)(first, str1);
4754         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4755         if (icnt1 == 3) {
4756           sub(result_tmp, cnt2, 3);
4757         }
4758         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4759         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4760       BIND(FIRST_LOOP);
4761         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4762         cmpw(first, ch2);
4763         br(EQ, STR1_LOOP);
4764       BIND(STR2_NEXT);
4765         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4766         br(LE, FIRST_LOOP);
4767         b(NOMATCH);
4768 
4769       BIND(STR1_LOOP);
4770         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4771         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4772         cmp(ch1, ch2);
4773         br(NE, STR2_NEXT);
4774         b(MATCH);
4775     }
4776 
4777     if (icnt1 == -1 || icnt1 == 1) {
4778       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4779 
4780       BIND(DO1);
4781         (this->*str1_load_1chr)(ch1, str1);
4782         cmp(cnt2, (u1)8);
4783         br(LT, DO1_SHORT);
4784 
4785         sub(result_tmp, cnt2, 8/str2_chr_size);
4786         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4787         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4788         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4789 
4790         if (str2_isL) {
4791           orr(ch1, ch1, ch1, LSL, 8);
4792         }
4793         orr(ch1, ch1, ch1, LSL, 16);
4794         orr(ch1, ch1, ch1, LSL, 32);
4795       BIND(CH1_LOOP);
4796         ldr(ch2, Address(str2, cnt2_neg));
4797         eor(ch2, ch1, ch2);
4798         sub(tmp1, ch2, tmp3);
4799         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4800         bics(tmp1, tmp1, tmp2);
4801         br(NE, HAS_ZERO);
4802         adds(cnt2_neg, cnt2_neg, 8);
4803         br(LT, CH1_LOOP);
4804 
4805         cmp(cnt2_neg, (u1)8);
4806         mov(cnt2_neg, 0);
4807         br(LT, CH1_LOOP);
4808         b(NOMATCH);
4809 
4810       BIND(HAS_ZERO);
4811         rev(tmp1, tmp1);
4812         clz(tmp1, tmp1);
4813         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4814         b(MATCH);
4815 
4816       BIND(DO1_SHORT);
4817         mov(result_tmp, cnt2);
4818         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4819         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4820       BIND(DO1_LOOP);
4821         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4822         cmpw(ch1, ch2);
4823         br(EQ, MATCH);
4824         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4825         br(LT, DO1_LOOP);
4826     }
4827   }
4828   BIND(NOMATCH);
4829     mov(result, -1);
4830     b(DONE);
4831   BIND(MATCH);
4832     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4833   BIND(DONE);
4834 }
4835 
4836 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4837 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4838 
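     // Find the first occurrence of a char in a UTF-16 string.  The 8-byte
     // main loop below relies on the classic SWAR "has-zero" test applied to
     // 16-bit lanes; a rough C sketch of the per-word check (illustration
     // only, with 'pattern' being ch replicated into every 16-bit lane):
     //
     //   uint64_t x = word ^ pattern;   // lanes equal to ch become zero
     //   uint64_t t = (x - 0x0001000100010001UL) & ~(x | 0x7fff7fff7fff7fffUL);
     //   // t != 0 iff some 16-bit lane of x is zero, i.e. ch was found there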
4839 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4840                                          Register ch, Register result,
4841                                          Register tmp1, Register tmp2, Register tmp3)
4842 {
4843   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4844   Register cnt1_neg = cnt1;
4845   Register ch1 = rscratch1;
4846   Register result_tmp = rscratch2;
4847 
4848   cmp(cnt1, (u1)4);
4849   br(LT, DO1_SHORT);
4850 
4851   orr(ch, ch, ch, LSL, 16);
4852   orr(ch, ch, ch, LSL, 32);
4853 
4854   sub(cnt1, cnt1, 4);
4855   mov(result_tmp, cnt1);
4856   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4857   sub(cnt1_neg, zr, cnt1, LSL, 1);
4858 
4859   mov(tmp3, 0x0001000100010001);
4860 
4861   BIND(CH1_LOOP);
4862     ldr(ch1, Address(str1, cnt1_neg));
4863     eor(ch1, ch, ch1);
4864     sub(tmp1, ch1, tmp3);
4865     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4866     bics(tmp1, tmp1, tmp2);
4867     br(NE, HAS_ZERO);
4868     adds(cnt1_neg, cnt1_neg, 8);
4869     br(LT, CH1_LOOP);
4870 
4871     cmp(cnt1_neg, (u1)8);
4872     mov(cnt1_neg, 0);
4873     br(LT, CH1_LOOP);
4874     b(NOMATCH);
4875 
4876   BIND(HAS_ZERO);
4877     rev(tmp1, tmp1);
4878     clz(tmp1, tmp1);
4879     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4880     b(MATCH);
4881 
4882   BIND(DO1_SHORT);
4883     mov(result_tmp, cnt1);
4884     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4885     sub(cnt1_neg, zr, cnt1, LSL, 1);
4886   BIND(DO1_LOOP);
4887     ldrh(ch1, Address(str1, cnt1_neg));
4888     cmpw(ch, ch1);
4889     br(EQ, MATCH);
4890     adds(cnt1_neg, cnt1_neg, 2);
4891     br(LT, DO1_LOOP);
4892   BIND(NOMATCH);
4893     mov(result, -1);
4894     b(DONE);
4895   BIND(MATCH);
4896     add(result, result_tmp, cnt1_neg, ASR, 1);
4897   BIND(DONE);
4898 }
4899 
4900 // Compare strings.
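     // The assumed contract (mirroring String.compareTo): result is the
     // difference of the first pair of characters that differ, or the
     // difference of the lengths (in characters) when one string is a
     // prefix of the other.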
4901 void MacroAssembler::string_compare(Register str1, Register str2,
4902     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4903     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4904   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4905       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4906       SHORT_LOOP_START, TAIL_CHECK;
4907 
4908   const u1 STUB_THRESHOLD = 64 + 8;
4909   bool isLL = ae == StrIntrinsicNode::LL;
4910   bool isLU = ae == StrIntrinsicNode::LU;
4911   bool isUL = ae == StrIntrinsicNode::UL;
4912 
4913   bool str1_isL = isLL || isLU;
4914   bool str2_isL = isLL || isUL;
4915 
4916   int str1_chr_shift = str1_isL ? 0 : 1;
4917   int str2_chr_shift = str2_isL ? 0 : 1;
4918   int str1_chr_size = str1_isL ? 1 : 2;
4919   int str2_chr_size = str2_isL ? 1 : 2;
4920   int minCharsInWord = isLL ? wordSize : wordSize/2;
4921 
4922   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4923   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4924                                       (chr_insn)&MacroAssembler::ldrh;
4925   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4926                                       (chr_insn)&MacroAssembler::ldrh;
4927   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4928                             (uxt_insn)&MacroAssembler::uxthw;
4929 
4930   BLOCK_COMMENT("string_compare {");
4931 
4932   // Bizarrely, the counts are passed in bytes, regardless of whether they
4933   // are L or U strings, however the result is always in characters.
4934   if (!str1_isL) asrw(cnt1, cnt1, 1);
4935   if (!str2_isL) asrw(cnt2, cnt2, 1);
4936 
4937   // Compute the minimum of the string lengths and save the difference.
4938   subsw(result, cnt1, cnt2);
4939   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4940 
4941   // A very short string
4942   cmpw(cnt2, minCharsInWord);
4943   br(Assembler::LE, SHORT_STRING);
4944 
4945   // Compare longwords
4946   // load first parts of strings and finish initialization while loading
4947   {
4948     if (str1_isL == str2_isL) { // LL or UU
4949       ldr(tmp1, Address(str1));
4950       cmp(str1, str2);
4951       br(Assembler::EQ, DONE);
4952       ldr(tmp2, Address(str2));
4953       cmp(cnt2, STUB_THRESHOLD);
4954       br(GE, STUB);
4955       subsw(cnt2, cnt2, minCharsInWord);
4956       br(EQ, TAIL_CHECK);
4957       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4958       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4959       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4960     } else if (isLU) {
4961       ldrs(vtmp, Address(str1));
4962       cmp(str1, str2);
4963       br(Assembler::EQ, DONE);
4964       ldr(tmp2, Address(str2));
4965       cmp(cnt2, STUB_THRESHOLD);
4966       br(GE, STUB);
4967       subw(cnt2, cnt2, 4);
4968       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4969       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4970       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4971       zip1(vtmp, T8B, vtmp, vtmpZ);
4972       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4973       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4974       add(cnt1, cnt1, 4);
4975       fmovd(tmp1, vtmp);
4976     } else { // UL case
4977       ldr(tmp1, Address(str1));
4978       cmp(str1, str2);
4979       br(Assembler::EQ, DONE);
4980       ldrs(vtmp, Address(str2));
4981       cmp(cnt2, STUB_THRESHOLD);
4982       br(GE, STUB);
4983       subw(cnt2, cnt2, 4);
4984       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4985       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4986       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4987       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4988       zip1(vtmp, T8B, vtmp, vtmpZ);
4989       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4990       add(cnt1, cnt1, 8);
4991       fmovd(tmp2, vtmp);
4992     }
4993     adds(cnt2, cnt2, isUL ? 4 : 8);
4994     br(GE, TAIL);
4995     eor(rscratch2, tmp1, tmp2);
4996     cbnz(rscratch2, DIFFERENCE);
4997     // main loop
4998     bind(NEXT_WORD);
4999     if (str1_isL == str2_isL) {
5000       ldr(tmp1, Address(str1, cnt2));
5001       ldr(tmp2, Address(str2, cnt2));
5002       adds(cnt2, cnt2, 8);
5003     } else if (isLU) {
5004       ldrs(vtmp, Address(str1, cnt1));
5005       ldr(tmp2, Address(str2, cnt2));
5006       add(cnt1, cnt1, 4);
5007       zip1(vtmp, T8B, vtmp, vtmpZ);
5008       fmovd(tmp1, vtmp);
5009       adds(cnt2, cnt2, 8);
5010     } else { // UL
5011       ldrs(vtmp, Address(str2, cnt2));
5012       ldr(tmp1, Address(str1, cnt1));
5013       zip1(vtmp, T8B, vtmp, vtmpZ);
5014       add(cnt1, cnt1, 8);
5015       fmovd(tmp2, vtmp);
5016       adds(cnt2, cnt2, 4);
5017     }
5018     br(GE, TAIL);
5019 
5020     eor(rscratch2, tmp1, tmp2);
5021     cbz(rscratch2, NEXT_WORD);
5022     b(DIFFERENCE);
5023     bind(TAIL);
5024     eor(rscratch2, tmp1, tmp2);
5025     cbnz(rscratch2, DIFFERENCE);
5026     // Last longword.  In the case where length == 4 we compare the
5027     // same longword twice, but that's still faster than another
5028     // conditional branch.
5029     if (str1_isL == str2_isL) {
5030       ldr(tmp1, Address(str1));
5031       ldr(tmp2, Address(str2));
5032     } else if (isLU) {
5033       ldrs(vtmp, Address(str1));
5034       ldr(tmp2, Address(str2));
5035       zip1(vtmp, T8B, vtmp, vtmpZ);
5036       fmovd(tmp1, vtmp);
5037     } else { // UL
5038       ldrs(vtmp, Address(str2));
5039       ldr(tmp1, Address(str1));
5040       zip1(vtmp, T8B, vtmp, vtmpZ);
5041       fmovd(tmp2, vtmp);
5042     }
5043     bind(TAIL_CHECK);
5044     eor(rscratch2, tmp1, tmp2);
5045     cbz(rscratch2, DONE);
5046 
5047     // Find the first different characters in the longwords and
5048     // compute their difference.
5049     bind(DIFFERENCE);
5050     rev(rscratch2, rscratch2);
5051     clz(rscratch2, rscratch2);
5052     andr(rscratch2, rscratch2, isLL ? -8 : -16);
5053     lsrv(tmp1, tmp1, rscratch2);
5054     (this->*ext_chr)(tmp1, tmp1);
5055     lsrv(tmp2, tmp2, rscratch2);
5056     (this->*ext_chr)(tmp2, tmp2);
5057     subw(result, tmp1, tmp2);
5058     b(DONE);
5059   }
5060 
5061   bind(STUB);
5062     RuntimeAddress stub = NULL;
5063     switch(ae) {
5064       case StrIntrinsicNode::LL:
5065         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
5066         break;
5067       case StrIntrinsicNode::UU:
5068         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
5069         break;
5070       case StrIntrinsicNode::LU:
5071         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
5072         break;
5073       case StrIntrinsicNode::UL:
5074         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
5075         break;
5076       default:
5077         ShouldNotReachHere();
5078      }
5079     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
5080     trampoline_call(stub);
5081     b(DONE);
5082 
5083   bind(SHORT_STRING);
5084   // Is the minimum length zero?
5085   cbz(cnt2, DONE);
5086   // arrange the code to do most branches while loading, and to load the next
5087   // characters while comparing the previous ones
5088   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5089   subs(cnt2, cnt2, 1);
5090   br(EQ, SHORT_LAST_INIT);
5091   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5092   b(SHORT_LOOP_START);
5093   bind(SHORT_LOOP);
5094   subs(cnt2, cnt2, 1);
5095   br(EQ, SHORT_LAST);
5096   bind(SHORT_LOOP_START);
5097   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
5098   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
5099   cmp(tmp1, cnt1);
5100   br(NE, SHORT_LOOP_TAIL);
5101   subs(cnt2, cnt2, 1);
5102   br(EQ, SHORT_LAST2);
5103   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5104   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5105   cmp(tmp2, rscratch1);
5106   br(EQ, SHORT_LOOP);
5107   sub(result, tmp2, rscratch1);
5108   b(DONE);
5109   bind(SHORT_LOOP_TAIL);
5110   sub(result, tmp1, cnt1);
5111   b(DONE);
5112   bind(SHORT_LAST2);
5113   cmp(tmp2, rscratch1);
5114   br(EQ, DONE);
5115   sub(result, tmp2, rscratch1);
5116 
5117   b(DONE);
5118   bind(SHORT_LAST_INIT);
5119   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5120   bind(SHORT_LAST);
5121   cmp(tmp1, cnt1);
5122   br(EQ, DONE);
5123   sub(result, tmp1, cnt1);
5124 
5125   bind(DONE);
5126 
5127   BLOCK_COMMENT("} string_compare");
5128 }
5129 #endif // COMPILER2
5130 
5131 // This method checks if the provided byte array contains a byte with the highest bit set.
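     // A rough C sketch of the assumed contract (illustration only):
     //
     //   bool has_negatives(jbyte *ary1, int len) {
     //     for (int i = 0; i < len; i++)
     //       if (ary1[i] & 0x80) return true;   // i.e. ary1[i] < 0
     //     return false;
     //   }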
5132 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
5133     // Simple and most common case of aligned small array which is not at the
5134     // end of memory page is placed here. All other cases are in stub.
5135     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
5136     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5137     assert_different_registers(ary1, len, result);
5138 
5139     cmpw(len, 0);
5140     br(LE, SET_RESULT);
5141     cmpw(len, 4 * wordSize);
5142     br(GE, STUB_LONG); // size > 32 then go to stub
5143 
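         // An informal reading of the page check below: shifting the address
         // left by (64 - log2(page_size)) moves the in-page offset into the
         // top bits, so the addition sets the carry flag exactly when
         // offset + 32 bytes reaches the page end; in that case the possibly
         // page-crossing wide reads are left to the stub.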
5144     int shift = 64 - exact_log2(os::vm_page_size());
5145     lsl(rscratch1, ary1, shift);
5146     mov(rscratch2, (size_t)(4 * wordSize) << shift);
5147     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
5148     br(CS, STUB); // at the end of page then go to stub
5149     subs(len, len, wordSize);
5150     br(LT, END);
5151 
5152   BIND(LOOP);
5153     ldr(rscratch1, Address(post(ary1, wordSize)));
5154     tst(rscratch1, UPPER_BIT_MASK);
5155     br(NE, SET_RESULT);
5156     subs(len, len, wordSize);
5157     br(GE, LOOP);
5158     cmpw(len, -wordSize);
5159     br(EQ, SET_RESULT);
5160 
5161   BIND(END);
5162     ldr(result, Address(ary1));
5163     sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5164     lslv(result, result, len);
5165     tst(result, UPPER_BIT_MASK);
5166     b(SET_RESULT);
5167 
5168   BIND(STUB);
5169     RuntimeAddress has_neg =  RuntimeAddress(StubRoutines::aarch64::has_negatives());
5170     assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5171     trampoline_call(has_neg);
5172     b(DONE);
5173 
5174   BIND(STUB_LONG);
5175     RuntimeAddress has_neg_long =  RuntimeAddress(
5176             StubRoutines::aarch64::has_negatives_long());
5177     assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
5178     trampoline_call(has_neg_long);
5179     b(DONE);
5180 
5181   BIND(SET_RESULT);
5182     cset(result, NE); // set true or false
5183 
5184   BIND(DONE);
5185 }
5186 
5187 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5188                                    Register tmp4, Register tmp5, Register result,
5189                                    Register cnt1, int elem_size) {
5190   Label DONE, SAME;
5191   Register tmp1 = rscratch1;
5192   Register tmp2 = rscratch2;
5193   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5194   int elem_per_word = wordSize/elem_size;
5195   int log_elem_size = exact_log2(elem_size);
5196   int length_offset = arrayOopDesc::length_offset_in_bytes();
5197   int base_offset
5198     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5199   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5200 
5201   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5202   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5203 
5204 #ifndef PRODUCT
5205   {
5206     const char kind = (elem_size == 2) ? 'U' : 'L';
5207     char comment[64];
5208     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5209     BLOCK_COMMENT(comment);
5210   }
5211 #endif
5212 
5213   // if (a1 == a2)
5214   //     return true;
5215   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5216   br(EQ, SAME);
5217 
5218   if (UseSimpleArrayEquals) {
5219     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5220     // if (a1 == null || a2 == null)
5221     //     return false;
5222     // a1 & a2 == 0 means (some-pointer is null) or
5223     // (very-rare-or-even-probably-impossible-pointer-values)
5224     // so, we can save one branch in most cases
5225     tst(a1, a2);
5226     mov(result, false);
5227     br(EQ, A_MIGHT_BE_NULL);
5228     // if (a1.length != a2.length)
5229     //      return false;
5230     bind(A_IS_NOT_NULL);
5231     ldrw(cnt1, Address(a1, length_offset));
5232     ldrw(cnt2, Address(a2, length_offset));
5233     eorw(tmp5, cnt1, cnt2);
5234     cbnzw(tmp5, DONE);
5235     lea(a1, Address(a1, base_offset));
5236     lea(a2, Address(a2, base_offset));
5237     // Check for short strings, i.e. smaller than wordSize.
5238     subs(cnt1, cnt1, elem_per_word);
5239     br(Assembler::LT, SHORT);
5240     // Main 8 byte comparison loop.
5241     bind(NEXT_WORD); {
5242       ldr(tmp1, Address(post(a1, wordSize)));
5243       ldr(tmp2, Address(post(a2, wordSize)));
5244       subs(cnt1, cnt1, elem_per_word);
5245       eor(tmp5, tmp1, tmp2);
5246       cbnz(tmp5, DONE);
5247     } br(GT, NEXT_WORD);
5248     // Last longword.  In the case where length == 4 we compare the
5249     // same longword twice, but that's still faster than another
5250     // conditional branch.
5251     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5252     // length == 4.
5253     if (log_elem_size > 0)
5254       lsl(cnt1, cnt1, log_elem_size);
5255     ldr(tmp3, Address(a1, cnt1));
5256     ldr(tmp4, Address(a2, cnt1));
5257     eor(tmp5, tmp3, tmp4);
5258     cbnz(tmp5, DONE);
5259     b(SAME);
5260     bind(A_MIGHT_BE_NULL);
5261     // in case both a1 and a2 are not-null, proceed with loads
5262     cbz(a1, DONE);
5263     cbz(a2, DONE);
5264     b(A_IS_NOT_NULL);
5265     bind(SHORT);
5266 
5267     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5268     {
5269       ldrw(tmp1, Address(post(a1, 4)));
5270       ldrw(tmp2, Address(post(a2, 4)));
5271       eorw(tmp5, tmp1, tmp2);
5272       cbnzw(tmp5, DONE);
5273     }
5274     bind(TAIL03);
5275     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5276     {
5277       ldrh(tmp3, Address(post(a1, 2)));
5278       ldrh(tmp4, Address(post(a2, 2)));
5279       eorw(tmp5, tmp3, tmp4);
5280       cbnzw(tmp5, DONE);
5281     }
5282     bind(TAIL01);
5283     if (elem_size == 1) { // Only needed when comparing byte arrays.
5284       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5285       {
5286         ldrb(tmp1, a1);
5287         ldrb(tmp2, a2);
5288         eorw(tmp5, tmp1, tmp2);
5289         cbnzw(tmp5, DONE);
5290       }
5291     }
5292   } else {
5293     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
5294         CSET_EQ, LAST_CHECK;
5295     mov(result, false);
5296     cbz(a1, DONE);
5297     ldrw(cnt1, Address(a1, length_offset));
5298     cbz(a2, DONE);
5299     ldrw(cnt2, Address(a2, length_offset));
5300     // on most CPUs a2 is still "locked" (surprisingly) in the ldrw above and it's
5301     // faster to perform another branch before comparing a1 and a2
5302     cmp(cnt1, (u1)elem_per_word);
5303     br(LE, SHORT); // short or same
5304     ldr(tmp3, Address(pre(a1, base_offset)));
5305     subs(zr, cnt1, stubBytesThreshold);
5306     br(GE, STUB);
5307     ldr(tmp4, Address(pre(a2, base_offset)));
5308     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5309     cmp(cnt2, cnt1);
5310     br(NE, DONE);
5311 
5312     // Main 16 byte comparison loop with 2 exits
5313     bind(NEXT_DWORD); {
5314       ldr(tmp1, Address(pre(a1, wordSize)));
5315       ldr(tmp2, Address(pre(a2, wordSize)));
5316       subs(cnt1, cnt1, 2 * elem_per_word);
5317       br(LE, TAIL);
5318       eor(tmp4, tmp3, tmp4);
5319       cbnz(tmp4, DONE);
5320       ldr(tmp3, Address(pre(a1, wordSize)));
5321       ldr(tmp4, Address(pre(a2, wordSize)));
5322       cmp(cnt1, (u1)elem_per_word);
5323       br(LE, TAIL2);
5324       cmp(tmp1, tmp2);
5325     } br(EQ, NEXT_DWORD);
5326     b(DONE);
5327 
5328     bind(TAIL);
5329     eor(tmp4, tmp3, tmp4);
5330     eor(tmp2, tmp1, tmp2);
5331     lslv(tmp2, tmp2, tmp5);
5332     orr(tmp5, tmp4, tmp2);
5333     cmp(tmp5, zr);
5334     b(CSET_EQ);
5335 
5336     bind(TAIL2);
5337     eor(tmp2, tmp1, tmp2);
5338     cbnz(tmp2, DONE);
5339     b(LAST_CHECK);
5340 
5341     bind(STUB);
5342     ldr(tmp4, Address(pre(a2, base_offset)));
5343     cmp(cnt2, cnt1);
5344     br(NE, DONE);
5345     if (elem_size == 2) { // convert to byte counter
5346       lsl(cnt1, cnt1, 1);
5347     }
5348     eor(tmp5, tmp3, tmp4);
5349     cbnz(tmp5, DONE);
5350     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5351     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5352     trampoline_call(stub);
5353     b(DONE);
5354 
5355     bind(EARLY_OUT);
5356     // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
5357     // so, if a2 == null => return false(0), else return true, so we can return a2
5358     mov(result, a2);
5359     b(DONE);
5360     bind(SHORT);
5361     cmp(cnt2, cnt1);
5362     br(NE, DONE);
5363     cbz(cnt1, SAME);
5364     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5365     ldr(tmp3, Address(a1, base_offset));
5366     ldr(tmp4, Address(a2, base_offset));
5367     bind(LAST_CHECK);
5368     eor(tmp4, tmp3, tmp4);
5369     lslv(tmp5, tmp4, tmp5);
5370     cmp(tmp5, zr);
5371     bind(CSET_EQ);
5372     cset(result, EQ);
5373     b(DONE);
5374   }
5375 
5376   bind(SAME);
5377   mov(result, true);
5378   // That's it.
5379   bind(DONE);
5380 
5381   BLOCK_COMMENT("} array_equals");
5382 }
5383 
5384 // Compare Strings
5385 
5386 // For Strings we're passed the address of the first characters in a1
5387 // and a2 and the length in cnt1.
5388 // elem_size is the element size in bytes: either 1 or 2.
5389 // There are two implementations.  For arrays >= 8 bytes, all
5390 // comparisons (including the final one, which may overlap) are
5391 // performed 8 bytes at a time.  For strings < 8 bytes, we compare a
5392 // word, then a halfword, and then a byte.
5393 
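     // A rough C sketch of the assumed semantics (cnt1 is a byte count;
     // illustration only):
     //
     //   bool string_equals(void *a1, void *a2, int cnt1) {
     //     return memcmp(a1, a2, cnt1) == 0;   // done 8/4/2/1 bytes at a time
     //   }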
5394 void MacroAssembler::string_equals(Register a1, Register a2,
5395                                    Register result, Register cnt1, int elem_size)
5396 {
5397   Label SAME, DONE, SHORT, NEXT_WORD;
5398   Register tmp1 = rscratch1;
5399   Register tmp2 = rscratch2;
5400   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5401 
5402   assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
5403   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5404 
5405 #ifndef PRODUCT
5406   {
5407     const char kind = (elem_size == 2) ? 'U' : 'L';
5408     char comment[64];
5409     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5410     BLOCK_COMMENT(comment);
5411   }
5412 #endif
5413 
5414   mov(result, false);
5415 
5416   // Check for short strings, i.e. smaller than wordSize.
5417   subs(cnt1, cnt1, wordSize);
5418   br(Assembler::LT, SHORT);
5419   // Main 8 byte comparison loop.
5420   bind(NEXT_WORD); {
5421     ldr(tmp1, Address(post(a1, wordSize)));
5422     ldr(tmp2, Address(post(a2, wordSize)));
5423     subs(cnt1, cnt1, wordSize);
5424     eor(tmp1, tmp1, tmp2);
5425     cbnz(tmp1, DONE);
5426   } br(GT, NEXT_WORD);
5427   // Last longword.  In the case where length == 4 we compare the
5428   // same longword twice, but that's still faster than another
5429   // conditional branch.
5430   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5431   // length == 4.
5432   ldr(tmp1, Address(a1, cnt1));
5433   ldr(tmp2, Address(a2, cnt1));
5434   eor(tmp2, tmp1, tmp2);
5435   cbnz(tmp2, DONE);
5436   b(SAME);
5437 
5438   bind(SHORT);
5439   Label TAIL03, TAIL01;
5440 
5441   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5442   {
5443     ldrw(tmp1, Address(post(a1, 4)));
5444     ldrw(tmp2, Address(post(a2, 4)));
5445     eorw(tmp1, tmp1, tmp2);
5446     cbnzw(tmp1, DONE);
5447   }
5448   bind(TAIL03);
5449   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5450   {
5451     ldrh(tmp1, Address(post(a1, 2)));
5452     ldrh(tmp2, Address(post(a2, 2)));
5453     eorw(tmp1, tmp1, tmp2);
5454     cbnzw(tmp1, DONE);
5455   }
5456   bind(TAIL01);
5457   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5458     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5459     {
5460       ldrb(tmp1, a1);
5461       ldrb(tmp2, a2);
5462       eorw(tmp1, tmp1, tmp2);
5463       cbnzw(tmp1, DONE);
5464     }
5465   }
5466   // Arrays are equal.
5467   bind(SAME);
5468   mov(result, true);
5469 
5470   // That's it.
5471   bind(DONE);
5472   BLOCK_COMMENT("} string_equals");
5473 }
5474 
5475 
5476 // The size of the blocks erased by the zero_blocks stub.  We must
5477 // handle anything smaller than this ourselves in zero_words().
5478 const int MacroAssembler::zero_words_block_size = 8;
5479 
5480 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5481 // possible, handling small word counts locally and delegating
5482 // anything larger to the zero_blocks stub.  It is expanded many times
5483 // in compiled code, so it is important to keep it short.
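     //
     // A rough sketch of the intended structure (illustration only):
     //
     //   if (cnt >= zero_words_block_size)
     //     call the zero_blocks stub;     // assumed to leave cnt < 8 words
     //   // then clear the remaining 0..7 words with unrolled stp/str,
     //   // testing one bit of cnt at a time (4, then 2, then 1).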
5484 
5485 // ptr:   Address of a buffer to be zeroed.
5486 // cnt:   Count in HeapWords.
5487 //
5488 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5489 void MacroAssembler::zero_words(Register ptr, Register cnt)
5490 {
5491   assert(is_power_of_2(zero_words_block_size), "adjust this");
5492   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5493 
5494   BLOCK_COMMENT("zero_words {");
5495   cmp(cnt, (u1)zero_words_block_size);
5496   Label around;
5497   br(LO, around);
5498   {
5499     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5500     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5501     if (StubRoutines::aarch64::complete()) {
5502       trampoline_call(zero_blocks);
5503     } else {
5504       bl(zero_blocks);
5505     }
5506   }
5507   bind(around);
5508   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5509     Label l;
5510     tbz(cnt, exact_log2(i), l);
5511     for (int j = 0; j < i; j += 2) {
5512       stp(zr, zr, post(ptr, 16));
5513     }
5514     bind(l);
5515   }
5516   {
5517     Label l;
5518     tbz(cnt, 0, l);
5519     str(zr, Address(ptr));
5520     bind(l);
5521   }
5522   BLOCK_COMMENT("} zero_words");
5523 }
5524 
5525 // base:         Address of a buffer to be zeroed, 8 bytes aligned.
5526 // cnt:          Immediate count in HeapWords.
5527 #define SmallArraySize (18 * BytesPerLong)
5528 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
5529 {
5530   BLOCK_COMMENT("zero_words {");
5531   int i = cnt & 1;  // store any odd word to start
5532   if (i) str(zr, Address(base));
5533 
5534   if (cnt <= SmallArraySize / BytesPerLong) {
5535     for (; i < (int)cnt; i += 2)
5536       stp(zr, zr, Address(base, i * wordSize));
5537   } else {
5538     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
5539     int remainder = cnt % (2 * unroll);
5540     for (; i < remainder; i += 2)
5541       stp(zr, zr, Address(base, i * wordSize));
5542 
5543     Label loop;
5544     Register cnt_reg = rscratch1;
5545     Register loop_base = rscratch2;
5546     cnt = cnt - remainder;
5547     mov(cnt_reg, cnt);
5548     // adjust base and prebias by -2 * wordSize so we can pre-increment
5549     add(loop_base, base, (remainder - 2) * wordSize);
5550     bind(loop);
5551     sub(cnt_reg, cnt_reg, 2 * unroll);
5552     for (i = 1; i < unroll; i++)
5553       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
5554     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
5555     cbnz(cnt_reg, loop);
5556   }
5557   BLOCK_COMMENT("} zero_words");
5558 }
5559 
5560 // Zero blocks of memory by using DC ZVA.
5561 //
5562 // Aligns the base address first sufficiently for DC ZVA, then uses
5563 // DC ZVA repeatedly for every full block.  cnt is the size to be
5564 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5565 // in cnt.
5566 //
5567 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5568 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
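     //
     // A rough C sketch of the generated code (illustration only; zva_length
     // is the DC ZVA block size in bytes):
     //
     //   if (base & 0xf) return;                     // caller handles this case
     //   while (base & (zva_length - 1)) {           // align base to a block
     //     *(uint64_t *)base = 0; base += 8; cnt--;  //   (done with stp pairs)
     //   }
     //   while (cnt >= zva_length / 8) {             // zero whole blocks
     //     dc_zva(base); base += zva_length; cnt -= zva_length / 8;
     //   }
     //   return;   // cnt words are left for the caller to zero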
5569 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5570   Register tmp = rscratch1;
5571   Register tmp2 = rscratch2;
5572   int zva_length = VM_Version::zva_length();
5573   Label initial_table_end, loop_zva;
5574   Label fini;
5575 
5576   // Base must be 16 byte aligned. If not just return and let caller handle it
5577   tst(base, 0x0f);
5578   br(Assembler::NE, fini);
5579   // Align base with ZVA length.
5580   neg(tmp, base);
5581   andr(tmp, tmp, zva_length - 1);
5582 
5583   // tmp: the number of bytes to be filled to align the base with ZVA length.
5584   add(base, base, tmp);
5585   sub(cnt, cnt, tmp, Assembler::ASR, 3);
5586   adr(tmp2, initial_table_end);
5587   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5588   br(tmp2);
5589 
5590   for (int i = -zva_length + 16; i < 0; i += 16)
5591     stp(zr, zr, Address(base, i));
5592   bind(initial_table_end);
5593 
5594   sub(cnt, cnt, zva_length >> 3);
5595   bind(loop_zva);
5596   dc(Assembler::ZVA, base);
5597   subs(cnt, cnt, zva_length >> 3);
5598   add(base, base, zva_length);
5599   br(Assembler::GE, loop_zva);
5600   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5601   bind(fini);
5602 }
5603 
5604 // base:   Address of a buffer to be filled, 8 bytes aligned.
5605 // cnt:    Count in 8-byte unit.
5606 // value:  Value to be filled with.
5607 // base will point to the end of the buffer after filling.
5608 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5609 {
5610 //  Algorithm:
5611 //
5612 //    scratch1 = cnt & 7;
5613 //    cnt -= scratch1;
5614 //    p += scratch1;
5615 //    switch (scratch1) {
5616 //      do {
5617 //        cnt -= 8;
5618 //          p[-8] = v;
5619 //        case 7:
5620 //          p[-7] = v;
5621 //        case 6:
5622 //          p[-6] = v;
5623 //          // ...
5624 //        case 1:
5625 //          p[-1] = v;
5626 //        case 0:
5627 //          p += 8;
5628 //      } while (cnt);
5629 //    }
5630 
5631   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5632 
5633   Label fini, skip, entry, loop;
5634   const int unroll = 8; // Number of stp instructions we'll unroll
5635 
5636   cbz(cnt, fini);
5637   tbz(base, 3, skip);
5638   str(value, Address(post(base, 8)));
5639   sub(cnt, cnt, 1);
5640   bind(skip);
5641 
5642   andr(rscratch1, cnt, (unroll-1) * 2);
5643   sub(cnt, cnt, rscratch1);
5644   add(base, base, rscratch1, Assembler::LSL, 3);
5645   adr(rscratch2, entry);
5646   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5647   br(rscratch2);
5648 
5649   bind(loop);
5650   add(base, base, unroll * 16);
5651   for (int i = -unroll; i < 0; i++)
5652     stp(value, value, Address(base, i * 16));
5653   bind(entry);
5654   subs(cnt, cnt, unroll * 2);
5655   br(Assembler::GE, loop);
5656 
5657   tbz(cnt, 0, fini);
5658   str(value, Address(post(base, 8)));
5659   bind(fini);
5660 }
5661 
5662 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5663 // java/lang/StringUTF16.compress.
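     // A rough C sketch of the assumed contract (illustration only):
     //
     //   int encode_iso_array(jchar *src, jbyte *dst, int len) {
     //     int i;
     //     for (i = 0; i < len; i++) {
     //       if (src[i] > 0xff) break;   // not Latin-1 encodable, stop here
     //       dst[i] = (jbyte)src[i];
     //     }
     //     return i;                     // number of characters encoded
     //   }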
5664 void MacroAssembler::encode_iso_array(Register src, Register dst,
5665                       Register len, Register result,
5666                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5667                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5668 {
5669     Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
5670         NEXT_32_START, NEXT_32_PRFM_START;
5671     Register tmp1 = rscratch1, tmp2 = rscratch2;
5672 
5673       mov(result, len); // Save initial len
5674 
5675       cmp(len, (u1)8); // handle shortest strings first
5676       br(LT, LOOP_1);
5677       cmp(len, (u1)32);
5678       br(LT, NEXT_8);
5679       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
5680       // to convert chars to bytes
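           // (uzp1 keeps the even-indexed bytes of its two sources, i.e. the low
           //  byte of each little-endian char; uzp2 keeps the odd-indexed bytes,
           //  i.e. the high bytes, which must all be zero for the store to be valid.)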
5681       if (SoftwarePrefetchHintDistance >= 0) {
5682         ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5683         subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5684         br(LE, NEXT_32_START);
5685         b(NEXT_32_PRFM_START);
5686         BIND(NEXT_32_PRFM);
5687           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5688         BIND(NEXT_32_PRFM_START);
5689           prfm(Address(src, SoftwarePrefetchHintDistance));
5690           orr(v4, T16B, Vtmp1, Vtmp2);
5691           orr(v5, T16B, Vtmp3, Vtmp4);
5692           uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
5693           uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
5694           uzp2(v5, T16B, v4, v5); // high bytes
5695           umov(tmp2, v5, D, 1);
5696           fmovd(tmp1, v5);
5697           orr(tmp1, tmp1, tmp2);
5698           cbnz(tmp1, LOOP_8);
5699           stpq(Vtmp1, Vtmp3, dst);
5700           sub(len, len, 32);
5701           add(dst, dst, 32);
5702           add(src, src, 64);
5703           subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5704           br(GE, NEXT_32_PRFM);
5705           cmp(len, (u1)32);
5706           br(LT, LOOP_8);
5707         BIND(NEXT_32);
5708           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5709         BIND(NEXT_32_START);
5710       } else {
5711         BIND(NEXT_32);
5712           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5713       }
5714       prfm(Address(src, SoftwarePrefetchHintDistance));
5715       uzp1(v4, T16B, Vtmp1, Vtmp2);
5716       uzp1(v5, T16B, Vtmp3, Vtmp4);
5717       orr(Vtmp1, T16B, Vtmp1, Vtmp2);
5718       orr(Vtmp3, T16B, Vtmp3, Vtmp4);
5719       uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
5720       umov(tmp2, Vtmp1, D, 1);
5721       fmovd(tmp1, Vtmp1);
5722       orr(tmp1, tmp1, tmp2);
5723       cbnz(tmp1, LOOP_8);
5724       stpq(v4, v5, dst);
5725       sub(len, len, 32);
5726       add(dst, dst, 32);
5727       add(src, src, 64);
5728       cmp(len, (u1)32);
5729       br(GE, NEXT_32);
5730       cbz(len, DONE);
5731 
5732     BIND(LOOP_8);
5733       cmp(len, (u1)8);
5734       br(LT, LOOP_1);
5735     BIND(NEXT_8);
5736       ld1(Vtmp1, T8H, src);
5737       uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
5738       uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
5739       fmovd(tmp1, Vtmp3);
5740       cbnz(tmp1, NEXT_1);
5741       strd(Vtmp2, dst);
5742 
5743       sub(len, len, 8);
5744       add(dst, dst, 8);
5745       add(src, src, 16);
5746       cmp(len, (u1)8);
5747       br(GE, NEXT_8);
5748 
5749     BIND(LOOP_1);
5750 
5751     cbz(len, DONE);
5752     BIND(NEXT_1);
5753       ldrh(tmp1, Address(post(src, 2)));
5754       tst(tmp1, 0xff00);
5755       br(NE, SET_RESULT);
5756       strb(tmp1, Address(post(dst, 1)));
5757       subs(len, len, 1);
5758       br(GT, NEXT_1);
5759 
5760     BIND(SET_RESULT);
5761       sub(result, result, len); // Return index where we stopped
5762                                 // Return len == 0 if we processed all
5763                                 // characters
5764     BIND(DONE);
5765 }
5766 
5767 
5768 // Inflate byte[] array to char[].
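     // A rough C sketch of the assumed contract (illustration only):
     //
     //   void byte_array_inflate(jbyte *src, jchar *dst, int len) {
     //     for (int i = 0; i < len; i++)
     //       dst[i] = (jchar)(src[i] & 0xff);   // zero-extend each byte
     //   }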
5769 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5770                                         FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
5771                                         Register tmp4) {
5772   Label big, done, after_init, to_stub;
5773 
5774   assert_different_registers(src, dst, len, tmp4, rscratch1);
5775 
5776   fmovd(vtmp1, zr);
5777   lsrw(tmp4, len, 3);
5778   bind(after_init);
5779   cbnzw(tmp4, big);
5780   // Short string: less than 8 bytes.
5781   {
5782     Label loop, tiny;
5783 
5784     cmpw(len, 4);
5785     br(LT, tiny);
5786     // Use SIMD to do 4 bytes.
5787     ldrs(vtmp2, post(src, 4));
5788     zip1(vtmp3, T8B, vtmp2, vtmp1);
5789     subw(len, len, 4);
5790     strd(vtmp3, post(dst, 8));
5791 
5792     cbzw(len, done);
5793 
5794     // Do the remaining bytes by steam.
5795     bind(loop);
5796     ldrb(tmp4, post(src, 1));
5797     strh(tmp4, post(dst, 2));
5798     subw(len, len, 1);
5799 
5800     bind(tiny);
5801     cbnz(len, loop);
5802 
5803     b(done);
5804   }
5805 
5806   if (SoftwarePrefetchHintDistance >= 0) {
5807     bind(to_stub);
5808       RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5809       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5810       trampoline_call(stub);
5811       b(after_init);
5812   }
5813 
5814   // Unpack the bytes 8 at a time.
5815   bind(big);
5816   {
5817     Label loop, around, loop_last, loop_start;
5818 
5819     if (SoftwarePrefetchHintDistance >= 0) {
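      // Arrays with at least large_loop_threshold 8-byte groups are handed
      // off to the large_byte_array_inflate stub; smaller ones are inflated
      // here, 16 input bytes per iteration, with the first load hoisted out
      // of the loop.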
5820       const int large_loop_threshold = (64 + 16)/8;
5821       ldrd(vtmp2, post(src, 8));
5822       andw(len, len, 7);
5823       cmp(tmp4, (u1)large_loop_threshold);
5824       br(GE, to_stub);
5825       b(loop_start);
5826 
5827       bind(loop);
5828       ldrd(vtmp2, post(src, 8));
5829       bind(loop_start);
5830       subs(tmp4, tmp4, 1);
5831       br(EQ, loop_last);
5832       zip1(vtmp2, T16B, vtmp2, vtmp1);
5833       ldrd(vtmp3, post(src, 8));
5834       st1(vtmp2, T8H, post(dst, 16));
5835       subs(tmp4, tmp4, 1);
5836       zip1(vtmp3, T16B, vtmp3, vtmp1);
5837       st1(vtmp3, T8H, post(dst, 16));
5838       br(NE, loop);
5839       b(around);
5840       bind(loop_last);
5841       zip1(vtmp2, T16B, vtmp2, vtmp1);
5842       st1(vtmp2, T8H, post(dst, 16));
5843       bind(around);
5844       cbz(len, done);
5845     } else {
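      // No software prefetch hints: simple loop inflating 8 bytes to 8 chars
      // per iteration.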
5846       andw(len, len, 7);
5847       bind(loop);
5848       ldrd(vtmp2, post(src, 8));
5849       sub(tmp4, tmp4, 1);
5850       zip1(vtmp3, T16B, vtmp2, vtmp1);
5851       st1(vtmp3, T8H, post(dst, 16));
5852       cbnz(tmp4, loop);
5853     }
5854   }
5855 
  // Do the tail of up to 8 bytes by re-reading the last 8 source bytes and
  // re-writing the last 16 destination bytes; overlapping work already done
  // is harmless because the data is the same.
5857   add(src, src, len);
5858   ldrd(vtmp3, Address(src, -8));
5859   add(dst, dst, len, ext::uxtw, 1);
5860   zip1(vtmp3, T16B, vtmp3, vtmp1);
5861   strq(vtmp3, Address(dst, -16));
5862 
5863   bind(done);
5864 }
5865 
5866 // Compress char[] array to byte[].
5867 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5868                                          FloatRegister tmp1Reg, FloatRegister tmp2Reg,
5869                                          FloatRegister tmp3Reg, FloatRegister tmp4Reg,
5870                                          Register result) {
5871   encode_iso_array(src, dst, len, result,
5872                    tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
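  // encode_iso_array leaves len == 0 only if every char was compressed;
  // otherwise return 0 instead of the index reached.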
5873   cmp(len, zr);
5874   csel(result, result, zr, EQ);
5875 }
5876 
5877 // get_thread() can be called anywhere inside generated code so we
5878 // need to save whatever non-callee save context might get clobbered
5879 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5880 // the call setup code.
5881 //
5882 // aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5883 //
5884 void MacroAssembler::get_thread(Register dst) {
5885   RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
5886   push(saved_regs, sp);
5887 
5888   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5889   blr(lr);
5890   if (dst != c_rarg0) {
5891     mov(dst, c_rarg0);
5892   }
5893 
5894   pop(saved_regs, sp);
5895 }
5896 
// C2 compiled method's prolog code
// Moved here from aarch64.ad to support the Valhalla code below
5899 void MacroAssembler::verified_entry(Compile* C, int sp_inc) {
5900 
  // n.b. frame size includes space for return pc and rfp
5902   const long framesize = C->frame_size_in_bytes();
5903   assert(framesize % (2 * wordSize) == 0, "must preserve 2 * wordSize alignment");
5904 
5905   // insert a nop at the start of the prolog so we can patch in a
5906   // branch if we need to invalidate the method later
5907   nop();
5908 
5909   int bangsize = C->bang_size_in_bytes();
5910   if (C->need_stack_bang(bangsize) && UseStackBanging)
5911      generate_stack_overflow_check(bangsize);
5912 
5913   build_frame(framesize);
5914 
5915   if (NotifySimulator) {
5916     notify(Assembler::method_entry);
5917   }
5918 
5919   if (VerifyStackAtCalls) {
5920     Unimplemented();
5921   }
5922 }
5923 
5924 int MacroAssembler::store_value_type_fields_to_buf(ciValueKlass* vk, bool from_interpreter) {
5925   // A value type might be returned. If fields are in registers we
5926   // need to allocate a value type instance and initialize it with
5927   // the value of the fields.
5928   Label skip;
5929   // We only need a new buffered value if a new one is not returned
5930   cmp(r0, (u1) 1);
5931   br(Assembler::EQ, skip);
5932   int call_offset = -1;
5933 
5934   Label slow_case;
5935 
5936   // Try to allocate a new buffered value (from the heap)
5937   if (UseTLAB) {
5938 
5939     if (vk != NULL) {
5940       // Called from C1, where the return type is statically known.
5941       mov(r1, (intptr_t)vk->get_ValueKlass());
5942       jint lh = vk->layout_helper();
5943       assert(lh != Klass::_lh_neutral_value, "inline class in return type must have been resolved");
5944       mov(r14, lh);
    } else {
      // Called from the interpreter. R0 contains ((the ValueKlass* of the return type) | 0x01)
      andr(r1, r0, -2);
      // get obj size
      ldrw(r14, Address(rscratch1 /*klass*/, Klass::layout_helper_offset()));
    }
5951 
      ldr(r13, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));

      // Check whether we have space in the TLAB:
      // r13 points at the prospective new object, r14 at its end.
      lea(r14, Address(r13, r14));
      ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));

      cmp(r14, rscratch1);
      br(Assembler::GT, slow_case);

      // OK, we have room in the TLAB: set the new TLAB top.
      str(r14, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));

      // Install the always-locked prototype mark word.
      mov(rscratch1, (uint64_t) markWord::always_locked_prototype().value());
      str(rscratch1, Address(r13, oopDesc::mark_offset_in_bytes()));

      store_klass_gap(r13, zr);  // zero klass gap for compressed oops
      if (vk == NULL) {
        // store_klass may corrupt r1, so save it in r0 for later use (interpreter case only).
        mov(r0, r1);
      }

      store_klass(r13, r1);  // klass
5977 
5978       if (vk != NULL) {
5979         // FIXME -- do the packing in-line to avoid the runtime call
5980         mov(r0, r13);
5981         far_call(RuntimeAddress(vk->pack_handler())); // no need for call info as this will not safepoint.
5982       } else {
5983 
        // We have our new buffered value; initialize its fields with a
        // value-class-specific handler.
5986         ldr(r1, Address(r0, InstanceKlass::adr_valueklass_fixed_block_offset()));
5987         ldr(r1, Address(r1, ValueKlass::pack_handler_offset()));
5988 
5989         // Mov new class to r0 and call pack_handler
5990         mov(r0, r13);
5991         blr(r1);
5992       }
5993       b(skip);
5994   }
5995 
5996   bind(slow_case);
  // We failed to allocate a new value, so fall back to a runtime
  // call. Some oop fields may still be live in registers but we can't
  // tell; the runtime call will take care of preserving them across a
  // GC if there is one.

6003   if (from_interpreter) {
6004     super_call_VM_leaf(StubRoutines::store_value_type_fields_to_buf());
6005   } else {
6006     ldr(rscratch1, RuntimeAddress(StubRoutines::store_value_type_fields_to_buf()));
6007     blr(rscratch1);
6008     call_offset = offset();
6009   }
6010 
6011   bind(skip);
6012   return call_offset;
6013 }
6014 
6015 // Move a value between registers/stack slots and update the reg_state
6016 bool MacroAssembler::move_helper(VMReg from, VMReg to, BasicType bt, RegState reg_state[], int ret_off, int extra_stack_offset) {
6017   if (reg_state[to->value()] == reg_written) {
6018     return true; // Already written
6019   }
6020 
6021   if (from != to && bt != T_VOID) {
6022     if (reg_state[to->value()] == reg_readonly) {
6023       return false; // Not yet writable
6024     }
6025     if (from->is_reg()) {
6026       if (to->is_reg()) {
6027         mov(to->as_Register(), from->as_Register());
6028       } else {
6029         int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
6030         Address to_addr = Address(sp, st_off);
6031         if (from->is_FloatRegister()) {
6032           if (bt == T_DOUBLE) {
6033              strd(from->as_FloatRegister(), to_addr);
6034           } else {
6035              assert(bt == T_FLOAT, "must be float");
6036              strs(from->as_FloatRegister(), to_addr);
6037           }
6038         } else {
6039           str(from->as_Register(), to_addr); 
6040         }
6041       }
6042     } else {
6043       Address from_addr = Address(sp, from->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset);
6044       if (to->is_reg()) {
6045         if (to->is_FloatRegister()) {
6046           if (bt == T_DOUBLE) {
6047              ldrd(to->as_FloatRegister(), from_addr);
6048           } else {
6049             assert(bt == T_FLOAT, "must be float");
6050             ldrs(to->as_FloatRegister(), from_addr);
6051           }
6052         } else {
6053           ldr(to->as_Register(), from_addr); 
6054         }
6055       } else {
6056         int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
6057         ldr(rscratch1, from_addr); 
6058         str(rscratch1, Address(sp, st_off));
6059       }
6060     }
6061   }
6062 
6063   // Update register states
6064   reg_state[from->value()] = reg_writable;
6065   reg_state[to->value()] = reg_written;
6066   return true;
6067 }
6068 
6069 // Read all fields from a value type oop and store the values in registers/stack slots
6070 bool MacroAssembler::unpack_value_helper(const GrowableArray<SigEntry>* sig, int& sig_index, VMReg from, VMRegPair* regs_to,
6071                                          int& to_index, RegState reg_state[], int ret_off, int extra_stack_offset) {
6072   Register fromReg = from->is_reg() ? from->as_Register() : noreg;
6073   assert(sig->at(sig_index)._bt == T_VOID, "should be at end delimiter");
6074 
6075 
6076   int vt = 1;
6077   bool done = true;
6078   bool mark_done = true;
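  // Walk the signature backwards from the end delimiter: a T_VOID entry that
  // is not the high word of a long/double opens a nested value type, the
  // matching T_VALUETYPE closes it, and we stop once the outermost value
  // type is complete (vt == 0).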
6079   do {
6080     sig_index--;
6081     BasicType bt = sig->at(sig_index)._bt;
6082     if (bt == T_VALUETYPE) {
6083       vt--;
6084     } else if (bt == T_VOID &&
6085                sig->at(sig_index-1)._bt != T_LONG &&
6086                sig->at(sig_index-1)._bt != T_DOUBLE) {
6087       vt++;
6088     } else if (SigEntry::is_reserved_entry(sig, sig_index)) {
6089       to_index--; // Ignore this
6090     } else {
6091       assert(to_index >= 0, "invalid to_index");
6092       VMRegPair pair_to = regs_to[to_index--];
6093       VMReg to = pair_to.first();
6094 
6095       if (bt == T_VOID) continue;
6096 
6097       int idx = (int) to->value();
6098       if (reg_state[idx] == reg_readonly) {
6099          if (idx != from->value()) {
6100            mark_done = false;
6101          }
6102          done = false;
6103          continue;
6104       } else if (reg_state[idx] == reg_written) {
6105         continue;
6106       } else {
6107         assert(reg_state[idx] == reg_writable, "must be writable");
6108         reg_state[idx] = reg_written;
6109       }
6110 
6111       if (fromReg == noreg) {
6112         int st_off = from->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
6113         ldr(rscratch2, Address(sp, st_off)); 
6114         fromReg = rscratch2;
6115       }
6116 
6117       int off = sig->at(sig_index)._offset;
6118       assert(off > 0, "offset in object should be positive");
6119       bool is_oop = (bt == T_OBJECT || bt == T_ARRAY);
6120 
6121       Address fromAddr = Address(fromReg, off);
6122       bool is_signed = (bt != T_CHAR) && (bt != T_BOOLEAN);
6123 
6124       if (!to->is_FloatRegister()) {
6125 
6126         Register dst = to->is_stack() ? rscratch1 : to->as_Register();
6127 
6128         if (is_oop) {
6129           load_heap_oop(dst, fromAddr);
6130         } else {
6131           load_sized_value(dst, fromAddr, type2aelembytes(bt), is_signed);
6132         }
6133         if (to->is_stack()) {
6134           int st_off = to->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
6135           str(dst, Address(sp, st_off));
6136         }
6137       } else {
6138         if (bt == T_DOUBLE) {
6139           ldrd(to->as_FloatRegister(), fromAddr);
6140         } else {
6141           assert(bt == T_FLOAT, "must be float");
6142           ldrs(to->as_FloatRegister(), fromAddr);
6143         }
6144      }
6145 
6146     }
6147 
6148   } while (vt != 0);
6149 
6150   if (mark_done && reg_state[from->value()] != reg_written) {
6151     // This is okay because no one else will write to that slot
6152     reg_state[from->value()] = reg_writable;
6153   }
6154   return done;
6155 }
6156 
6157 // Pack fields back into a value type oop
6158 bool MacroAssembler::pack_value_helper(const GrowableArray<SigEntry>* sig, int& sig_index, int vtarg_index,
6159                                        VMReg to, VMRegPair* regs_from, int regs_from_count, int& from_index, RegState reg_state[],
6160                                        int ret_off, int extra_stack_offset) {
6161   assert(sig->at(sig_index)._bt == T_VALUETYPE, "should be at end delimiter");
6162   assert(to->is_valid(), "must be");
6163 
6164   if (reg_state[to->value()] == reg_written) {
6165     skip_unpacked_fields(sig, sig_index, regs_from, regs_from_count, from_index);
6166     return true; // Already written
6167   }
6168 
6169   Register val_array = r0;
6170   Register val_obj_tmp = r11;
6171   Register from_reg_tmp = r10;
6172   Register tmp1 = r14;
6173   Register tmp2 = r13;
6174   Register tmp3 = r1;
6175   Register val_obj = to->is_stack() ? val_obj_tmp : to->as_Register();
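  // val_array (r0) holds an oop array of value buffers; the buffer for this
  // argument is loaded into val_obj below.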
6176 
6177   if (reg_state[to->value()] == reg_readonly) {
6178     if (!is_reg_in_unpacked_fields(sig, sig_index, to, regs_from, regs_from_count, from_index)) {
6179       skip_unpacked_fields(sig, sig_index, regs_from, regs_from_count, from_index);
6180       return false; // Not yet writable
6181     }
6182     val_obj = val_obj_tmp;
6183   }
6184 
6185   int index = arrayOopDesc::base_offset_in_bytes(T_OBJECT) + vtarg_index * type2aelembytes(T_VALUETYPE);
6186   load_heap_oop(val_obj, Address(val_array, index));
6187 
6188   ScalarizedValueArgsStream stream(sig, sig_index, regs_from, regs_from_count, from_index);
6189   VMRegPair from_pair;
6190   BasicType bt;
6191 
6192   while (stream.next(from_pair, bt)) {
6193     int off = sig->at(stream.sig_cc_index())._offset;
6194     assert(off > 0, "offset in object should be positive");
6195     bool is_oop = (bt == T_OBJECT || bt == T_ARRAY);
6196     size_t size_in_bytes = is_java_primitive(bt) ? type2aelembytes(bt) : wordSize;
6197 
6198     VMReg from_r1 = from_pair.first();
6199     VMReg from_r2 = from_pair.second();
6200 
6201     // Pack the scalarized field into the value object.
6202     Address dst(val_obj, off);
6203 
6204     if (!from_r1->is_FloatRegister()) {
6205       Register from_reg;
6206       if (from_r1->is_stack()) {
6207         from_reg = from_reg_tmp;
6208         int ld_off = from_r1->reg2stack() * VMRegImpl::stack_slot_size + extra_stack_offset;
6209         load_sized_value(from_reg, Address(sp, ld_off), size_in_bytes, /* is_signed */ false);
6210       } else {
6211         from_reg = from_r1->as_Register();
6212       }
6213 
6214       if (is_oop) {
6215         DecoratorSet decorators = IN_HEAP | ACCESS_WRITE;
6216         store_heap_oop(dst, from_reg, tmp1, tmp2, tmp3, decorators);
6217       } else {
6218         store_sized_value(dst, from_reg, size_in_bytes);
6219       }
6220     } else { 
6221       if (from_r2->is_valid()) {
6222         strd(from_r1->as_FloatRegister(), dst);
6223       } else {
6224         strs(from_r1->as_FloatRegister(), dst);
6225       }
6226     }
6227 
6228     reg_state[from_r1->value()] = reg_writable;
6229   }
6230   sig_index = stream.sig_cc_index();
6231   from_index = stream.regs_cc_index();
6232 
6233   assert(reg_state[to->value()] == reg_writable, "must have already been read");
6234   bool success = move_helper(val_obj->as_VMReg(), to, T_OBJECT, reg_state, ret_off, extra_stack_offset);
6235   assert(success, "to register must be writeable");
6236 
6237   return true;
6238 }
6239 
6240 // Unpack all value type arguments passed as oops
6241 void MacroAssembler::unpack_value_args(Compile* C, bool receiver_only) {
6242   int sp_inc = unpack_value_args_common(C, receiver_only);
6243   // Emit code for verified entry and save increment for stack repair on return
6244   verified_entry(C, sp_inc);
6245 }
6246 
6247 int MacroAssembler::shuffle_value_args(bool is_packing, bool receiver_only, int extra_stack_offset,
6248                                        BasicType* sig_bt, const GrowableArray<SigEntry>* sig_cc,
6249                                        int args_passed, int args_on_stack, VMRegPair* regs,            // from
6250                                        int args_passed_to, int args_on_stack_to, VMRegPair* regs_to) { // to
6251   // Check if we need to extend the stack for packing/unpacking
6252   int sp_inc = (args_on_stack_to - args_on_stack) * VMRegImpl::stack_slot_size;
6253   if (sp_inc > 0) {
6254     sp_inc = align_up(sp_inc, StackAlignmentInBytes);
6255     if (!is_packing) {
6256       // Save the return address, adjust the stack (make sure it is properly
6257       // 16-byte aligned) and copy the return address to the new top of the stack.
6258       // (Note: C1 does this in C1_MacroAssembler::scalarized_entry).
      // FIXME: We don't need to preserve the return address on aarch64
      pop(rscratch1);
      sub(sp, sp, sp_inc);
6262       push(rscratch1);
6263     }
6264   } else {
6265     // The scalarized calling convention needs less stack space than the unscalarized one.
6266     // No need to extend the stack, the caller will take care of these adjustments.
6267     sp_inc = 0;
6268   }
6269 
6270   int ret_off; // make sure we don't overwrite the return address
6271   if (is_packing) {
    // For C1 code, the VVEP doesn't have reserved slots, so we store the return address at
    // sp[0] during shuffling.
6274     ret_off = 0;
6275   } else {
6276     // C2 code ensures that sp_inc is a reserved slot.
6277     ret_off = sp_inc;
6278   }
6279 
6280   return shuffle_value_args_common(is_packing, receiver_only, extra_stack_offset,
6281                                    sig_bt, sig_cc,
6282                                    args_passed, args_on_stack, regs,
6283                                    args_passed_to, args_on_stack_to, regs_to,
6284                                    sp_inc, ret_off);
6285 }
6286 
6287 VMReg MacroAssembler::spill_reg_for(VMReg reg) {
6288   return (reg->is_FloatRegister()) ? v0->as_VMReg() : r14->as_VMReg();
6289 }
6290 
6291 void MacroAssembler::cache_wb(Address line) {
6292   assert(line.getMode() == Address::base_plus_offset, "mode should be base_plus_offset");
6293   assert(line.index() == noreg, "index should be noreg");
6294   assert(line.offset() == 0, "offset should be 0");
6295   // would like to assert this
6296   // assert(line._ext.shift == 0, "shift should be zero");
6297   if (VM_Version::supports_dcpop()) {
    // writeback using clean virtual address to point of persistence
6299     dc(Assembler::CVAP, line.base());
6300   } else {
6301     // no need to generate anything as Unsafe.writebackMemory should
6302     // never invoke this stub
6303   }
6304 }
6305 
6306 void MacroAssembler::cache_wbsync(bool is_pre) {
6307   // we only need a barrier post sync
6308   if (!is_pre) {
6309     membar(Assembler::AnyAny);
6310   }
6311 }