1 /*
   2  * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include <sys/types.h>
  27 
  28 #include "precompiled.hpp"
  29 #include "jvm.h"
  30 #include "asm/assembler.hpp"
  31 #include "asm/assembler.inline.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/cardTable.hpp"
  34 #include "gc/shared/barrierSetAssembler.hpp"
  35 #include "gc/shared/cardTableBarrierSet.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "compiler/disassembler.hpp"
  38 #include "memory/resourceArea.hpp"
  39 #include "memory/universe.hpp"
  40 #include "nativeInst_aarch64.hpp"
  41 #include "oops/accessDecorators.hpp"
  42 #include "oops/compressedOops.inline.hpp"
  43 #include "oops/klass.inline.hpp"
  44 #include "runtime/biasedLocking.hpp"
  45 #include "runtime/icache.hpp"
  46 #include "runtime/interfaceSupport.inline.hpp"
  47 #include "runtime/jniHandles.inline.hpp"
  48 #include "runtime/sharedRuntime.hpp"
  49 #include "runtime/thread.hpp"
  50 #include "utilities/powerOfTwo.hpp"
  51 #ifdef COMPILER1
  52 #include "c1/c1_LIRAssembler.hpp"
  53 #endif
  54 #ifdef COMPILER2
  55 #include "oops/oop.hpp"
  56 #include "opto/compile.hpp"
  57 #include "opto/intrinsicnode.hpp"
  58 #include "opto/node.hpp"
  59 #endif
  60 
  61 #ifdef PRODUCT
  62 #define BLOCK_COMMENT(str) /* nothing */
  63 #define STOP(error) stop(error)
  64 #else
  65 #define BLOCK_COMMENT(str) block_comment(str)
  66 #define STOP(error) block_comment(error); stop(error)
  67 #endif
  68 
  69 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  70 
   71 // Patch any kind of instruction so that it refers to target; the patch
   72 // may span several instructions.  Return the patched length in bytes.
  73 int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  74   int instructions = 1;
  75   assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
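        // For the branch and load-literal forms below the immediate is encoded
        // in units of 4-byte instructions, hence the >> 2; the PC-relative
        // (adr/adrp) case recomputes a byte/page offset for itself.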
  76   long offset = (target - branch) >> 2;
  77   unsigned insn = *(unsigned*)branch;
  78   if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
  79     // Load register (literal)
  80     Instruction_aarch64::spatch(branch, 23, 5, offset);
  81   } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
  82     // Unconditional branch (immediate)
  83     Instruction_aarch64::spatch(branch, 25, 0, offset);
  84   } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
  85     // Conditional branch (immediate)
  86     Instruction_aarch64::spatch(branch, 23, 5, offset);
  87   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
  88     // Compare & branch (immediate)
  89     Instruction_aarch64::spatch(branch, 23, 5, offset);
  90   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
  91     // Test & branch (immediate)
  92     Instruction_aarch64::spatch(branch, 18, 5, offset);
  93   } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
  94     // PC-rel. addressing
  95     offset = target-branch;
  96     int shift = Instruction_aarch64::extract(insn, 31, 31);
  97     if (shift) {
  98       u_int64_t dest = (u_int64_t)target;
  99       uint64_t pc_page = (uint64_t)branch >> 12;
 100       uint64_t adr_page = (uint64_t)target >> 12;
 101       unsigned offset_lo = dest & 0xfff;
 102       offset = adr_page - pc_page;
 103 
 104       // We handle 4 types of PC relative addressing
 105       //   1 - adrp    Rx, target_page
 106       //       ldr/str Ry, [Rx, #offset_in_page]
 107       //   2 - adrp    Rx, target_page
 108       //       add     Ry, Rx, #offset_in_page
 109       //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 110       //       movk    Rx, #imm16<<32
 111       //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 112       // In the first 3 cases we must check that Rx is the same in the adrp and the
 113       // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
 114       // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
 115       // to be followed by a random unrelated ldr/str, add or movk instruction.
 116       //
 117       unsigned insn2 = ((unsigned*)branch)[1];
 118       if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
 119                 Instruction_aarch64::extract(insn, 4, 0) ==
 120                         Instruction_aarch64::extract(insn2, 9, 5)) {
 121         // Load/store register (unsigned immediate)
 122         unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
 123         Instruction_aarch64::patch(branch + sizeof (unsigned),
 124                                     21, 10, offset_lo >> size);
 125         guarantee(((dest >> size) << size) == dest, "misaligned target");
 126         instructions = 2;
 127       } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
 128                 Instruction_aarch64::extract(insn, 4, 0) ==
 129                         Instruction_aarch64::extract(insn2, 4, 0)) {
 130         // add (immediate)
 131         Instruction_aarch64::patch(branch + sizeof (unsigned),
 132                                    21, 10, offset_lo);
 133         instructions = 2;
 134       } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
 135                    Instruction_aarch64::extract(insn, 4, 0) ==
 136                      Instruction_aarch64::extract(insn2, 4, 0)) {
 137         // movk #imm16<<32
 138         Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
 139         long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
 140         long pc_page = (long)branch >> 12;
 141         long adr_page = (long)dest >> 12;
 142         offset = adr_page - pc_page;
 143         instructions = 2;
 144       }
 145     }
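          // adr/adrp split their signed 21-bit immediate across two fields:
          // the low two bits go in immlo (bits 30:29), the remaining nineteen
          // in immhi (bits 23:5).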
 146     int offset_lo = offset & 3;
 147     offset >>= 2;
 148     Instruction_aarch64::spatch(branch, 23, 5, offset);
 149     Instruction_aarch64::patch(branch, 30, 29, offset_lo);
 150   } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
 151     u_int64_t dest = (u_int64_t)target;
 152     // Move wide constant
 153     assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
 154     assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
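        // Write the 48-bit constant as three 16-bit pieces: the movz takes
        // bits 0-15, the two movk instructions take bits 16-31 and 32-47.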
 155     Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
 156     Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
 157     Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
 158     assert(target_addr_for_insn(branch) == target, "should be");
 159     instructions = 3;
 160   } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
 161              Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
 162     // nothing to do
 163     assert(target == 0, "did not expect to relocate target for polling page load");
 164   } else {
 165     ShouldNotReachHere();
 166   }
 167   return instructions * NativeInstruction::instruction_size;
 168 }
 169 
 170 int MacroAssembler::patch_oop(address insn_addr, address o) {
 171   int instructions;
 172   unsigned insn = *(unsigned*)insn_addr;
 173   assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
 174 
 175   // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
 176   // narrow OOPs by setting the upper 16 bits in the first
 177   // instruction.
 178   if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
 179     // Move narrow OOP
 180     narrowOop n = CompressedOops::encode((oop)o);
 181     Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
 182     Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
 183     instructions = 2;
 184   } else {
 185     // Move wide OOP
 186     assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
 187     uintptr_t dest = (uintptr_t)o;
 188     Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
 189     Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
 190     Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
 191     instructions = 3;
 192   }
 193   return instructions * NativeInstruction::instruction_size;
 194 }
 195 
 196 int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  197   // Metadata pointers are either narrow (32 bits) or wide (48 bits).
 198   // We encode narrow ones by setting the upper 16 bits in the first
 199   // instruction.
 200   NativeInstruction *insn = nativeInstruction_at(insn_addr);
 201   assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
 202          nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
 203 
 204   Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
 205   Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
 206   return 2 * NativeInstruction::instruction_size;
 207 }
 208 
 209 address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
 210   long offset = 0;
 211   if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
 212     // Load register (literal)
 213     offset = Instruction_aarch64::sextract(insn, 23, 5);
 214     return address(((uint64_t)insn_addr + (offset << 2)));
 215   } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
 216     // Unconditional branch (immediate)
 217     offset = Instruction_aarch64::sextract(insn, 25, 0);
 218   } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
 219     // Conditional branch (immediate)
 220     offset = Instruction_aarch64::sextract(insn, 23, 5);
 221   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
 222     // Compare & branch (immediate)
 223     offset = Instruction_aarch64::sextract(insn, 23, 5);
  224   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
 225     // Test & branch (immediate)
 226     offset = Instruction_aarch64::sextract(insn, 18, 5);
 227   } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
 228     // PC-rel. addressing
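          // Reassemble the 21-bit immediate: immlo sits in bits 30:29, immhi
          // in bits 23:5.  Bit 31 distinguishes adrp (page-scaled) from adr.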
 229     offset = Instruction_aarch64::extract(insn, 30, 29);
 230     offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
 231     int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
 232     if (shift) {
 233       offset <<= shift;
 234       uint64_t target_page = ((uint64_t)insn_addr) + offset;
 235       target_page &= ((uint64_t)-1) << shift;
 236       // Return the target address for the following sequences
 237       //   1 - adrp    Rx, target_page
 238       //       ldr/str Ry, [Rx, #offset_in_page]
 239       //   2 - adrp    Rx, target_page
 240       //       add     Ry, Rx, #offset_in_page
 241       //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
  242       //       movk    Rx, #imm16<<32
 243       //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 244       //
  245       // In the first two cases we check that the register is the same and
 246       // return the target_page + the offset within the page.
 247       // Otherwise we assume it is a page aligned relocation and return
 248       // the target page only.
 249       //
 250       unsigned insn2 = ((unsigned*)insn_addr)[1];
 251       if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
 252                 Instruction_aarch64::extract(insn, 4, 0) ==
 253                         Instruction_aarch64::extract(insn2, 9, 5)) {
 254         // Load/store register (unsigned immediate)
 255         unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
 256         unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
 257         return address(target_page + (byte_offset << size));
 258       } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
 259                 Instruction_aarch64::extract(insn, 4, 0) ==
 260                         Instruction_aarch64::extract(insn2, 4, 0)) {
 261         // add (immediate)
 262         unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
 263         return address(target_page + byte_offset);
 264       } else {
 265         if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110  &&
 266                Instruction_aarch64::extract(insn, 4, 0) ==
 267                  Instruction_aarch64::extract(insn2, 4, 0)) {
 268           target_page = (target_page & 0xffffffff) |
 269                          ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
 270         }
 271         return (address)target_page;
 272       }
 273     } else {
 274       ShouldNotReachHere();
 275     }
 276   } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
 277     u_int32_t *insns = (u_int32_t *)insn_addr;
 278     // Move wide constant: movz, movk, movk.  See movptr().
 279     assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
 280     assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
 281     return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
 282                    + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
 283                    + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
 284   } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
 285              Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
 286     return 0;
 287   } else {
 288     ShouldNotReachHere();
 289   }
 290   return address(((uint64_t)insn_addr + (offset << 2)));
 291 }
 292 
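      // Branch to slow_path if a safepoint is pending: with thread-local polling
      // we test the poll bit in the thread's polling word, otherwise we load the
      // global SafepointSynchronize state and branch if it is not _not_synchronized.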
 293 void MacroAssembler::safepoint_poll(Label& slow_path) {
 294   if (SafepointMechanism::uses_thread_local_poll()) {
 295     ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
 296     tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
 297   } else {
 298     unsigned long offset;
 299     adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
 300     ldrw(rscratch1, Address(rscratch1, offset));
 301     assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
 302     cbnz(rscratch1, slow_path);
 303   }
 304 }
 305 
 306 // Just like safepoint_poll, but use an acquiring load for thread-
 307 // local polling.
 308 //
 309 // We need an acquire here to ensure that any subsequent load of the
 310 // global SafepointSynchronize::_state flag is ordered after this load
  311 // of the local Thread::_polling_page.  We don't want this poll to
 312 // return false (i.e. not safepointing) and a later poll of the global
 313 // SafepointSynchronize::_state spuriously to return true.
 314 //
 315 // This is to avoid a race when we're in a native->Java transition
 316 // racing the code which wakes up from a safepoint.
 317 //
 318 void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
 319   if (SafepointMechanism::uses_thread_local_poll()) {
 320     lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
 321     ldar(rscratch1, rscratch1);
 322     tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
 323   } else {
 324     safepoint_poll(slow_path);
 325   }
 326 }
 327 
 328 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 329   // we must set sp to zero to clear frame
 330   str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));
 331 
 332   // must clear fp, so that compiled frames are not confused; it is
 333   // possible that we need it only for debugging
 334   if (clear_fp) {
 335     str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
 336   }
 337 
 338   // Always clear the pc because it could have been set by make_walkable()
 339   str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
 340 }
 341 
 342 // Calls to C land
 343 //
  344 // When entering C land, the rfp & sp of the last Java frame have to be recorded
 345 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
 346 // has to be reset to 0. This is required to allow proper stack traversal.
 347 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 348                                          Register last_java_fp,
 349                                          Register last_java_pc,
 350                                          Register scratch) {
 351 
 352   if (last_java_pc->is_valid()) {
 353       str(last_java_pc, Address(rthread,
 354                                 JavaThread::frame_anchor_offset()
 355                                 + JavaFrameAnchor::last_Java_pc_offset()));
 356     }
 357 
 358   // determine last_java_sp register
 359   if (last_java_sp == sp) {
 360     mov(scratch, sp);
 361     last_java_sp = scratch;
 362   } else if (!last_java_sp->is_valid()) {
 363     last_java_sp = esp;
 364   }
 365 
 366   str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));
 367 
 368   // last_java_fp is optional
 369   if (last_java_fp->is_valid()) {
 370     str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
 371   }
 372 }
 373 
 374 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 375                                          Register last_java_fp,
 376                                          address  last_java_pc,
 377                                          Register scratch) {
 378   assert(last_java_pc != NULL, "must provide a valid PC");
 379 
 380   adr(scratch, last_java_pc);
 381   str(scratch, Address(rthread,
 382                        JavaThread::frame_anchor_offset()
 383                        + JavaFrameAnchor::last_Java_pc_offset()));
 384 
 385   set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
 386 }
 387 
 388 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 389                                          Register last_java_fp,
 390                                          Label &L,
 391                                          Register scratch) {
 392   if (L.is_bound()) {
 393     set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
 394   } else {
 395     InstructionMark im(this);
 396     L.add_patch_at(code(), locator());
 397     set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
 398   }
 399 }
 400 
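      // Call a destination that may be out of range of a direct bl.  When
      // far_branches() is true the target address is materialised with adrp/add
      // and called through blr; otherwise a plain bl is emitted.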
 401 void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
 402   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
 403   assert(CodeCache::find_blob(entry.target()) != NULL,
 404          "destination of far call not found in code cache");
 405   if (far_branches()) {
 406     unsigned long offset;
 407     // We can use ADRP here because we know that the total size of
 408     // the code cache cannot exceed 2Gb.
 409     adrp(tmp, entry, offset);
 410     add(tmp, tmp, offset);
 411     if (cbuf) cbuf->set_insts_mark();
 412     blr(tmp);
 413   } else {
 414     if (cbuf) cbuf->set_insts_mark();
 415     bl(entry);
 416   }
 417 }
 418 
 419 void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
 420   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
 421   assert(CodeCache::find_blob(entry.target()) != NULL,
 422          "destination of far call not found in code cache");
 423   if (far_branches()) {
 424     unsigned long offset;
 425     // We can use ADRP here because we know that the total size of
 426     // the code cache cannot exceed 2Gb.
 427     adrp(tmp, entry, offset);
 428     add(tmp, tmp, offset);
 429     if (cbuf) cbuf->set_insts_mark();
 430     br(tmp);
 431   } else {
 432     if (cbuf) cbuf->set_insts_mark();
 433     b(entry);
 434   }
 435 }
 436 
 437 void MacroAssembler::reserved_stack_check() {
 438     // testing if reserved zone needs to be enabled
 439     Label no_reserved_zone_enabling;
 440 
 441     ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
 442     cmp(sp, rscratch1);
 443     br(Assembler::LO, no_reserved_zone_enabling);
 444 
 445     enter();   // LR and FP are live.
 446     lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
 447     mov(c_rarg0, rthread);
 448     blr(rscratch1);
 449     leave();
 450 
 451     // We have already removed our own frame.
 452     // throw_delayed_StackOverflowError will think that it's been
 453     // called by our caller.
 454     lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
 455     br(rscratch1);
 456     should_not_reach_here();
 457 
 458     bind(no_reserved_zone_enabling);
 459 }
 460 
 461 int MacroAssembler::biased_locking_enter(Register lock_reg,
 462                                          Register obj_reg,
 463                                          Register swap_reg,
 464                                          Register tmp_reg,
 465                                          bool swap_reg_contains_mark,
 466                                          Label& done,
 467                                          Label* slow_case,
 468                                          BiasedLockingCounters* counters) {
 469   assert(UseBiasedLocking, "why call this otherwise?");
 470   assert_different_registers(lock_reg, obj_reg, swap_reg);
 471 
 472   if (PrintBiasedLockingStatistics && counters == NULL)
 473     counters = BiasedLocking::counters();
 474 
 475   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
 476   assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, "biased locking makes assumptions about bit layout");
 477   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
 478   Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
 479   Address saved_mark_addr(lock_reg, 0);
 480 
 481   // Biased locking
 482   // See whether the lock is currently biased toward our thread and
 483   // whether the epoch is still valid
 484   // Note that the runtime guarantees sufficient alignment of JavaThread
 485   // pointers to allow age to be placed into low bits
 486   // First check to see whether biasing is even enabled for this object
 487   Label cas_label;
 488   int null_check_offset = -1;
 489   if (!swap_reg_contains_mark) {
 490     null_check_offset = offset();
 491     ldr(swap_reg, mark_addr);
 492   }
 493   andr(tmp_reg, swap_reg, markWord::biased_lock_mask_in_place);
 494   cmp(tmp_reg, (u1)markWord::biased_lock_pattern);
 495   br(Assembler::NE, cas_label);
 496   // The bias pattern is present in the object's header. Need to check
 497   // whether the bias owner and the epoch are both still current.
 498   load_prototype_header(tmp_reg, obj_reg);
 499   orr(tmp_reg, tmp_reg, rthread);
 500   eor(tmp_reg, swap_reg, tmp_reg);
 501   andr(tmp_reg, tmp_reg, ~((int) markWord::age_mask_in_place));
 502   if (counters != NULL) {
 503     Label around;
 504     cbnz(tmp_reg, around);
 505     atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
 506     b(done);
 507     bind(around);
 508   } else {
 509     cbz(tmp_reg, done);
 510   }
 511 
 512   Label try_revoke_bias;
 513   Label try_rebias;
 514 
 515   // At this point we know that the header has the bias pattern and
 516   // that we are not the bias owner in the current epoch. We need to
 517   // figure out more details about the state of the header in order to
 518   // know what operations can be legally performed on the object's
 519   // header.
 520 
 521   // If the low three bits in the xor result aren't clear, that means
 522   // the prototype header is no longer biased and we have to revoke
 523   // the bias on this object.
 524   andr(rscratch1, tmp_reg, markWord::biased_lock_mask_in_place);
 525   cbnz(rscratch1, try_revoke_bias);
 526 
 527   // Biasing is still enabled for this data type. See whether the
 528   // epoch of the current bias is still valid, meaning that the epoch
 529   // bits of the mark word are equal to the epoch bits of the
 530   // prototype header. (Note that the prototype header's epoch bits
 531   // only change at a safepoint.) If not, attempt to rebias the object
 532   // toward the current thread. Note that we must be absolutely sure
 533   // that the current epoch is invalid in order to do this because
 534   // otherwise the manipulations it performs on the mark word are
 535   // illegal.
 536   andr(rscratch1, tmp_reg, markWord::epoch_mask_in_place);
 537   cbnz(rscratch1, try_rebias);
 538 
 539   // The epoch of the current bias is still valid but we know nothing
 540   // about the owner; it might be set or it might be clear. Try to
 541   // acquire the bias of the object using an atomic operation. If this
 542   // fails we will go in to the runtime to revoke the object's bias.
 543   // Note that we first construct the presumed unbiased header so we
 544   // don't accidentally blow away another thread's valid bias.
 545   {
 546     Label here;
 547     mov(rscratch1, markWord::biased_lock_mask_in_place | markWord::age_mask_in_place | markWord::epoch_mask_in_place);
 548     andr(swap_reg, swap_reg, rscratch1);
 549     orr(tmp_reg, swap_reg, rthread);
 550     cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
 551     // If the biasing toward our thread failed, this means that
 552     // another thread succeeded in biasing it toward itself and we
 553     // need to revoke that bias. The revocation will occur in the
 554     // interpreter runtime in the slow case.
 555     bind(here);
 556     if (counters != NULL) {
 557       atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
 558                   tmp_reg, rscratch1, rscratch2);
 559     }
 560   }
 561   b(done);
 562 
 563   bind(try_rebias);
 564   // At this point we know the epoch has expired, meaning that the
 565   // current "bias owner", if any, is actually invalid. Under these
 566   // circumstances _only_, we are allowed to use the current header's
 567   // value as the comparison value when doing the cas to acquire the
 568   // bias in the current epoch. In other words, we allow transfer of
 569   // the bias from one thread to another directly in this situation.
 570   //
 571   // FIXME: due to a lack of registers we currently blow away the age
 572   // bits in this situation. Should attempt to preserve them.
 573   {
 574     Label here;
 575     load_prototype_header(tmp_reg, obj_reg);
 576     orr(tmp_reg, rthread, tmp_reg);
 577     cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
 578     // If the biasing toward our thread failed, then another thread
 579     // succeeded in biasing it toward itself and we need to revoke that
 580     // bias. The revocation will occur in the runtime in the slow case.
 581     bind(here);
 582     if (counters != NULL) {
 583       atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
 584                   tmp_reg, rscratch1, rscratch2);
 585     }
 586   }
 587   b(done);
 588 
 589   bind(try_revoke_bias);
 590   // The prototype mark in the klass doesn't have the bias bit set any
 591   // more, indicating that objects of this data type are not supposed
 592   // to be biased any more. We are going to try to reset the mark of
 593   // this object to the prototype value and fall through to the
 594   // CAS-based locking scheme. Note that if our CAS fails, it means
 595   // that another thread raced us for the privilege of revoking the
 596   // bias of this particular object, so it's okay to continue in the
 597   // normal locking code.
 598   //
 599   // FIXME: due to a lack of registers we currently blow away the age
 600   // bits in this situation. Should attempt to preserve them.
 601   {
 602     Label here, nope;
 603     load_prototype_header(tmp_reg, obj_reg);
 604     cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
 605     bind(here);
 606 
 607     // Fall through to the normal CAS-based lock, because no matter what
 608     // the result of the above CAS, some thread must have succeeded in
 609     // removing the bias bit from the object's header.
 610     if (counters != NULL) {
 611       atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
 612                   rscratch1, rscratch2);
 613     }
 614     bind(nope);
 615   }
 616 
 617   bind(cas_label);
 618 
 619   return null_check_offset;
 620 }
 621 
 622 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
 623   assert(UseBiasedLocking, "why call this otherwise?");
 624 
 625   // Check for biased locking unlock case, which is a no-op
 626   // Note: we do not have to check the thread ID for two reasons.
 627   // First, the interpreter checks for IllegalMonitorStateException at
 628   // a higher level. Second, if the bias was revoked while we held the
 629   // lock, the object could not be rebiased toward another thread, so
 630   // the bias bit would be clear.
 631   ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
 632   andr(temp_reg, temp_reg, markWord::biased_lock_mask_in_place);
 633   cmp(temp_reg, (u1)markWord::biased_lock_pattern);
 634   br(Assembler::EQ, done);
 635 }
 636 
 637 static void pass_arg0(MacroAssembler* masm, Register arg) {
 638   if (c_rarg0 != arg ) {
 639     masm->mov(c_rarg0, arg);
 640   }
 641 }
 642 
 643 static void pass_arg1(MacroAssembler* masm, Register arg) {
 644   if (c_rarg1 != arg ) {
 645     masm->mov(c_rarg1, arg);
 646   }
 647 }
 648 
 649 static void pass_arg2(MacroAssembler* masm, Register arg) {
 650   if (c_rarg2 != arg ) {
 651     masm->mov(c_rarg2, arg);
 652   }
 653 }
 654 
 655 static void pass_arg3(MacroAssembler* masm, Register arg) {
 656   if (c_rarg3 != arg ) {
 657     masm->mov(c_rarg3, arg);
 658   }
 659 }
 660 
 661 void MacroAssembler::call_VM_base(Register oop_result,
 662                                   Register java_thread,
 663                                   Register last_java_sp,
 664                                   address  entry_point,
 665                                   int      number_of_arguments,
 666                                   bool     check_exceptions) {
 667    // determine java_thread register
 668   if (!java_thread->is_valid()) {
 669     java_thread = rthread;
 670   }
 671 
 672   // determine last_java_sp register
 673   if (!last_java_sp->is_valid()) {
 674     last_java_sp = esp;
 675   }
 676 
 677   // debugging support
 678   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
 679   assert(java_thread == rthread, "unexpected register");
 680 #ifdef ASSERT
 681   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
 682   // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
 683 #endif // ASSERT
 684 
 685   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
 686   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
 687 
 688   // push java thread (becomes first argument of C function)
 689 
 690   mov(c_rarg0, java_thread);
 691 
 692   // set last Java frame before call
 693   assert(last_java_sp != rfp, "can't use rfp");
 694 
 695   Label l;
 696   set_last_Java_frame(last_java_sp, rfp, l, rscratch1);
 697 
 698   // do the call, remove parameters
 699   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
 700 
 701   // reset last Java frame
 702   // Only interpreter should have to clear fp
 703   reset_last_Java_frame(true);
 704 
 705    // C++ interp handles this in the interpreter
 706   check_and_handle_popframe(java_thread);
 707   check_and_handle_earlyret(java_thread);
 708 
 709   if (check_exceptions) {
 710     // check for pending exceptions (java_thread is set upon return)
 711     ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
 712     Label ok;
 713     cbz(rscratch1, ok);
 714     lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
 715     br(rscratch1);
 716     bind(ok);
 717   }
 718 
 719   // get oop result if there is one and reset the value in the thread
 720   if (oop_result->is_valid()) {
 721     get_vm_result(oop_result, java_thread);
 722   }
 723 }
 724 
 725 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
 726   call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
 727 }
 728 
  729 // Maybe emit a call via a trampoline.  If the code cache is small,
  730 // trampolines won't be emitted.
 731 
 732 address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
 733   assert(JavaThread::current()->is_Compiler_thread(), "just checking");
 734   assert(entry.rspec().type() == relocInfo::runtime_call_type
 735          || entry.rspec().type() == relocInfo::opt_virtual_call_type
 736          || entry.rspec().type() == relocInfo::static_call_type
 737          || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
 738 
 739   // We need a trampoline if branches are far.
 740   if (far_branches()) {
 741     bool in_scratch_emit_size = false;
 742 #ifdef COMPILER2
 743     // We don't want to emit a trampoline if C2 is generating dummy
 744     // code during its branch shortening phase.
 745     CompileTask* task = ciEnv::current()->task();
 746     in_scratch_emit_size =
 747       (task != NULL && is_c2_compile(task->comp_level()) &&
 748        Compile::current()->in_scratch_emit_size());
 749 #endif
 750     if (!in_scratch_emit_size) {
 751       address stub = emit_trampoline_stub(offset(), entry.target());
 752       if (stub == NULL) {
 753         return NULL; // CodeCache is full
 754       }
 755     }
 756   }
 757 
 758   if (cbuf) cbuf->set_insts_mark();
 759   relocate(entry.rspec());
 760   if (!far_branches()) {
 761     bl(entry.target());
 762   } else {
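          // Branch-and-link to the current pc for now; the branch is resolved
          // later, going via the trampoline stub emitted above when the
          // destination is out of range.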
 763     bl(pc());
 764   }
 765   // just need to return a non-null address
 766   return pc();
 767 }
 768 
 769 
 770 // Emit a trampoline stub for a call to a target which is too far away.
 771 //
 772 // code sequences:
 773 //
 774 // call-site:
 775 //   branch-and-link to <destination> or <trampoline stub>
 776 //
 777 // Related trampoline stub for this call site in the stub section:
 778 //   load the call target from the constant pool
 779 //   branch (LR still points to the call site above)
 780 
 781 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
 782                                              address dest) {
 783   // Max stub size: alignment nop, TrampolineStub.
 784   address stub = start_a_stub(NativeInstruction::instruction_size
 785                    + NativeCallTrampolineStub::instruction_size);
 786   if (stub == NULL) {
 787     return NULL;  // CodeBuffer::expand failed
 788   }
 789 
 790   // Create a trampoline stub relocation which relates this trampoline stub
 791   // with the call instruction at insts_call_instruction_offset in the
 792   // instructions code-section.
 793   align(wordSize);
 794   relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
 795                                             + insts_call_instruction_offset));
 796   const int stub_start_offset = offset();
 797 
 798   // Now, create the trampoline stub's code:
 799   // - load the call
 800   // - call
 801   Label target;
 802   ldr(rscratch1, target);
 803   br(rscratch1);
 804   bind(target);
 805   assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
 806          "should be");
 807   emit_int64((int64_t)dest);
 808 
 809   const address stub_start_addr = addr_at(stub_start_offset);
 810 
 811   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
 812 
 813   end_a_stub();
 814   return stub_start_addr;
 815 }
 816 
 817 void MacroAssembler::emit_static_call_stub() {
 818   // CompiledDirectStaticCall::set_to_interpreted knows the
 819   // exact layout of this stub.
 820 
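        // Both constants below are emitted as zero placeholders; they are
        // filled in with the Method* and the i2c entry point when the call
        // site is bound (see the comment above).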
 821   isb();
 822   mov_metadata(rmethod, (Metadata*)NULL);
 823 
 824   // Jump to the entry point of the i2c stub.
 825   movptr(rscratch1, 0);
 826   br(rscratch1);
 827 }
 828 
 829 void MacroAssembler::c2bool(Register x) {
 830   // implements x == 0 ? 0 : 1
 831   // note: must only look at least-significant byte of x
 832   //       since C-style booleans are stored in one byte
 833   //       only! (was bug)
 834   tst(x, 0xff);
 835   cset(x, Assembler::NE);
 836 }
 837 
 838 address MacroAssembler::ic_call(address entry, jint method_index) {
 839   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
 840   // address const_ptr = long_constant((jlong)Universe::non_oop_word());
 841   // unsigned long offset;
 842   // ldr_constant(rscratch2, const_ptr);
 843   movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
 844   return trampoline_call(Address(entry, rh));
 845 }
 846 
 847 // Implementation of call_VM versions
 848 
 849 void MacroAssembler::call_VM(Register oop_result,
 850                              address entry_point,
 851                              bool check_exceptions) {
 852   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
 853 }
 854 
 855 void MacroAssembler::call_VM(Register oop_result,
 856                              address entry_point,
 857                              Register arg_1,
 858                              bool check_exceptions) {
 859   pass_arg1(this, arg_1);
 860   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
 861 }
 862 
 863 void MacroAssembler::call_VM(Register oop_result,
 864                              address entry_point,
 865                              Register arg_1,
 866                              Register arg_2,
 867                              bool check_exceptions) {
 868   assert(arg_1 != c_rarg2, "smashed arg");
 869   pass_arg2(this, arg_2);
 870   pass_arg1(this, arg_1);
 871   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
 872 }
 873 
 874 void MacroAssembler::call_VM(Register oop_result,
 875                              address entry_point,
 876                              Register arg_1,
 877                              Register arg_2,
 878                              Register arg_3,
 879                              bool check_exceptions) {
 880   assert(arg_1 != c_rarg3, "smashed arg");
 881   assert(arg_2 != c_rarg3, "smashed arg");
 882   pass_arg3(this, arg_3);
 883 
 884   assert(arg_1 != c_rarg2, "smashed arg");
 885   pass_arg2(this, arg_2);
 886 
 887   pass_arg1(this, arg_1);
 888   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
 889 }
 890 
 891 void MacroAssembler::call_VM(Register oop_result,
 892                              Register last_java_sp,
 893                              address entry_point,
 894                              int number_of_arguments,
 895                              bool check_exceptions) {
 896   call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
 897 }
 898 
 899 void MacroAssembler::call_VM(Register oop_result,
 900                              Register last_java_sp,
 901                              address entry_point,
 902                              Register arg_1,
 903                              bool check_exceptions) {
 904   pass_arg1(this, arg_1);
 905   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
 906 }
 907 
 908 void MacroAssembler::call_VM(Register oop_result,
 909                              Register last_java_sp,
 910                              address entry_point,
 911                              Register arg_1,
 912                              Register arg_2,
 913                              bool check_exceptions) {
 914 
 915   assert(arg_1 != c_rarg2, "smashed arg");
 916   pass_arg2(this, arg_2);
 917   pass_arg1(this, arg_1);
 918   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
 919 }
 920 
 921 void MacroAssembler::call_VM(Register oop_result,
 922                              Register last_java_sp,
 923                              address entry_point,
 924                              Register arg_1,
 925                              Register arg_2,
 926                              Register arg_3,
 927                              bool check_exceptions) {
 928   assert(arg_1 != c_rarg3, "smashed arg");
 929   assert(arg_2 != c_rarg3, "smashed arg");
 930   pass_arg3(this, arg_3);
 931   assert(arg_1 != c_rarg2, "smashed arg");
 932   pass_arg2(this, arg_2);
 933   pass_arg1(this, arg_1);
 934   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
 935 }
 936 
 937 
 938 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
 939   ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
 940   str(zr, Address(java_thread, JavaThread::vm_result_offset()));
 941   verify_oop(oop_result, "broken oop in call_VM_base");
 942 }
 943 
 944 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
 945   ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
 946   str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
 947 }
 948 
 949 void MacroAssembler::align(int modulus) {
 950   while (offset() % modulus != 0) nop();
 951 }
 952 
 953 // these are no-ops overridden by InterpreterMacroAssembler
 954 
 955 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }
 956 
 957 void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
 958 
 959 
 960 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 961                                                       Register tmp,
 962                                                       int offset) {
 963   intptr_t value = *delayed_value_addr;
 964   if (value != 0)
 965     return RegisterOrConstant(value + offset);
 966 
 967   // load indirectly to solve generation ordering problem
 968   ldr(tmp, ExternalAddress((address) delayed_value_addr));
 969 
 970   if (offset != 0)
 971     add(tmp, tmp, offset);
 972 
 973   return RegisterOrConstant(tmp);
 974 }
 975 
 976 // Look up the method for a megamorphic invokeinterface call.
 977 // The target method is determined by <intf_klass, itable_index>.
 978 // The receiver klass is in recv_klass.
 979 // On success, the result will be in method_result, and execution falls through.
 980 // On failure, execution transfers to the given label.
 981 void MacroAssembler::lookup_interface_method(Register recv_klass,
 982                                              Register intf_klass,
 983                                              RegisterOrConstant itable_index,
 984                                              Register method_result,
 985                                              Register scan_temp,
 986                                              Label& L_no_such_interface,
  987                                              bool return_method) {
 988   assert_different_registers(recv_klass, intf_klass, scan_temp);
 989   assert_different_registers(method_result, intf_klass, scan_temp);
 990   assert(recv_klass != method_result || !return_method,
  991          "recv_klass can be destroyed when method isn't needed");
 992   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
 993          "caller must use same register for non-constant itable index as for method");
 994 
 995   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
 996   int vtable_base = in_bytes(Klass::vtable_start_offset());
 997   int itentry_off = itableMethodEntry::method_offset_in_bytes();
 998   int scan_step   = itableOffsetEntry::size() * wordSize;
 999   int vte_size    = vtableEntry::size_in_bytes();
1000   assert(vte_size == wordSize, "else adjust times_vte_scale");
1001 
1002   ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
1003 
1004   // %%% Could store the aligned, prescaled offset in the klassoop.
1005   // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
1006   lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
1007   add(scan_temp, scan_temp, vtable_base);
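        // scan_temp now points at the first itableOffsetEntry, just past the
        // embedded vtable.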
1008 
1009   if (return_method) {
1010     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1011     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1012     // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
1013     lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
1014     if (itentry_off)
1015       add(recv_klass, recv_klass, itentry_off);
1016   }
1017 
1018   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1019   //   if (scan->interface() == intf) {
1020   //     result = (klass + scan->offset() + itable_index);
1021   //   }
1022   // }
1023   Label search, found_method;
1024 
1025   for (int peel = 1; peel >= 0; peel--) {
1026     ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
1027     cmp(intf_klass, method_result);
1028 
1029     if (peel) {
1030       br(Assembler::EQ, found_method);
1031     } else {
1032       br(Assembler::NE, search);
1033       // (invert the test to fall through to found_method...)
1034     }
1035 
1036     if (!peel)  break;
1037 
1038     bind(search);
1039 
1040     // Check that the previous entry is non-null.  A null entry means that
1041     // the receiver class doesn't implement the interface, and wasn't the
1042     // same as when the caller was compiled.
1043     cbz(method_result, L_no_such_interface);
1044     add(scan_temp, scan_temp, scan_step);
1045   }
1046 
1047   bind(found_method);
1048 
1049   // Got a hit.
1050   if (return_method) {
1051     ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
1052     ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
1053   }
1054 }
1055 
1056 // virtual method calling
1057 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1058                                            RegisterOrConstant vtable_index,
1059                                            Register method_result) {
1060   const int base = in_bytes(Klass::vtable_start_offset());
1061   assert(vtableEntry::size() * wordSize == 8,
1062          "adjust the scaling in the code below");
1063   int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();
1064 
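        // The entry we want is at
        //   recv_klass + vtable_start + vtable_index * wordSize + method_offset.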
1065   if (vtable_index.is_register()) {
1066     lea(method_result, Address(recv_klass,
1067                                vtable_index.as_register(),
1068                                Address::lsl(LogBytesPerWord)));
1069     ldr(method_result, Address(method_result, vtable_offset_in_bytes));
1070   } else {
1071     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
1072     ldr(method_result,
1073         form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
1074   }
1075 }
1076 
1077 void MacroAssembler::check_klass_subtype(Register sub_klass,
1078                            Register super_klass,
1079                            Register temp_reg,
1080                            Label& L_success) {
1081   Label L_failure;
1082   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
1083   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
1084   bind(L_failure);
1085 }
1086 
1087 
1088 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1089                                                    Register super_klass,
1090                                                    Register temp_reg,
1091                                                    Label* L_success,
1092                                                    Label* L_failure,
1093                                                    Label* L_slow_path,
 1094                                                    RegisterOrConstant super_check_offset) {
1095   assert_different_registers(sub_klass, super_klass, temp_reg);
1096   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1097   if (super_check_offset.is_register()) {
1098     assert_different_registers(sub_klass, super_klass,
1099                                super_check_offset.as_register());
1100   } else if (must_load_sco) {
1101     assert(temp_reg != noreg, "supply either a temp or a register offset");
1102   }
1103 
1104   Label L_fallthrough;
1105   int label_nulls = 0;
1106   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1107   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1108   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1109   assert(label_nulls <= 1, "at most one NULL in the batch");
1110 
1111   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1112   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1113   Address super_check_offset_addr(super_klass, sco_offset);
1114 
1115   // Hacked jmp, which may only be used just before L_fallthrough.
1116 #define final_jmp(label)                                                \
1117   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
1118   else                            b(label)                /*omit semi*/
1119 
1120   // If the pointers are equal, we are done (e.g., String[] elements).
1121   // This self-check enables sharing of secondary supertype arrays among
1122   // non-primary types such as array-of-interface.  Otherwise, each such
1123   // type would need its own customized SSA.
1124   // We move this check to the front of the fast path because many
1125   // type checks are in fact trivially successful in this manner,
1126   // so we get a nicely predicted branch right at the start of the check.
1127   cmp(sub_klass, super_klass);
1128   br(Assembler::EQ, *L_success);
1129 
1130   // Check the supertype display:
1131   if (must_load_sco) {
1132     ldrw(temp_reg, super_check_offset_addr);
1133     super_check_offset = RegisterOrConstant(temp_reg);
1134   }
1135   Address super_check_addr(sub_klass, super_check_offset);
1136   ldr(rscratch1, super_check_addr);
1137   cmp(super_klass, rscratch1); // load displayed supertype
1138 
1139   // This check has worked decisively for primary supers.
1140   // Secondary supers are sought in the super_cache ('super_cache_addr').
1141   // (Secondary supers are interfaces and very deeply nested subtypes.)
1142   // This works in the same check above because of a tricky aliasing
1143   // between the super_cache and the primary super display elements.
1144   // (The 'super_check_addr' can address either, as the case requires.)
1145   // Note that the cache is updated below if it does not help us find
1146   // what we need immediately.
1147   // So if it was a primary super, we can just fail immediately.
1148   // Otherwise, it's the slow path for us (no success at this point).
1149 
1150   if (super_check_offset.is_register()) {
1151     br(Assembler::EQ, *L_success);
1152     subs(zr, super_check_offset.as_register(), sc_offset);
1153     if (L_failure == &L_fallthrough) {
1154       br(Assembler::EQ, *L_slow_path);
1155     } else {
1156       br(Assembler::NE, *L_failure);
1157       final_jmp(*L_slow_path);
1158     }
1159   } else if (super_check_offset.as_constant() == sc_offset) {
1160     // Need a slow path; fast failure is impossible.
1161     if (L_slow_path == &L_fallthrough) {
1162       br(Assembler::EQ, *L_success);
1163     } else {
1164       br(Assembler::NE, *L_slow_path);
1165       final_jmp(*L_success);
1166     }
1167   } else {
1168     // No slow path; it's a fast decision.
1169     if (L_failure == &L_fallthrough) {
1170       br(Assembler::EQ, *L_success);
1171     } else {
1172       br(Assembler::NE, *L_failure);
1173       final_jmp(*L_success);
1174     }
1175   }
1176 
1177   bind(L_fallthrough);
1178 
1179 #undef final_jmp
1180 }
1181 
1182 // These two are taken from x86, but they look generally useful
1183 
 1184 // Scans count pointer-sized words at [addr] for an occurrence of value;
 1185 // generic.
1186 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
1187                                 Register scratch) {
1188   Label Lloop, Lexit;
1189   cbz(count, Lexit);
1190   bind(Lloop);
1191   ldr(scratch, post(addr, wordSize));
1192   cmp(value, scratch);
1193   br(EQ, Lexit);
1194   sub(count, count, 1);
1195   cbnz(count, Lloop);
1196   bind(Lexit);
1197 }
1198 
 1199 // Scans count 4-byte words at [addr] for an occurrence of value;
 1200 // generic.
1201 void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
1202                                 Register scratch) {
1203   Label Lloop, Lexit;
1204   cbz(count, Lexit);
1205   bind(Lloop);
1206   ldrw(scratch, post(addr, wordSize));
1207   cmpw(value, scratch);
1208   br(EQ, Lexit);
1209   sub(count, count, 1);
1210   cbnz(count, Lloop);
1211   bind(Lexit);
1212 }
1213 
1214 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1215                                                    Register super_klass,
1216                                                    Register temp_reg,
1217                                                    Register temp2_reg,
1218                                                    Label* L_success,
1219                                                    Label* L_failure,
1220                                                    bool set_cond_codes) {
1221   assert_different_registers(sub_klass, super_klass, temp_reg);
1222   if (temp2_reg != noreg)
1223     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1224 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1225 
1226   Label L_fallthrough;
1227   int label_nulls = 0;
1228   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1229   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1230   assert(label_nulls <= 1, "at most one NULL in the batch");
1231 
1232   // a couple of useful fields in sub_klass:
1233   int ss_offset = in_bytes(Klass::secondary_supers_offset());
1234   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1235   Address secondary_supers_addr(sub_klass, ss_offset);
1236   Address super_cache_addr(     sub_klass, sc_offset);
1237 
1238   BLOCK_COMMENT("check_klass_subtype_slow_path");
1239 
1240   // Do a linear scan of the secondary super-klass chain.
1241   // This code is rarely used, so simplicity is a virtue here.
1242   // The repne_scan instruction uses fixed registers, which we must spill.
1243   // Don't worry too much about pre-existing connections with the input regs.
1244 
1245   assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
1246   assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)
1247 
1248   RegSet pushed_registers;
1249   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1250   if (!IS_A_TEMP(r5))    pushed_registers += r5;
1251 
1252   if (super_klass != r0 || UseCompressedOops) {
1253     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1254   }
1255 
1256   push(pushed_registers, sp);
1257 
1258   // Get super_klass value into r0 (even if it was in r5 or r2).
1259   if (super_klass != r0) {
1260     mov(r0, super_klass);
1261   }
1262 
1263 #ifndef PRODUCT
1264   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1265   Address pst_counter_addr(rscratch2);
1266   ldr(rscratch1, pst_counter_addr);
1267   add(rscratch1, rscratch1, 1);
1268   str(rscratch1, pst_counter_addr);
1269 #endif //PRODUCT
1270 
1271   // We will consult the secondary-super array.
1272   ldr(r5, secondary_supers_addr);
1273   // Load the array length.
1274   ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1275   // Skip to start of data.
1276   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1277 
1278   cmp(sp, zr); // Clear Z flag; SP is never zero
1279   // Scan R2 words at [R5] for an occurrence of R0.
1280   // Set NZ/Z based on last compare.
1281   repne_scan(r5, r0, r2, rscratch1);
1282 
1283   // Unspill the temp. registers:
1284   pop(pushed_registers, sp);
1285 
1286   br(Assembler::NE, *L_failure);
1287 
1288   // Success.  Cache the super we found and proceed in triumph.
1289   str(super_klass, super_cache_addr);
1290 
1291   if (L_success != &L_fallthrough) {
1292     b(*L_success);
1293   }
1294 
1295 #undef IS_A_TEMP
1296 
1297   bind(L_fallthrough);
1298 }
1299 
1300 void MacroAssembler::clinit_barrier(Register klass, Register scratch, Label* L_fast_path, Label* L_slow_path) {
1301   assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");
1302   assert_different_registers(klass, rthread, scratch);
1303 
1304   Label L_fallthrough, L_tmp;
1305   if (L_fast_path == NULL) {
1306     L_fast_path = &L_fallthrough;
1307   } else if (L_slow_path == NULL) {
1308     L_slow_path = &L_fallthrough;
1309   }
1310   // Fast path check: class is fully initialized
1311   ldrb(scratch, Address(klass, InstanceKlass::init_state_offset()));
1312   subs(zr, scratch, InstanceKlass::fully_initialized);
1313   br(Assembler::EQ, *L_fast_path);
1314 
1315   // Fast path check: current thread is initializer thread
1316   ldr(scratch, Address(klass, InstanceKlass::init_thread_offset()));
1317   cmp(rthread, scratch);
1318 
1319   if (L_slow_path == &L_fallthrough) {
1320     br(Assembler::EQ, *L_fast_path);
1321     bind(*L_slow_path);
1322   } else if (L_fast_path == &L_fallthrough) {
1323     br(Assembler::NE, *L_slow_path);
1324     bind(*L_fast_path);
1325   } else {
1326     Unimplemented();
1327   }
1328 }
1329 
1330 void MacroAssembler::verify_oop(Register reg, const char* s) {
1331   if (!VerifyOops) return;
1332 
1333   // Pass register number to verify_oop_subroutine
1334   const char* b = NULL;
1335   {
1336     ResourceMark rm;
1337     stringStream ss;
1338     ss.print("verify_oop: %s: %s", reg->name(), s);
1339     b = code_string(ss.as_string());
1340   }
1341   BLOCK_COMMENT("verify_oop {");
1342 
1343   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1344   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1345 
1346   mov(r0, reg);
1347   mov(rscratch1, (address)b);
1348 
1349   // call indirectly to solve generation ordering problem
1350   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1351   ldr(rscratch2, Address(rscratch2));
1352   blr(rscratch2);
1353 
1354   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1355   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1356 
1357   BLOCK_COMMENT("} verify_oop");
1358 }
1359 
1360 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1361   if (!VerifyOops) return;
1362 
1363   const char* b = NULL;
1364   {
1365     ResourceMark rm;
1366     stringStream ss;
1367     ss.print("verify_oop_addr: %s", s);
1368     b = code_string(ss.as_string());
1369   }
1370   BLOCK_COMMENT("verify_oop_addr {");
1371 
1372   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1373   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1374 
1375   // addr may contain sp so we will have to adjust it based on the
1376   // pushes that we just did.
1377   if (addr.uses(sp)) {
1378     lea(r0, addr);
1379     ldr(r0, Address(r0, 4 * wordSize));
1380   } else {
1381     ldr(r0, addr);
1382   }
1383   mov(rscratch1, (address)b);
1384 
1385   // call indirectly to solve generation ordering problem
1386   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1387   ldr(rscratch2, Address(rscratch2));
1388   blr(rscratch2);
1389 
1390   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1391   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1392 
1393   BLOCK_COMMENT("} verify_oop_addr");
1394 }
1395 
1396 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1397                                          int extra_slot_offset) {
1398   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1399   int stackElementSize = Interpreter::stackElementSize;
1400   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1401 #ifdef ASSERT
1402   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1403   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1404 #endif
1405   if (arg_slot.is_constant()) {
1406     return Address(esp, arg_slot.as_constant() * stackElementSize
1407                    + offset);
1408   } else {
1409     add(rscratch1, esp, arg_slot.as_register(),
1410         ext::uxtx, exact_log2(stackElementSize));
1411     return Address(rscratch1, offset);
1412   }
1413 }
1414 
1415 void MacroAssembler::call_VM_leaf_base(address entry_point,
1416                                        int number_of_arguments,
1417                                        Label *retaddr) {
1418   Label E, L;
1419 
1420   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1421 
1422   mov(rscratch1, entry_point);
1423   blr(rscratch1);
1424   if (retaddr)
1425     bind(*retaddr);
1426 
1427   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1428   maybe_isb();
1429 }
1430 
1431 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1432   call_VM_leaf_base(entry_point, number_of_arguments);
1433 }
1434 
1435 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1436   pass_arg0(this, arg_0);
1437   call_VM_leaf_base(entry_point, 1);
1438 }
1439 
1440 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1441   pass_arg0(this, arg_0);
1442   pass_arg1(this, arg_1);
1443   call_VM_leaf_base(entry_point, 2);
1444 }
1445 
1446 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1447                                   Register arg_1, Register arg_2) {
1448   pass_arg0(this, arg_0);
1449   pass_arg1(this, arg_1);
1450   pass_arg2(this, arg_2);
1451   call_VM_leaf_base(entry_point, 3);
1452 }
1453 
1454 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1455   pass_arg0(this, arg_0);
1456   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1457 }
1458 
1459 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1460 
1461   assert(arg_0 != c_rarg1, "smashed arg");
1462   pass_arg1(this, arg_1);
1463   pass_arg0(this, arg_0);
1464   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1465 }
1466 
1467 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1468   assert(arg_0 != c_rarg2, "smashed arg");
1469   assert(arg_1 != c_rarg2, "smashed arg");
1470   pass_arg2(this, arg_2);
1471   assert(arg_0 != c_rarg1, "smashed arg");
1472   pass_arg1(this, arg_1);
1473   pass_arg0(this, arg_0);
1474   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1475 }
1476 
1477 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1478   assert(arg_0 != c_rarg3, "smashed arg");
1479   assert(arg_1 != c_rarg3, "smashed arg");
1480   assert(arg_2 != c_rarg3, "smashed arg");
1481   pass_arg3(this, arg_3);
1482   assert(arg_0 != c_rarg2, "smashed arg");
1483   assert(arg_1 != c_rarg2, "smashed arg");
1484   pass_arg2(this, arg_2);
1485   assert(arg_0 != c_rarg1, "smashed arg");
1486   pass_arg1(this, arg_1);
1487   pass_arg0(this, arg_0);
1488   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1489 }
1490 
1491 void MacroAssembler::null_check(Register reg, int offset) {
1492   if (needs_explicit_null_check(offset)) {
    // provoke OS NULL exception if reg == NULL by
1494     // accessing M[reg] w/o changing any registers
1495     // NOTE: this is plenty to provoke a segv
1496     ldr(zr, Address(reg));
1497   } else {
1498     // nothing to do, (later) access of M[reg + offset]
    // will provoke OS NULL exception if reg == NULL
1500   }
1501 }
1502 
1503 // MacroAssembler protected routines needed to implement
1504 // public methods
1505 
1506 void MacroAssembler::mov(Register r, Address dest) {
1507   code_section()->relocate(pc(), dest.rspec());
1508   u_int64_t imm64 = (u_int64_t)dest.target();
1509   movptr(r, imm64);
1510 }
1511 
1512 // Move a constant pointer into r.  In AArch64 mode the virtual
1513 // address space is 48 bits in size, so we only need three
1514 // instructions to create a patchable instruction sequence that can
1515 // reach anywhere.
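// For example (illustrative), movptr(r0, 0x123456789ABCul) expands to:
//   movz r0, #0x9abc
//   movk r0, #0x5678, lsl #16
//   movk r0, #0x1234, lsl #32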
1516 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1517 #ifndef PRODUCT
1518   {
1519     char buffer[64];
1520     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1521     block_comment(buffer);
1522   }
1523 #endif
1524   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1525   movz(r, imm64 & 0xffff);
1526   imm64 >>= 16;
1527   movk(r, imm64 & 0xffff, 16);
1528   imm64 >>= 16;
1529   movk(r, imm64 & 0xffff, 32);
1530 }
1531 
1532 // Macro to mov replicated immediate to vector register.
1533 //  Vd will get the following values for different arrangements in T
1534 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1535 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1536 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1537 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1538 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1539 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1540 //   T1D/T2D: invalid
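//  Strategy (as implemented below): if the immediate has no more non-zero
//  bytes than its bitwise complement, build it with MOVI and then ORR in the
//  remaining bytes; otherwise work on the complemented value with MVNI and
//  BIC, one byte (at the appropriate LSL) at a time.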
1541 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1542   assert(T != T1D && T != T2D, "invalid arrangement");
1543   if (T == T8B || T == T16B) {
1544     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1545     movi(Vd, T, imm32 & 0xff, 0);
1546     return;
1547   }
1548   u_int32_t nimm32 = ~imm32;
1549   if (T == T4H || T == T8H) {
1550     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1551     imm32 &= 0xffff;
1552     nimm32 &= 0xffff;
1553   }
1554   u_int32_t x = imm32;
1555   int movi_cnt = 0;
1556   int movn_cnt = 0;
1557   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1558   x = nimm32;
1559   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1560   if (movn_cnt < movi_cnt) imm32 = nimm32;
1561   unsigned lsl = 0;
1562   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1563   if (movn_cnt < movi_cnt)
1564     mvni(Vd, T, imm32 & 0xff, lsl);
1565   else
1566     movi(Vd, T, imm32 & 0xff, lsl);
1567   imm32 >>= 8; lsl += 8;
1568   while (imm32) {
1569     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1570     if (movn_cnt < movi_cnt)
1571       bici(Vd, T, imm32 & 0xff, lsl);
1572     else
1573       orri(Vd, T, imm32 & 0xff, lsl);
1574     lsl += 8; imm32 >>= 8;
1575   }
1576 }
1577 
1578 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1579 {
1580 #ifndef PRODUCT
1581   {
1582     char buffer[64];
1583     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1584     block_comment(buffer);
1585   }
1586 #endif
1587   if (operand_valid_for_logical_immediate(false, imm64)) {
1588     orr(dst, zr, imm64);
1589   } else {
1590     // we can use a combination of MOVZ or MOVN with
1591     // MOVK to build up the constant
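    // e.g. (illustrative) imm64 == 0x00000000FFFF1234 has halfwords
    // {0x1234, 0xffff, 0, 0}, so zero_count == 2 and the code below emits
    // movz dst, #0x1234 followed by movk dst, #0xffff, lsl #16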
1592     u_int64_t imm_h[4];
1593     int zero_count = 0;
1594     int neg_count = 0;
1595     int i;
1596     for (i = 0; i < 4; i++) {
1597       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1598       if (imm_h[i] == 0) {
1599         zero_count++;
1600       } else if (imm_h[i] == 0xffffL) {
1601         neg_count++;
1602       }
1603     }
1604     if (zero_count == 4) {
1605       // one MOVZ will do
1606       movz(dst, 0);
1607     } else if (neg_count == 4) {
1608       // one MOVN will do
1609       movn(dst, 0);
1610     } else if (zero_count == 3) {
1611       for (i = 0; i < 4; i++) {
1612         if (imm_h[i] != 0L) {
1613           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1614           break;
1615         }
1616       }
1617     } else if (neg_count == 3) {
1618       // one MOVN will do
1619       for (int i = 0; i < 4; i++) {
1620         if (imm_h[i] != 0xffffL) {
1621           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1622           break;
1623         }
1624       }
1625     } else if (zero_count == 2) {
1626       // one MOVZ and one MOVK will do
1627       for (i = 0; i < 3; i++) {
1628         if (imm_h[i] != 0L) {
1629           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1630           i++;
1631           break;
1632         }
1633       }
1634       for (;i < 4; i++) {
1635         if (imm_h[i] != 0L) {
1636           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1637         }
1638       }
1639     } else if (neg_count == 2) {
1640       // one MOVN and one MOVK will do
1641       for (i = 0; i < 4; i++) {
1642         if (imm_h[i] != 0xffffL) {
1643           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1644           i++;
1645           break;
1646         }
1647       }
1648       for (;i < 4; i++) {
1649         if (imm_h[i] != 0xffffL) {
1650           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1651         }
1652       }
1653     } else if (zero_count == 1) {
1654       // one MOVZ and two MOVKs will do
1655       for (i = 0; i < 4; i++) {
1656         if (imm_h[i] != 0L) {
1657           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1658           i++;
1659           break;
1660         }
1661       }
1662       for (;i < 4; i++) {
1663         if (imm_h[i] != 0x0L) {
1664           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1665         }
1666       }
1667     } else if (neg_count == 1) {
1668       // one MOVN and two MOVKs will do
1669       for (i = 0; i < 4; i++) {
1670         if (imm_h[i] != 0xffffL) {
1671           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1672           i++;
1673           break;
1674         }
1675       }
1676       for (;i < 4; i++) {
1677         if (imm_h[i] != 0xffffL) {
1678           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1679         }
1680       }
1681     } else {
1682       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1683       movz(dst, (u_int32_t)imm_h[0], 0);
1684       for (i = 1; i < 4; i++) {
1685         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1686       }
1687     }
1688   }
1689 }
1690 
1691 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1692 {
1693 #ifndef PRODUCT
1694     {
1695       char buffer[64];
1696       snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1697       block_comment(buffer);
1698     }
1699 #endif
1700   if (operand_valid_for_logical_immediate(true, imm32)) {
1701     orrw(dst, zr, imm32);
1702   } else {
    // we can use MOVZ or MOVN, possibly followed by one MOVK, to build
    // up the constant
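    // e.g. (illustrative) imm32 == 0xFFFF1234 takes the imm_h[1] == 0xffff
    // branch below: movnw(dst, 0x1234 ^ 0xffff, 0), i.e. a single MOVN that
    // writes ~0x0000edcb == 0xFFFF1234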
1705     u_int32_t imm_h[2];
1706     imm_h[0] = imm32 & 0xffff;
1707     imm_h[1] = ((imm32 >> 16) & 0xffff);
1708     if (imm_h[0] == 0) {
1709       movzw(dst, imm_h[1], 16);
1710     } else if (imm_h[0] == 0xffff) {
1711       movnw(dst, imm_h[1] ^ 0xffff, 16);
1712     } else if (imm_h[1] == 0) {
1713       movzw(dst, imm_h[0], 0);
1714     } else if (imm_h[1] == 0xffff) {
1715       movnw(dst, imm_h[0] ^ 0xffff, 0);
1716     } else {
1717       // use a MOVZ and MOVK (makes it easier to debug)
1718       movzw(dst, imm_h[0], 0);
1719       movkw(dst, imm_h[1], 16);
1720     }
1721   }
1722 }
1723 
1724 // Form an address from base + offset in Rd.  Rd may or may
1725 // not actually be used: you must use the Address that is returned.
1726 // It is up to you to ensure that the shift provided matches the size
1727 // of your data.
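// A large offset that cannot be encoded directly is, where possible, split
// below into an ADD of its upper bits plus a small immediate on the returned
// Address; negative or misaligned offsets fall back to MOV + ADD.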
1728 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1729   if (Address::offset_ok_for_immed(byte_offset, shift))
1730     // It fits; no need for any heroics
1731     return Address(base, byte_offset);
1732 
1733   // Don't do anything clever with negative or misaligned offsets
1734   unsigned mask = (1 << shift) - 1;
1735   if (byte_offset < 0 || byte_offset & mask) {
1736     mov(Rd, byte_offset);
1737     add(Rd, base, Rd);
1738     return Address(Rd);
1739   }
1740 
1741   // See if we can do this with two 12-bit offsets
1742   {
1743     unsigned long word_offset = byte_offset >> shift;
1744     unsigned long masked_offset = word_offset & 0xfff000;
1745     if (Address::offset_ok_for_immed(word_offset - masked_offset, 0)
1746         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1747       add(Rd, base, masked_offset << shift);
1748       word_offset -= masked_offset;
1749       return Address(Rd, word_offset << shift);
1750     }
1751   }
1752 
1753   // Do it the hard way
1754   mov(Rd, byte_offset);
1755   add(Rd, base, Rd);
1756   return Address(Rd);
1757 }
1758 
1759 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1760   if (UseLSE) {
1761     mov(tmp, 1);
1762     ldadd(Assembler::word, tmp, zr, counter_addr);
1763     return;
1764   }
1765   Label retry_load;
1766   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1767     prfm(Address(counter_addr), PSTL1STRM);
1768   bind(retry_load);
1769   // flush and load exclusive from the memory location
1770   ldxrw(tmp, counter_addr);
1771   addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp2 will be zero
1773   stxrw(tmp2, tmp, counter_addr);
1774   cbnzw(tmp2, retry_load);
1775 }
1776 
1777 
1778 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1779                                     bool want_remainder, Register scratch)
1780 {
1781   // Full implementation of Java idiv and irem.  The function
1782   // returns the (pc) offset of the div instruction - may be needed
1783   // for implicit exceptions.
1784   //
1785   // constraint : ra/rb =/= scratch
1786   //         normal case
1787   //
1788   // input : ra: dividend
1789   //         rb: divisor
1790   //
1791   // result: either
1792   //         quotient  (= ra idiv rb)
1793   //         remainder (= ra irem rb)
1794 
1795   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1796 
1797   int idivl_offset = offset();
1798   if (! want_remainder) {
1799     sdivw(result, ra, rb);
1800   } else {
1801     sdivw(scratch, ra, rb);
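    // remainder = dividend - (dividend / divisor) * divisor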
1802     Assembler::msubw(result, scratch, rb, ra);
1803   }
1804 
1805   return idivl_offset;
1806 }
1807 
1808 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1809                                     bool want_remainder, Register scratch)
1810 {
1811   // Full implementation of Java ldiv and lrem.  The function
1812   // returns the (pc) offset of the div instruction - may be needed
1813   // for implicit exceptions.
1814   //
1815   // constraint : ra/rb =/= scratch
1816   //         normal case
1817   //
1818   // input : ra: dividend
1819   //         rb: divisor
1820   //
1821   // result: either
1822   //         quotient  (= ra idiv rb)
1823   //         remainder (= ra irem rb)
1824 
1825   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1826 
1827   int idivq_offset = offset();
1828   if (! want_remainder) {
1829     sdiv(result, ra, rb);
1830   } else {
1831     sdiv(scratch, ra, rb);
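    // remainder = dividend - (dividend / divisor) * divisor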
1832     Assembler::msub(result, scratch, rb, ra);
1833   }
1834 
1835   return idivq_offset;
1836 }
1837 
1838 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1839   address prev = pc() - NativeMembar::instruction_size;
1840   address last = code()->last_insn();
1841   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1842     NativeMembar *bar = NativeMembar_at(prev);
1843     // We are merging two memory barrier instructions.  On AArch64 we
1844     // can do this simply by ORing them together.
1845     bar->set_kind(bar->get_kind() | order_constraint);
1846     BLOCK_COMMENT("merged membar");
1847   } else {
1848     code()->set_last_insn(pc());
1849     dmb(Assembler::barrier(order_constraint));
1850   }
1851 }
1852 
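// Attempt to merge the current load/store with the immediately preceding one
// into a single ldp/stp.  Returns true if the merge happened (the previous
// instruction has been rewritten), false if the caller should emit the plain
// load/store itself.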
1853 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1854   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1855     merge_ldst(rt, adr, size_in_bytes, is_store);
1856     code()->clear_last_insn();
1857     return true;
1858   } else {
1859     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
1860     const unsigned mask = size_in_bytes - 1;
1861     if (adr.getMode() == Address::base_plus_offset &&
1862         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1863       code()->set_last_insn(pc());
1864     }
1865     return false;
1866   }
1867 }
1868 
1869 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1870   // We always try to merge two adjacent loads into one ldp.
1871   if (!try_merge_ldst(Rx, adr, 8, false)) {
1872     Assembler::ldr(Rx, adr);
1873   }
1874 }
1875 
1876 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1877   // We always try to merge two adjacent loads into one ldp.
1878   if (!try_merge_ldst(Rw, adr, 4, false)) {
1879     Assembler::ldrw(Rw, adr);
1880   }
1881 }
1882 
1883 void MacroAssembler::str(Register Rx, const Address &adr) {
1884   // We always try to merge two adjacent stores into one stp.
1885   if (!try_merge_ldst(Rx, adr, 8, true)) {
1886     Assembler::str(Rx, adr);
1887   }
1888 }
1889 
1890 void MacroAssembler::strw(Register Rw, const Address &adr) {
1891   // We always try to merge two adjacent stores into one stp.
1892   if (!try_merge_ldst(Rw, adr, 4, true)) {
1893     Assembler::strw(Rw, adr);
1894   }
1895 }
1896 
1897 // MacroAssembler routines found actually to be needed
1898 
1899 void MacroAssembler::push(Register src)
1900 {
1901   str(src, Address(pre(esp, -1 * wordSize)));
1902 }
1903 
1904 void MacroAssembler::pop(Register dst)
1905 {
1906   ldr(dst, Address(post(esp, 1 * wordSize)));
1907 }
1908 
1909 // Note: load_unsigned_short used to be called load_unsigned_word.
1910 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1911   int off = offset();
1912   ldrh(dst, src);
1913   return off;
1914 }
1915 
1916 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1917   int off = offset();
1918   ldrb(dst, src);
1919   return off;
1920 }
1921 
1922 int MacroAssembler::load_signed_short(Register dst, Address src) {
1923   int off = offset();
1924   ldrsh(dst, src);
1925   return off;
1926 }
1927 
1928 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1929   int off = offset();
1930   ldrsb(dst, src);
1931   return off;
1932 }
1933 
1934 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1935   int off = offset();
1936   ldrshw(dst, src);
1937   return off;
1938 }
1939 
1940 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1941   int off = offset();
1942   ldrsbw(dst, src);
1943   return off;
1944 }
1945 
1946 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1947   switch (size_in_bytes) {
1948   case  8:  ldr(dst, src); break;
1949   case  4:  ldrw(dst, src); break;
1950   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1951   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1952   default:  ShouldNotReachHere();
1953   }
1954 }
1955 
1956 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1957   switch (size_in_bytes) {
1958   case  8:  str(src, dst); break;
1959   case  4:  strw(src, dst); break;
1960   case  2:  strh(src, dst); break;
1961   case  1:  strb(src, dst); break;
1962   default:  ShouldNotReachHere();
1963   }
1964 }
1965 
1966 void MacroAssembler::decrementw(Register reg, int value)
1967 {
1968   if (value < 0)  { incrementw(reg, -value);      return; }
1969   if (value == 0) {                               return; }
1970   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1971   /* else */ {
1972     guarantee(reg != rscratch2, "invalid dst for register decrement");
1973     movw(rscratch2, (unsigned)value);
1974     subw(reg, reg, rscratch2);
1975   }
1976 }
1977 
1978 void MacroAssembler::decrement(Register reg, int value)
1979 {
1980   if (value < 0)  { increment(reg, -value);      return; }
1981   if (value == 0) {                              return; }
1982   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1983   /* else */ {
1984     assert(reg != rscratch2, "invalid dst for register decrement");
1985     mov(rscratch2, (unsigned long)value);
1986     sub(reg, reg, rscratch2);
1987   }
1988 }
1989 
1990 void MacroAssembler::decrementw(Address dst, int value)
1991 {
1992   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1993   if (dst.getMode() == Address::literal) {
1994     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1995     lea(rscratch2, dst);
1996     dst = Address(rscratch2);
1997   }
1998   ldrw(rscratch1, dst);
1999   decrementw(rscratch1, value);
2000   strw(rscratch1, dst);
2001 }
2002 
2003 void MacroAssembler::decrement(Address dst, int value)
2004 {
2005   assert(!dst.uses(rscratch1), "invalid address for decrement");
2006   if (dst.getMode() == Address::literal) {
2007     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2008     lea(rscratch2, dst);
2009     dst = Address(rscratch2);
2010   }
2011   ldr(rscratch1, dst);
2012   decrement(rscratch1, value);
2013   str(rscratch1, dst);
2014 }
2015 
2016 void MacroAssembler::incrementw(Register reg, int value)
2017 {
2018   if (value < 0)  { decrementw(reg, -value);      return; }
2019   if (value == 0) {                               return; }
2020   if (value < (1 << 12)) { addw(reg, reg, value); return; }
2021   /* else */ {
2022     assert(reg != rscratch2, "invalid dst for register increment");
2023     movw(rscratch2, (unsigned)value);
2024     addw(reg, reg, rscratch2);
2025   }
2026 }
2027 
2028 void MacroAssembler::increment(Register reg, int value)
2029 {
2030   if (value < 0)  { decrement(reg, -value);      return; }
2031   if (value == 0) {                              return; }
2032   if (value < (1 << 12)) { add(reg, reg, value); return; }
2033   /* else */ {
2034     assert(reg != rscratch2, "invalid dst for register increment");
2035     movw(rscratch2, (unsigned)value);
2036     add(reg, reg, rscratch2);
2037   }
2038 }
2039 
2040 void MacroAssembler::incrementw(Address dst, int value)
2041 {
2042   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2043   if (dst.getMode() == Address::literal) {
2044     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2045     lea(rscratch2, dst);
2046     dst = Address(rscratch2);
2047   }
2048   ldrw(rscratch1, dst);
2049   incrementw(rscratch1, value);
2050   strw(rscratch1, dst);
2051 }
2052 
2053 void MacroAssembler::increment(Address dst, int value)
2054 {
2055   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2056   if (dst.getMode() == Address::literal) {
2057     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2058     lea(rscratch2, dst);
2059     dst = Address(rscratch2);
2060   }
2061   ldr(rscratch1, dst);
2062   increment(rscratch1, value);
2063   str(rscratch1, dst);
2064 }
2065 
2066 
2067 void MacroAssembler::pusha() {
2068   push(0x7fffffff, sp);
2069 }
2070 
2071 void MacroAssembler::popa() {
2072   pop(0x7fffffff, sp);
2073 }
2074 
2075 // Push lots of registers in the bit set supplied.  Don't push sp.
2076 // Return the number of words pushed
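// Registers are stored two at a time with stp; if an odd number is requested
// the list is padded with zr so that every store is a full pair.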
2077 int MacroAssembler::push(unsigned int bitset, Register stack) {
2078   int words_pushed = 0;
2079 
2080   // Scan bitset to accumulate register pairs
2081   unsigned char regs[32];
2082   int count = 0;
2083   for (int reg = 0; reg <= 30; reg++) {
2084     if (1 & bitset)
2085       regs[count++] = reg;
2086     bitset >>= 1;
2087   }
2088   regs[count++] = zr->encoding_nocheck();
  count &= ~1;  // Only push an even number of regs
2090 
2091   if (count) {
2092     stp(as_Register(regs[0]), as_Register(regs[1]),
2093        Address(pre(stack, -count * wordSize)));
2094     words_pushed += 2;
2095   }
2096   for (int i = 2; i < count; i += 2) {
2097     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2098        Address(stack, i * wordSize));
2099     words_pushed += 2;
2100   }
2101 
2102   assert(words_pushed == count, "oops, pushed != count");
2103 
2104   return count;
2105 }
2106 
2107 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2108   int words_pushed = 0;
2109 
2110   // Scan bitset to accumulate register pairs
2111   unsigned char regs[32];
2112   int count = 0;
2113   for (int reg = 0; reg <= 30; reg++) {
2114     if (1 & bitset)
2115       regs[count++] = reg;
2116     bitset >>= 1;
2117   }
2118   regs[count++] = zr->encoding_nocheck();
2119   count &= ~1;
2120 
2121   for (int i = 2; i < count; i += 2) {
2122     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2123        Address(stack, i * wordSize));
2124     words_pushed += 2;
2125   }
2126   if (count) {
2127     ldp(as_Register(regs[0]), as_Register(regs[1]),
2128        Address(post(stack, count * wordSize)));
2129     words_pushed += 2;
2130   }
2131 
2132   assert(words_pushed == count, "oops, pushed != count");
2133 
2134   return count;
2135 }
2136 
// Push lots of FP/SIMD registers in the bit set supplied.
// Return the number of registers pushed
2139 int MacroAssembler::push_fp(unsigned int bitset, Register stack) {
2140   int words_pushed = 0;
2141 
2142   // Scan bitset to accumulate register pairs
2143   unsigned char regs[32];
2144   int count = 0;
2145   for (int reg = 0; reg <= 31; reg++) {
2146     if (1 & bitset)
2147       regs[count++] = reg;
2148     bitset >>= 1;
2149   }
2150   regs[count++] = zr->encoding_nocheck();
2151   count &= ~1;  // Only push an even number of regs
2152 
2153   // Always pushing full 128 bit registers.
2154   if (count) {
2155     stpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(pre(stack, -count * wordSize * 2)));
2156     words_pushed += 2;
2157   }
2158   for (int i = 2; i < count; i += 2) {
2159     stpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2));
2160     words_pushed += 2;
2161   }
2162 
2163   assert(words_pushed == count, "oops, pushed != count");
2164   return count;
2165 }
2166 
2167 int MacroAssembler::pop_fp(unsigned int bitset, Register stack) {
2168   int words_pushed = 0;
2169 
2170   // Scan bitset to accumulate register pairs
2171   unsigned char regs[32];
2172   int count = 0;
2173   for (int reg = 0; reg <= 31; reg++) {
2174     if (1 & bitset)
2175       regs[count++] = reg;
2176     bitset >>= 1;
2177   }
2178   regs[count++] = zr->encoding_nocheck();
2179   count &= ~1;
2180 
2181   for (int i = 2; i < count; i += 2) {
2182     ldpq(as_FloatRegister(regs[i]), as_FloatRegister(regs[i+1]), Address(stack, i * wordSize * 2));
2183     words_pushed += 2;
2184   }
2185   if (count) {
2186     ldpq(as_FloatRegister(regs[0]), as_FloatRegister(regs[1]), Address(post(stack, count * wordSize * 2)));
2187     words_pushed += 2;
2188   }
2189 
2190   assert(words_pushed == count, "oops, pushed != count");
2191 
2192   return count;
2193 }
2194 
2195 #ifdef ASSERT
2196 void MacroAssembler::verify_heapbase(const char* msg) {
2197 #if 0
2198   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2199   assert (Universe::heap() != NULL, "java heap should be initialized");
2200   if (CheckCompressedOops) {
2201     Label ok;
2202     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2203     cmpptr(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
2204     br(Assembler::EQ, ok);
2205     stop(msg);
2206     bind(ok);
2207     pop(1 << rscratch1->encoding(), sp);
2208   }
2209 #endif
2210 }
2211 #endif
2212 
2213 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2214   Label done, not_weak;
2215   cbz(value, done);           // Use NULL as-is.
2216 
2217   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
2218   tbz(r0, 0, not_weak);    // Test for jweak tag.
2219 
2220   // Resolve jweak.
2221   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2222                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2223   verify_oop(value);
2224   b(done);
2225 
2226   bind(not_weak);
2227   // Resolve (untagged) jobject.
2228   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2229   verify_oop(value);
2230   bind(done);
2231 }
2232 
2233 void MacroAssembler::stop(const char* msg) {
2234   address ip = pc();
2235   pusha();
2236   mov(c_rarg0, (address)msg);
2237   mov(c_rarg1, (address)ip);
2238   mov(c_rarg2, sp);
2239   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2240   blr(c_rarg3);
2241   hlt(0);
2242 }
2243 
2244 void MacroAssembler::warn(const char* msg) {
2245   pusha();
2246   mov(c_rarg0, (address)msg);
2247   mov(lr, CAST_FROM_FN_PTR(address, warning));
2248   blr(lr);
2249   popa();
2250 }
2251 
2252 void MacroAssembler::unimplemented(const char* what) {
2253   const char* buf = NULL;
2254   {
2255     ResourceMark rm;
2256     stringStream ss;
2257     ss.print("unimplemented: %s", what);
2258     buf = code_string(ss.as_string());
2259   }
2260   stop(buf);
2261 }
2262 
2263 // If a constant does not fit in an immediate field, generate some
2264 // number of MOV instructions and then perform the operation.
2265 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2266                                            add_sub_imm_insn insn1,
2267                                            add_sub_reg_insn insn2) {
2268   assert(Rd != zr, "Rd = zr and not setting flags?");
2269   if (operand_valid_for_add_sub_immediate((int)imm)) {
2270     (this->*insn1)(Rd, Rn, imm);
2271   } else {
2272     if (uabs(imm) < (1 << 24)) {
2273        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2274        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2275     } else {
2276        assert_different_registers(Rd, Rn);
2277        mov(Rd, (uint64_t)imm);
2278        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2279     }
2280   }
2281 }
2282 
// Separate version which sets the flags. Optimisations are more restricted
// because we must set the flags correctly.
2285 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2286                                            add_sub_imm_insn insn1,
2287                                            add_sub_reg_insn insn2) {
2288   if (operand_valid_for_add_sub_immediate((int)imm)) {
2289     (this->*insn1)(Rd, Rn, imm);
2290   } else {
2291     assert_different_registers(Rd, Rn);
2292     assert(Rd != zr, "overflow in immediate operand");
2293     mov(Rd, (uint64_t)imm);
2294     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2295   }
2296 }
2297 
2298 
2299 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2300   if (increment.is_register()) {
2301     add(Rd, Rn, increment.as_register());
2302   } else {
2303     add(Rd, Rn, increment.as_constant());
2304   }
2305 }
2306 
2307 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2308   if (increment.is_register()) {
2309     addw(Rd, Rn, increment.as_register());
2310   } else {
2311     addw(Rd, Rn, increment.as_constant());
2312   }
2313 }
2314 
2315 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2316   if (decrement.is_register()) {
2317     sub(Rd, Rn, decrement.as_register());
2318   } else {
2319     sub(Rd, Rn, decrement.as_constant());
2320   }
2321 }
2322 
2323 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2324   if (decrement.is_register()) {
2325     subw(Rd, Rn, decrement.as_register());
2326   } else {
2327     subw(Rd, Rn, decrement.as_constant());
2328   }
2329 }
2330 
2331 void MacroAssembler::reinit_heapbase()
2332 {
2333   if (UseCompressedOops) {
2334     if (Universe::is_fully_initialized()) {
2335       mov(rheapbase, CompressedOops::ptrs_base());
2336     } else {
2337       lea(rheapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
2338       ldr(rheapbase, Address(rheapbase));
2339     }
2340   }
2341 }
2342 
2343 // this simulates the behaviour of the x86 cmpxchg instruction using a
2344 // load linked/store conditional pair. we use the acquire/release
2345 // versions of these instructions so that we flush pending writes as
2346 // per Java semantics.
2347 
2348 // n.b the x86 version assumes the old value to be compared against is
2349 // in rax and updates rax with the value located in memory if the
2350 // cmpxchg fails. we supply a register for the old value explicitly
2351 
2352 // the aarch64 load linked/store conditional instructions do not
2353 // accept an offset. so, unlike x86, we must provide a plain register
2354 // to identify the memory word to be compared/exchanged rather than a
2355 // register+offset Address.
2356 
2357 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2358                                 Label &succeed, Label *fail) {
2359   // oldv holds comparison value
2360   // newv holds value to write in exchange
2361   // addr identifies memory word to compare against/update
2362   if (UseLSE) {
2363     mov(tmp, oldv);
2364     casal(Assembler::xword, oldv, newv, addr);
2365     cmp(tmp, oldv);
2366     br(Assembler::EQ, succeed);
2367     membar(AnyAny);
2368   } else {
2369     Label retry_load, nope;
2370     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2371       prfm(Address(addr), PSTL1STRM);
2372     bind(retry_load);
2373     // flush and load exclusive from the memory location
2374     // and fail if it is not what we expect
2375     ldaxr(tmp, addr);
2376     cmp(tmp, oldv);
2377     br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
2379     stlxr(tmp, newv, addr);
2380     cbzw(tmp, succeed);
2381     // retry so we only ever return after a load fails to compare
2382     // ensures we don't return a stale value after a failed write.
2383     b(retry_load);
2384     // if the memory word differs we return it in oldv and signal a fail
2385     bind(nope);
2386     membar(AnyAny);
2387     mov(oldv, tmp);
2388   }
2389   if (fail)
2390     b(*fail);
2391 }
2392 
2393 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2394                                         Label &succeed, Label *fail) {
2395   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2396   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2397 }
2398 
2399 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2400                                 Label &succeed, Label *fail) {
2401   // oldv holds comparison value
2402   // newv holds value to write in exchange
2403   // addr identifies memory word to compare against/update
2404   // tmp returns 0/1 for success/failure
2405   if (UseLSE) {
2406     mov(tmp, oldv);
2407     casal(Assembler::word, oldv, newv, addr);
2408     cmp(tmp, oldv);
2409     br(Assembler::EQ, succeed);
2410     membar(AnyAny);
2411   } else {
2412     Label retry_load, nope;
2413     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2414       prfm(Address(addr), PSTL1STRM);
2415     bind(retry_load);
2416     // flush and load exclusive from the memory location
2417     // and fail if it is not what we expect
2418     ldaxrw(tmp, addr);
2419     cmp(tmp, oldv);
2420     br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
2422     stlxrw(tmp, newv, addr);
2423     cbzw(tmp, succeed);
2424     // retry so we only ever return after a load fails to compare
2425     // ensures we don't return a stale value after a failed write.
2426     b(retry_load);
2427     // if the memory word differs we return it in oldv and signal a fail
2428     bind(nope);
2429     membar(AnyAny);
2430     mov(oldv, tmp);
2431   }
2432   if (fail)
2433     b(*fail);
2434 }
2435 
2436 // A generic CAS; success or failure is in the EQ flag.  A weak CAS
2437 // doesn't retry and may fail spuriously.  If the oldval is wanted,
// pass a register for the result, otherwise pass noreg.
2439 
2440 // Clobbers rscratch1
2441 void MacroAssembler::cmpxchg(Register addr, Register expected,
2442                              Register new_val,
2443                              enum operand_size size,
2444                              bool acquire, bool release,
2445                              bool weak,
2446                              Register result) {
2447   if (result == noreg)  result = rscratch1;
2448   BLOCK_COMMENT("cmpxchg {");
2449   if (UseLSE) {
2450     mov(result, expected);
2451     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2452     compare_eq(result, expected, size);
2453   } else {
2454     Label retry_load, done;
2455     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2456       prfm(Address(addr), PSTL1STRM);
2457     bind(retry_load);
2458     load_exclusive(result, addr, size, acquire);
2459     compare_eq(result, expected, size);
2460     br(Assembler::NE, done);
2461     store_exclusive(rscratch1, new_val, addr, size, release);
2462     if (weak) {
2463       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2464     } else {
2465       cbnzw(rscratch1, retry_load);
2466     }
2467     bind(done);
2468   }
2469   BLOCK_COMMENT("} cmpxchg");
2470 }
2471 
// A generic comparison. Only compares for equality; clobbers rscratch1 (for sub-word sizes).
2473 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2474   if (size == xword) {
2475     cmp(rm, rn);
2476   } else if (size == word) {
2477     cmpw(rm, rn);
2478   } else if (size == halfword) {
2479     eorw(rscratch1, rm, rn);
2480     ands(zr, rscratch1, 0xffff);
2481   } else if (size == byte) {
2482     eorw(rscratch1, rm, rn);
2483     ands(zr, rscratch1, 0xff);
2484   } else {
2485     ShouldNotReachHere();
2486   }
2487 }
2488 
2489 
2490 static bool different(Register a, RegisterOrConstant b, Register c) {
2491   if (b.is_constant())
2492     return a != c;
2493   else
2494     return a != b.as_register() && a != c && b.as_register() != c;
2495 }
2496 
2497 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2498 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2499   if (UseLSE) {                                                         \
2500     prev = prev->is_valid() ? prev : zr;                                \
2501     if (incr.is_register()) {                                           \
2502       AOP(sz, incr.as_register(), prev, addr);                          \
2503     } else {                                                            \
2504       mov(rscratch2, incr.as_constant());                               \
2505       AOP(sz, rscratch2, prev, addr);                                   \
2506     }                                                                   \
2507     return;                                                             \
2508   }                                                                     \
2509   Register result = rscratch2;                                          \
2510   if (prev->is_valid())                                                 \
2511     result = different(prev, incr, addr) ? prev : rscratch2;            \
2512                                                                         \
2513   Label retry_load;                                                     \
2514   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2515     prfm(Address(addr), PSTL1STRM);                                     \
2516   bind(retry_load);                                                     \
2517   LDXR(result, addr);                                                   \
2518   OP(rscratch1, result, incr);                                          \
2519   STXR(rscratch2, rscratch1, addr);                                     \
2520   cbnzw(rscratch2, retry_load);                                         \
2521   if (prev->is_valid() && prev != result) {                             \
2522     IOP(prev, rscratch1, incr);                                         \
2523   }                                                                     \
2524 }
2525 
2526 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2527 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2528 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2529 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2530 
2531 #undef ATOMIC_OP
2532 
2533 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2534 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2535   if (UseLSE) {                                                         \
2536     prev = prev->is_valid() ? prev : zr;                                \
2537     AOP(sz, newv, prev, addr);                                          \
2538     return;                                                             \
2539   }                                                                     \
2540   Register result = rscratch2;                                          \
2541   if (prev->is_valid())                                                 \
2542     result = different(prev, newv, addr) ? prev : rscratch2;            \
2543                                                                         \
2544   Label retry_load;                                                     \
2545   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2546     prfm(Address(addr), PSTL1STRM);                                     \
2547   bind(retry_load);                                                     \
2548   LDXR(result, addr);                                                   \
2549   STXR(rscratch1, newv, addr);                                          \
2550   cbnzw(rscratch1, retry_load);                                         \
2551   if (prev->is_valid() && prev != result)                               \
2552     mov(prev, result);                                                  \
2553 }
2554 
2555 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2556 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2557 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2558 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2559 
2560 #undef ATOMIC_XCHG
2561 
2562 #ifndef PRODUCT
2563 extern "C" void findpc(intptr_t x);
2564 #endif
2565 
2566 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2567 {
  // In order to get locks to work, we need to fake an in_VM state
2569   if (ShowMessageBoxOnError ) {
2570     JavaThread* thread = JavaThread::current();
2571     JavaThreadState saved_state = thread->thread_state();
2572     thread->set_thread_state(_thread_in_vm);
2573 #ifndef PRODUCT
2574     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2575       ttyLocker ttyl;
2576       BytecodeCounter::print();
2577     }
2578 #endif
2579     if (os::message_box(msg, "Execution stopped, print registers?")) {
2580       ttyLocker ttyl;
2581       tty->print_cr(" pc = 0x%016lx", pc);
2582 #ifndef PRODUCT
2583       tty->cr();
2584       findpc(pc);
2585       tty->cr();
2586 #endif
2587       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2588       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2589       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2590       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2591       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2592       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2593       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2594       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2595       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2596       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2597       tty->print_cr("r10 = 0x%016lx", regs[10]);
2598       tty->print_cr("r11 = 0x%016lx", regs[11]);
2599       tty->print_cr("r12 = 0x%016lx", regs[12]);
2600       tty->print_cr("r13 = 0x%016lx", regs[13]);
2601       tty->print_cr("r14 = 0x%016lx", regs[14]);
2602       tty->print_cr("r15 = 0x%016lx", regs[15]);
2603       tty->print_cr("r16 = 0x%016lx", regs[16]);
2604       tty->print_cr("r17 = 0x%016lx", regs[17]);
2605       tty->print_cr("r18 = 0x%016lx", regs[18]);
2606       tty->print_cr("r19 = 0x%016lx", regs[19]);
2607       tty->print_cr("r20 = 0x%016lx", regs[20]);
2608       tty->print_cr("r21 = 0x%016lx", regs[21]);
2609       tty->print_cr("r22 = 0x%016lx", regs[22]);
2610       tty->print_cr("r23 = 0x%016lx", regs[23]);
2611       tty->print_cr("r24 = 0x%016lx", regs[24]);
2612       tty->print_cr("r25 = 0x%016lx", regs[25]);
2613       tty->print_cr("r26 = 0x%016lx", regs[26]);
2614       tty->print_cr("r27 = 0x%016lx", regs[27]);
2615       tty->print_cr("r28 = 0x%016lx", regs[28]);
2616       tty->print_cr("r30 = 0x%016lx", regs[30]);
2617       tty->print_cr("r31 = 0x%016lx", regs[31]);
2618       BREAKPOINT;
2619     }
2620   }
2621   fatal("DEBUG MESSAGE: %s", msg);
2622 }
2623 
2624 void MacroAssembler::push_call_clobbered_registers() {
2625   int step = 4 * wordSize;
2626   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2627   sub(sp, sp, step);
2628   mov(rscratch1, -step);
2629   // Push v0-v7, v16-v31.
2630   for (int i = 31; i>= 4; i -= 4) {
2631     if (i <= v7->encoding() || i >= v16->encoding())
2632       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2633           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2634   }
2635   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2636       as_FloatRegister(3), T1D, Address(sp));
2637 }
2638 
2639 void MacroAssembler::pop_call_clobbered_registers() {
2640   for (int i = 0; i < 32; i += 4) {
2641     if (i <= v7->encoding() || i >= v16->encoding())
2642       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2643           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2644   }
2645 
2646   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2647 }
2648 
2649 void MacroAssembler::push_CPU_state(bool save_vectors) {
2650   int step = (save_vectors ? 8 : 4) * wordSize;
2651   push(0x3fffffff, sp);         // integer registers except lr & sp
2652   mov(rscratch1, -step);
2653   sub(sp, sp, step);
2654   for (int i = 28; i >= 4; i -= 4) {
2655     st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2656         as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2657   }
2658   st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2659 }
2660 
2661 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2662   int step = (restore_vectors ? 8 : 4) * wordSize;
2663   for (int i = 0; i <= 28; i += 4)
2664     ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2665         as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2666   pop(0x3fffffff, sp);         // integer registers except lr & sp
2667 }
2668 
2669 /**
2670  * Helpers for multiply_to_len().
2671  */
2672 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2673                                      Register src1, Register src2) {
2674   adds(dest_lo, dest_lo, src1);
2675   adc(dest_hi, dest_hi, zr);
2676   adds(dest_lo, dest_lo, src2);
2677   adc(final_dest_hi, dest_hi, zr);
2678 }
2679 
2680 // Generate an address from (r + r1 extend offset).  "size" is the
2681 // size of the operand.  The result may be in rscratch2.
2682 Address MacroAssembler::offsetted_address(Register r, Register r1,
2683                                           Address::extend ext, int offset, int size) {
2684   if (offset || (ext.shift() % size != 0)) {
2685     lea(rscratch2, Address(r, r1, ext));
2686     return Address(rscratch2, offset);
2687   } else {
2688     return Address(r, r1, ext);
2689   }
2690 }
2691 
2692 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2693 {
2694   assert(offset >= 0, "spill to negative address?");
2695   // Offset reachable ?
2696   //   Not aligned - 9 bits signed offset
2697   //   Aligned - 12 bits unsigned offset shifted
2698   Register base = sp;
2699   if ((offset & (size-1)) && offset >= (1<<8)) {
2700     add(tmp, base, offset & ((1<<12)-1));
2701     base = tmp;
2702     offset &= -1u<<12;
2703   }
2704 
2705   if (offset >= (1<<12) * size) {
2706     add(tmp, base, offset & (((1<<12)-1)<<12));
2707     base = tmp;
2708     offset &= ~(((1<<12)-1)<<12);
2709   }
2710 
2711   return Address(base, offset);
2712 }
2713 
2714 // Checks whether offset is aligned.
2715 // Returns true if it is, else false.
2716 bool MacroAssembler::merge_alignment_check(Register base,
2717                                            size_t size,
2718                                            long cur_offset,
2719                                            long prev_offset) const {
2720   if (AvoidUnalignedAccesses) {
2721     if (base == sp) {
      // Checks whether the low offset is aligned to a pair of registers.
2723       long pair_mask = size * 2 - 1;
2724       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2725       return (offset & pair_mask) == 0;
2726     } else { // If base is not sp, we can't guarantee the access is aligned.
2727       return false;
2728     }
2729   } else {
2730     long mask = size - 1;
2731     // Load/store pair instruction only supports element size aligned offset.
2732     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2733   }
2734 }
2735 
2736 // Checks whether current and previous loads/stores can be merged.
2737 // Returns true if it can be merged, else false.
2738 bool MacroAssembler::ldst_can_merge(Register rt,
2739                                     const Address &adr,
2740                                     size_t cur_size_in_bytes,
2741                                     bool is_store) const {
2742   address prev = pc() - NativeInstruction::instruction_size;
2743   address last = code()->last_insn();
2744 
2745   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2746     return false;
2747   }
2748 
2749   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2750     return false;
2751   }
2752 
2753   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2754   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2755 
2756   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2757   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2758 
2759   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2760     return false;
2761   }
2762 
2763   long max_offset = 63 * prev_size_in_bytes;
2764   long min_offset = -64 * prev_size_in_bytes;
2765 
2766   assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged.");
2767 
2768   // Only same base can be merged.
2769   if (adr.base() != prev_ldst->base()) {
2770     return false;
2771   }
2772 
2773   long cur_offset = adr.offset();
2774   long prev_offset = prev_ldst->offset();
2775   size_t diff = abs(cur_offset - prev_offset);
2776   if (diff != prev_size_in_bytes) {
2777     return false;
2778   }
2779 
2780   // The following cases cannot be merged:
2781   // ldr x2, [x2, #8]
2782   // ldr x3, [x2, #16]
2783   // or:
2784   // ldr x2, [x3, #8]
2785   // ldr x2, [x3, #16]
2786   // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2787   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2788     return false;
2789   }
2790 
2791   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2792   // Offset range must be in ldp/stp instruction's range.
2793   if (low_offset > max_offset || low_offset < min_offset) {
2794     return false;
2795   }
2796 
2797   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2798     return true;
2799   }
2800 
2801   return false;
2802 }
2803 
2804 // Merge current load/store with previous load/store into ldp/stp.
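     // An illustrative example of the transformation (not an exhaustive
     // description): if the previously emitted instruction was
     //   ldrw(r0, Address(sp, 8));
     // and we are now asked to emit
     //   ldrw(r1, Address(sp, 12));
     // and ldst_can_merge() has approved the pair, the previous instruction is
     // rewound and a single
     //   ldpw(r0, r1, Address(sp, 8));
     // is emitted in its place.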
2805 void MacroAssembler::merge_ldst(Register rt,
2806                                 const Address &adr,
2807                                 size_t cur_size_in_bytes,
2808                                 bool is_store) {
2809 
2810   assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "cur and prev must be able to be merged.");
2811 
2812   Register rt_low, rt_high;
2813   address prev = pc() - NativeInstruction::instruction_size;
2814   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2815 
2816   long offset;
2817 
2818   if (adr.offset() < prev_ldst->offset()) {
2819     offset = adr.offset();
2820     rt_low = rt;
2821     rt_high = prev_ldst->target();
2822   } else {
2823     offset = prev_ldst->offset();
2824     rt_low = prev_ldst->target();
2825     rt_high = rt;
2826   }
2827 
2828   Address adr_p = Address(prev_ldst->base(), offset);
2829   // Overwrite previous generated binary.
2830   code_section()->set_end(prev);
2831 
2832   const int sz = prev_ldst->size_in_bytes();
2833   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
2834   if (!is_store) {
2835     BLOCK_COMMENT("merged ldr pair");
2836     if (sz == 8) {
2837       ldp(rt_low, rt_high, adr_p);
2838     } else {
2839       ldpw(rt_low, rt_high, adr_p);
2840     }
2841   } else {
2842     BLOCK_COMMENT("merged str pair");
2843     if (sz == 8) {
2844       stp(rt_low, rt_high, adr_p);
2845     } else {
2846       stpw(rt_low, rt_high, adr_p);
2847     }
2848   }
2849 }
2850 
2851 /**
2852  * Multiply 64 bit by 64 bit first loop.
2853  */
2854 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2855                                            Register y, Register y_idx, Register z,
2856                                            Register carry, Register product,
2857                                            Register idx, Register kdx) {
2858   //
2859   //  jlong carry, x[], y[], z[];
2860   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2861   //    huge_128 product = y[idx] * x[xstart] + carry;
2862   //    z[kdx] = (jlong)product;
2863   //    carry  = (jlong)(product >>> 64);
2864   //  }
2865   //  z[xstart] = carry;
2866   //
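       // Note on the ror(reg, reg, 32) "endian" conversions below: the
       // BigInteger magnitude arrays are int[] with the most significant int
       // first, so an 8-byte load of two adjacent ints on this little-endian
       // target yields the two 32-bit halves swapped.  Rotating by 32 swaps
       // them back into a numerically correct 64-bit value, and the matching
       // rotate before a store restores the array's word order.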
2867 
2868   Label L_first_loop, L_first_loop_exit;
2869   Label L_one_x, L_one_y, L_multiply;
2870 
2871   subsw(xstart, xstart, 1);
2872   br(Assembler::MI, L_one_x);
2873 
2874   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2875   ldr(x_xstart, Address(rscratch1));
2876   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2877 
2878   bind(L_first_loop);
2879   subsw(idx, idx, 1);
2880   br(Assembler::MI, L_first_loop_exit);
2881   subsw(idx, idx, 1);
2882   br(Assembler::MI, L_one_y);
2883   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2884   ldr(y_idx, Address(rscratch1));
2885   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2886   bind(L_multiply);
2887 
2888   // AArch64 has a multiply-accumulate instruction that we can't use
2889   // here because it has no way to process carries, so we have to use
2890   // separate add and adc instructions.  Bah.
2891   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2892   mul(product, x_xstart, y_idx);
2893   adds(product, product, carry);
2894   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2895 
2896   subw(kdx, kdx, 2);
2897   ror(product, product, 32); // back to big-endian
2898   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2899 
2900   b(L_first_loop);
2901 
2902   bind(L_one_y);
2903   ldrw(y_idx, Address(y,  0));
2904   b(L_multiply);
2905 
2906   bind(L_one_x);
2907   ldrw(x_xstart, Address(x,  0));
2908   b(L_first_loop);
2909 
2910   bind(L_first_loop_exit);
2911 }
2912 
2913 /**
2914  * Multiply 128 bit by 128 bit. Unrolled inner loop.
2915  *
2916  */
2917 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2918                                              Register carry, Register carry2,
2919                                              Register idx, Register jdx,
2920                                              Register yz_idx1, Register yz_idx2,
2921                                              Register tmp, Register tmp3, Register tmp4,
2922                                              Register tmp6, Register product_hi) {
2923 
2924   //   jlong carry, x[], y[], z[];
2925   //   int kdx = ystart+1;
2926   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2927   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2928   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2929   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2930   //     carry  = (jlong)(tmp4 >>> 64);
2931   //     z[kdx+idx+1] = (jlong)tmp3;
2932   //     z[kdx+idx] = (jlong)tmp4;
2933   //   }
2934   //   idx += 2;
2935   //   if (idx > 0) {
2936   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2937   //     z[kdx+idx] = (jlong)yz_idx1;
2938   //     carry  = (jlong)(yz_idx1 >>> 64);
2939   //   }
2940   //
2941 
2942   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2943 
2944   lsrw(jdx, idx, 2);
2945 
2946   bind(L_third_loop);
2947 
2948   subsw(jdx, jdx, 1);
2949   br(Assembler::MI, L_third_loop_exit);
2950   subw(idx, idx, 4);
2951 
2952   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2953 
2954   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2955 
2956   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2957 
2958   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2959   ror(yz_idx2, yz_idx2, 32);
2960 
2961   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2962 
2963   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2964   umulh(tmp4, product_hi, yz_idx1);
2965 
2966   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2967   ror(rscratch2, rscratch2, 32);
2968 
2969   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2970   umulh(carry2, product_hi, yz_idx2);
2971 
2972   // propagate sum of both multiplications into carry:tmp4:tmp3
2973   adds(tmp3, tmp3, carry);
2974   adc(tmp4, tmp4, zr);
2975   adds(tmp3, tmp3, rscratch1);
2976   adcs(tmp4, tmp4, tmp);
2977   adc(carry, carry2, zr);
2978   adds(tmp4, tmp4, rscratch2);
2979   adc(carry, carry, zr);
2980 
2981   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2982   ror(tmp4, tmp4, 32);
2983   stp(tmp4, tmp3, Address(tmp6, 0));
2984 
2985   b(L_third_loop);
2986   bind (L_third_loop_exit);
2987 
2988   andw (idx, idx, 0x3);
2989   cbz(idx, L_post_third_loop_done);
2990 
2991   Label L_check_1;
2992   subsw(idx, idx, 2);
2993   br(Assembler::MI, L_check_1);
2994 
2995   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2996   ldr(yz_idx1, Address(rscratch1, 0));
2997   ror(yz_idx1, yz_idx1, 32);
2998   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2999   umulh(tmp4, product_hi, yz_idx1);
3000   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3001   ldr(yz_idx2, Address(rscratch1, 0));
3002   ror(yz_idx2, yz_idx2, 32);
3003 
3004   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
3005 
3006   ror(tmp3, tmp3, 32);
3007   str(tmp3, Address(rscratch1, 0));
3008 
3009   bind (L_check_1);
3010 
3011   andw (idx, idx, 0x1);
3012   subsw(idx, idx, 1);
3013   br(Assembler::MI, L_post_third_loop_done);
3014   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3015   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
3016   umulh(carry2, tmp4, product_hi);
3017   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3018 
3019   add2_with_carry(carry2, tmp3, tmp4, carry);
3020 
3021   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3022   extr(carry, carry2, tmp3, 32);
3023 
3024   bind(L_post_third_loop_done);
3025 }
3026 
3027 /**
3028  * Code for BigInteger::multiplyToLen() intrinsic.
3029  *
3030  * r0: x
3031  * r1: xlen
3032  * r2: y
3033  * r3: ylen
3034  * r4: z
3035  * r5: zlen
3036  * r10: tmp1
3037  * r11: tmp2
3038  * r12: tmp3
3039  * r13: tmp4
3040  * r14: tmp5
3041  * r15: tmp6
3042  * r16: tmp7
3043  *
3044  */
3045 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3046                                      Register z, Register zlen,
3047                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3048                                      Register tmp5, Register tmp6, Register product_hi) {
3049 
3050   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3051 
3052   const Register idx = tmp1;
3053   const Register kdx = tmp2;
3054   const Register xstart = tmp3;
3055 
3056   const Register y_idx = tmp4;
3057   const Register carry = tmp5;
3058   const Register product  = xlen;
3059   const Register x_xstart = zlen;  // reuse register
3060 
3061   // First Loop.
3062   //
3063   //  final static long LONG_MASK = 0xffffffffL;
3064   //  int xstart = xlen - 1;
3065   //  int ystart = ylen - 1;
3066   //  long carry = 0;
3067   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3068   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3069   //    z[kdx] = (int)product;
3070   //    carry = product >>> 32;
3071   //  }
3072   //  z[xstart] = (int)carry;
3073   //
3074 
3075   movw(idx, ylen);      // idx = ylen;
3076   movw(kdx, zlen);      // kdx = xlen+ylen;
3077   mov(carry, zr);       // carry = 0;
3078 
3079   Label L_done;
3080 
3081   movw(xstart, xlen);
3082   subsw(xstart, xstart, 1);
3083   br(Assembler::MI, L_done);
3084 
3085   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3086 
3087   Label L_second_loop;
3088   cbzw(kdx, L_second_loop);
3089 
3090   Label L_carry;
3091   subw(kdx, kdx, 1);
3092   cbzw(kdx, L_carry);
3093 
3094   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3095   lsr(carry, carry, 32);
3096   subw(kdx, kdx, 1);
3097 
3098   bind(L_carry);
3099   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3100 
3101   // Second and third (nested) loops.
3102   //
3103   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3104   //   carry = 0;
3105   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3106   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3107   //                    (z[k] & LONG_MASK) + carry;
3108   //     z[k] = (int)product;
3109   //     carry = product >>> 32;
3110   //   }
3111   //   z[i] = (int)carry;
3112   // }
3113   //
3114   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3115 
3116   const Register jdx = tmp1;
3117 
3118   bind(L_second_loop);
3119   mov(carry, zr);                // carry = 0;
3120   movw(jdx, ylen);               // j = ystart+1
3121 
3122   subsw(xstart, xstart, 1);      // i = xstart-1;
3123   br(Assembler::MI, L_done);
3124 
3125   str(z, Address(pre(sp, -4 * wordSize)));
3126 
3127   Label L_last_x;
3128   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3129   subsw(xstart, xstart, 1);       // i = xstart-1;
3130   br(Assembler::MI, L_last_x);
3131 
3132   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3133   ldr(product_hi, Address(rscratch1));
3134   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3135 
3136   Label L_third_loop_prologue;
3137   bind(L_third_loop_prologue);
3138 
3139   str(ylen, Address(sp, wordSize));
3140   stp(x, xstart, Address(sp, 2 * wordSize));
3141   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3142                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3143   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3144   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3145 
3146   addw(tmp3, xlen, 1);
3147   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3148   subsw(tmp3, tmp3, 1);
3149   br(Assembler::MI, L_done);
3150 
3151   lsr(carry, carry, 32);
3152   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3153   b(L_second_loop);
3154 
3155   // The following, infrequently executed code is placed outside the loops.
3156   bind(L_last_x);
3157   ldrw(product_hi, Address(x,  0));
3158   b(L_third_loop_prologue);
3159 
3160   bind(L_done);
3161 }
3162 
3163 // Code for BigInteger::mulAdd intrinsic
3164 // out     = r0
3165 // in      = r1
3166 // offset  = r2  (already out.length-offset)
3167 // len     = r3
3168 // k       = r4
3169 //
3170 // pseudo code from java implementation:
3171 // carry = 0;
3172 // offset = out.length-offset - 1;
3173 // for (int j=len-1; j >= 0; j--) {
3174 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3175 //     out[offset--] = (int)product;
3176 //     carry = product >>> 32;
3177 // }
3178 // return (int)carry;
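     // Implementation note: in the loop below, madd(rscratch1, rscratch1, k, out)
     // computes in[j] * k + carry in a single instruction.  Both multiplicands
     // were loaded with 32-bit loads and are therefore zero-extended (k is
     // assumed to arrive zero-extended from the stub), so the 64-bit result
     // cannot overflow: (2^32-1)^2 + 2*(2^32-1) < 2^64.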
3179 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3180       Register len, Register k) {
3181     Label LOOP, END;
3182     // pre-loop
3183     cmp(len, zr); // cmp, not cbz/cbnz: the condition is used twice => fewer branches
3184     csel(out, zr, out, Assembler::EQ);
3185     br(Assembler::EQ, END);
3186     add(in, in, len, LSL, 2); // in[j+1] address
3187     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3188     mov(out, zr); // used to keep carry now
3189     BIND(LOOP);
3190     ldrw(rscratch1, Address(pre(in, -4)));
3191     madd(rscratch1, rscratch1, k, out);
3192     ldrw(rscratch2, Address(pre(offset, -4)));
3193     add(rscratch1, rscratch1, rscratch2);
3194     strw(rscratch1, Address(offset));
3195     lsr(out, rscratch1, 32);
3196     subs(len, len, 1);
3197     br(Assembler::NE, LOOP);
3198     BIND(END);
3199 }
3200 
3201 /**
3202  * Emits code to update CRC-32 with a byte value according to constants in table
3203  *
3204  * @param [in,out]crc   Register containing the crc.
3205  * @param [in]val       Register containing the byte to fold into the CRC.
3206  * @param [in]table     Register containing the table of crc constants.
3207  *
3208  * uint32_t crc;
3209  * val = crc_table[(val ^ crc) & 0xFF];
3210  * crc = val ^ (crc >> 8);
3211  *
3212  */
3213 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3214   eor(val, val, crc);
3215   andr(val, val, 0xff);
3216   ldrw(val, Address(table, val, Address::lsl(2)));
3217   eor(crc, val, crc, Assembler::LSR, 8);
3218 }
3219 
3220 /**
3221  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3222  *
3223  * @param [in,out]crc   Register containing the crc.
3224  * @param [in]v         Register containing the 32-bit value to fold into the CRC.
3225  * @param [in]table0    Register containing table 0 of crc constants.
3226  * @param [in]table1    Register containing table 1 of crc constants.
3227  * @param [in]table2    Register containing table 2 of crc constants.
3228  * @param [in]table3    Register containing table 3 of crc constants.
3229  *
3230  * uint32_t crc;
3231  *   v = crc ^ v
3232  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3233  *
3234  */
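     // The 'upper' flag selects which half of a 64-bit load is folded in: with
     // upper == false the eor below uses the low 32 bits of v as-is (LSL #0);
     // with upper == true it uses v >> 32 (LSR #32), so one 64-bit load can
     // feed two calls to this routine.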
3235 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3236         Register table0, Register table1, Register table2, Register table3,
3237         bool upper) {
3238   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3239   uxtb(tmp, v);
3240   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3241   ubfx(tmp, v, 8, 8);
3242   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3243   eor(crc, crc, tmp);
3244   ubfx(tmp, v, 16, 8);
3245   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3246   eor(crc, crc, tmp);
3247   ubfx(tmp, v, 24, 8);
3248   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3249   eor(crc, crc, tmp);
3250 }
3251 
3252 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3253         Register len, Register tmp0, Register tmp1, Register tmp2,
3254         Register tmp3) {
3255     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3256     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3257 
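         // Strategy (a descriptive note): the CRC is complemented on entry and
         // exit, per the usual CRC-32 convention, and the buffer is consumed
         // with the hardware crc32x instruction in software-pipelined 64-byte
         // blocks, falling back to 32-byte, 4-byte and 1-byte tails as the
         // remaining length shrinks.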
3258     mvnw(crc, crc);
3259 
3260     subs(len, len, 128);
3261     br(Assembler::GE, CRC_by64_pre);
3262   BIND(CRC_less64);
3263     adds(len, len, 128-32);
3264     br(Assembler::GE, CRC_by32_loop);
3265   BIND(CRC_less32);
3266     adds(len, len, 32-4);
3267     br(Assembler::GE, CRC_by4_loop);
3268     adds(len, len, 4);
3269     br(Assembler::GT, CRC_by1_loop);
3270     b(L_exit);
3271 
3272   BIND(CRC_by32_loop);
3273     ldp(tmp0, tmp1, Address(post(buf, 16)));
3274     subs(len, len, 32);
3275     crc32x(crc, crc, tmp0);
3276     ldr(tmp2, Address(post(buf, 8)));
3277     crc32x(crc, crc, tmp1);
3278     ldr(tmp3, Address(post(buf, 8)));
3279     crc32x(crc, crc, tmp2);
3280     crc32x(crc, crc, tmp3);
3281     br(Assembler::GE, CRC_by32_loop);
3282     cmn(len, 32);
3283     br(Assembler::NE, CRC_less32);
3284     b(L_exit);
3285 
3286   BIND(CRC_by4_loop);
3287     ldrw(tmp0, Address(post(buf, 4)));
3288     subs(len, len, 4);
3289     crc32w(crc, crc, tmp0);
3290     br(Assembler::GE, CRC_by4_loop);
3291     adds(len, len, 4);
3292     br(Assembler::LE, L_exit);
3293   BIND(CRC_by1_loop);
3294     ldrb(tmp0, Address(post(buf, 1)));
3295     subs(len, len, 1);
3296     crc32b(crc, crc, tmp0);
3297     br(Assembler::GT, CRC_by1_loop);
3298     b(L_exit);
3299 
3300   BIND(CRC_by64_pre);
3301     sub(buf, buf, 8);
3302     ldp(tmp0, tmp1, Address(buf, 8));
3303     crc32x(crc, crc, tmp0);
3304     ldr(tmp2, Address(buf, 24));
3305     crc32x(crc, crc, tmp1);
3306     ldr(tmp3, Address(buf, 32));
3307     crc32x(crc, crc, tmp2);
3308     ldr(tmp0, Address(buf, 40));
3309     crc32x(crc, crc, tmp3);
3310     ldr(tmp1, Address(buf, 48));
3311     crc32x(crc, crc, tmp0);
3312     ldr(tmp2, Address(buf, 56));
3313     crc32x(crc, crc, tmp1);
3314     ldr(tmp3, Address(pre(buf, 64)));
3315 
3316     b(CRC_by64_loop);
3317 
3318     align(CodeEntryAlignment);
3319   BIND(CRC_by64_loop);
3320     subs(len, len, 64);
3321     crc32x(crc, crc, tmp2);
3322     ldr(tmp0, Address(buf, 8));
3323     crc32x(crc, crc, tmp3);
3324     ldr(tmp1, Address(buf, 16));
3325     crc32x(crc, crc, tmp0);
3326     ldr(tmp2, Address(buf, 24));
3327     crc32x(crc, crc, tmp1);
3328     ldr(tmp3, Address(buf, 32));
3329     crc32x(crc, crc, tmp2);
3330     ldr(tmp0, Address(buf, 40));
3331     crc32x(crc, crc, tmp3);
3332     ldr(tmp1, Address(buf, 48));
3333     crc32x(crc, crc, tmp0);
3334     ldr(tmp2, Address(buf, 56));
3335     crc32x(crc, crc, tmp1);
3336     ldr(tmp3, Address(pre(buf, 64)));
3337     br(Assembler::GE, CRC_by64_loop);
3338 
3339     // post-loop
3340     crc32x(crc, crc, tmp2);
3341     crc32x(crc, crc, tmp3);
3342 
3343     sub(len, len, 64);
3344     add(buf, buf, 8);
3345     cmn(len, 128);
3346     br(Assembler::NE, CRC_less64);
3347   BIND(L_exit);
3348     mvnw(crc, crc);
3349 }
3350 
3351 /**
3352  * @param crc   register containing existing CRC (32-bit)
3353  * @param buf   register pointing to input byte buffer (byte*)
3354  * @param len   register containing number of bytes
3355  * @param table0..table3 registers that will contain the addresses of the CRC tables
3356  * @param tmp, tmp2, tmp3  scratch registers
3357  */
3358 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3359         Register table0, Register table1, Register table2, Register table3,
3360         Register tmp, Register tmp2, Register tmp3) {
3361   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3362   unsigned long offset;
3363 
3364   if (UseCRC32) {
3365       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3366       return;
3367   }
3368 
3369     mvnw(crc, crc);
3370 
3371     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3372     if (offset) add(table0, table0, offset);
3373     add(table1, table0, 1*256*sizeof(juint));
3374     add(table2, table0, 2*256*sizeof(juint));
3375     add(table3, table0, 3*256*sizeof(juint));
3376 
3377   if (UseNeon) {
3378       cmp(len, (u1)64);
3379       br(Assembler::LT, L_by16);
3380       eor(v16, T16B, v16, v16);
3381 
3382     Label L_fold;
3383 
3384       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3385 
3386       ld1(v0, v1, T2D, post(buf, 32));
3387       ld1r(v4, T2D, post(tmp, 8));
3388       ld1r(v5, T2D, post(tmp, 8));
3389       ld1r(v6, T2D, post(tmp, 8));
3390       ld1r(v7, T2D, post(tmp, 8));
3391       mov(v16, T4S, 0, crc);
3392 
3393       eor(v0, T16B, v0, v16);
3394       sub(len, len, 64);
3395 
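           // A sketch of the idea (not a formal derivation): each iteration of
           // L_fold carry-less multiplies the two 128-bit accumulators (v0, v1)
           // by the folding constants loaded above into v4..v7 and XORs in the
           // next 32 bytes of input, folding the CRC in GF(2)[x].  The byte-wise
           // pmull/pmull2 results are recombined with uzp/ushll/eor to build the
           // wide polynomial products without using the 64-bit PMULL form.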
3396     BIND(L_fold);
3397       pmull(v22, T8H, v0, v5, T8B);
3398       pmull(v20, T8H, v0, v7, T8B);
3399       pmull(v23, T8H, v0, v4, T8B);
3400       pmull(v21, T8H, v0, v6, T8B);
3401 
3402       pmull2(v18, T8H, v0, v5, T16B);
3403       pmull2(v16, T8H, v0, v7, T16B);
3404       pmull2(v19, T8H, v0, v4, T16B);
3405       pmull2(v17, T8H, v0, v6, T16B);
3406 
3407       uzp1(v24, T8H, v20, v22);
3408       uzp2(v25, T8H, v20, v22);
3409       eor(v20, T16B, v24, v25);
3410 
3411       uzp1(v26, T8H, v16, v18);
3412       uzp2(v27, T8H, v16, v18);
3413       eor(v16, T16B, v26, v27);
3414 
3415       ushll2(v22, T4S, v20, T8H, 8);
3416       ushll(v20, T4S, v20, T4H, 8);
3417 
3418       ushll2(v18, T4S, v16, T8H, 8);
3419       ushll(v16, T4S, v16, T4H, 8);
3420 
3421       eor(v22, T16B, v23, v22);
3422       eor(v18, T16B, v19, v18);
3423       eor(v20, T16B, v21, v20);
3424       eor(v16, T16B, v17, v16);
3425 
3426       uzp1(v17, T2D, v16, v20);
3427       uzp2(v21, T2D, v16, v20);
3428       eor(v17, T16B, v17, v21);
3429 
3430       ushll2(v20, T2D, v17, T4S, 16);
3431       ushll(v16, T2D, v17, T2S, 16);
3432 
3433       eor(v20, T16B, v20, v22);
3434       eor(v16, T16B, v16, v18);
3435 
3436       uzp1(v17, T2D, v20, v16);
3437       uzp2(v21, T2D, v20, v16);
3438       eor(v28, T16B, v17, v21);
3439 
3440       pmull(v22, T8H, v1, v5, T8B);
3441       pmull(v20, T8H, v1, v7, T8B);
3442       pmull(v23, T8H, v1, v4, T8B);
3443       pmull(v21, T8H, v1, v6, T8B);
3444 
3445       pmull2(v18, T8H, v1, v5, T16B);
3446       pmull2(v16, T8H, v1, v7, T16B);
3447       pmull2(v19, T8H, v1, v4, T16B);
3448       pmull2(v17, T8H, v1, v6, T16B);
3449 
3450       ld1(v0, v1, T2D, post(buf, 32));
3451 
3452       uzp1(v24, T8H, v20, v22);
3453       uzp2(v25, T8H, v20, v22);
3454       eor(v20, T16B, v24, v25);
3455 
3456       uzp1(v26, T8H, v16, v18);
3457       uzp2(v27, T8H, v16, v18);
3458       eor(v16, T16B, v26, v27);
3459 
3460       ushll2(v22, T4S, v20, T8H, 8);
3461       ushll(v20, T4S, v20, T4H, 8);
3462 
3463       ushll2(v18, T4S, v16, T8H, 8);
3464       ushll(v16, T4S, v16, T4H, 8);
3465 
3466       eor(v22, T16B, v23, v22);
3467       eor(v18, T16B, v19, v18);
3468       eor(v20, T16B, v21, v20);
3469       eor(v16, T16B, v17, v16);
3470 
3471       uzp1(v17, T2D, v16, v20);
3472       uzp2(v21, T2D, v16, v20);
3473       eor(v16, T16B, v17, v21);
3474 
3475       ushll2(v20, T2D, v16, T4S, 16);
3476       ushll(v16, T2D, v16, T2S, 16);
3477 
3478       eor(v20, T16B, v22, v20);
3479       eor(v16, T16B, v16, v18);
3480 
3481       uzp1(v17, T2D, v20, v16);
3482       uzp2(v21, T2D, v20, v16);
3483       eor(v20, T16B, v17, v21);
3484 
3485       shl(v16, T2D, v28, 1);
3486       shl(v17, T2D, v20, 1);
3487 
3488       eor(v0, T16B, v0, v16);
3489       eor(v1, T16B, v1, v17);
3490 
3491       subs(len, len, 32);
3492       br(Assembler::GE, L_fold);
3493 
3494       mov(crc, 0);
3495       mov(tmp, v0, T1D, 0);
3496       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3497       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3498       mov(tmp, v0, T1D, 1);
3499       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3500       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3501       mov(tmp, v1, T1D, 0);
3502       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3503       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3504       mov(tmp, v1, T1D, 1);
3505       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3506       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3507 
3508       add(len, len, 32);
3509   }
3510 
3511   BIND(L_by16);
3512     subs(len, len, 16);
3513     br(Assembler::GE, L_by16_loop);
3514     adds(len, len, 16-4);
3515     br(Assembler::GE, L_by4_loop);
3516     adds(len, len, 4);
3517     br(Assembler::GT, L_by1_loop);
3518     b(L_exit);
3519 
3520   BIND(L_by4_loop);
3521     ldrw(tmp, Address(post(buf, 4)));
3522     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3523     subs(len, len, 4);
3524     br(Assembler::GE, L_by4_loop);
3525     adds(len, len, 4);
3526     br(Assembler::LE, L_exit);
3527   BIND(L_by1_loop);
3528     subs(len, len, 1);
3529     ldrb(tmp, Address(post(buf, 1)));
3530     update_byte_crc32(crc, tmp, table0);
3531     br(Assembler::GT, L_by1_loop);
3532     b(L_exit);
3533 
3534     align(CodeEntryAlignment);
3535   BIND(L_by16_loop);
3536     subs(len, len, 16);
3537     ldp(tmp, tmp3, Address(post(buf, 16)));
3538     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3539     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3540     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3541     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3542     br(Assembler::GE, L_by16_loop);
3543     adds(len, len, 16-4);
3544     br(Assembler::GE, L_by4_loop);
3545     adds(len, len, 4);
3546     br(Assembler::GT, L_by1_loop);
3547   BIND(L_exit);
3548     mvnw(crc, crc);
3549 }
3550 
3551 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3552         Register len, Register tmp0, Register tmp1, Register tmp2,
3553         Register tmp3) {
3554     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3555     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
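         // This mirrors kernel_crc32_using_crc32 but uses the crc32c*
         // instructions (Castagnoli polynomial) and, unlike the CRC-32 variant,
         // does not complement crc on entry or exit.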
3556 
3557     subs(len, len, 128);
3558     br(Assembler::GE, CRC_by64_pre);
3559   BIND(CRC_less64);
3560     adds(len, len, 128-32);
3561     br(Assembler::GE, CRC_by32_loop);
3562   BIND(CRC_less32);
3563     adds(len, len, 32-4);
3564     br(Assembler::GE, CRC_by4_loop);
3565     adds(len, len, 4);
3566     br(Assembler::GT, CRC_by1_loop);
3567     b(L_exit);
3568 
3569   BIND(CRC_by32_loop);
3570     ldp(tmp0, tmp1, Address(post(buf, 16)));
3571     subs(len, len, 32);
3572     crc32cx(crc, crc, tmp0);
3573     ldr(tmp2, Address(post(buf, 8)));
3574     crc32cx(crc, crc, tmp1);
3575     ldr(tmp3, Address(post(buf, 8)));
3576     crc32cx(crc, crc, tmp2);
3577     crc32cx(crc, crc, tmp3);
3578     br(Assembler::GE, CRC_by32_loop);
3579     cmn(len, 32);
3580     br(Assembler::NE, CRC_less32);
3581     b(L_exit);
3582 
3583   BIND(CRC_by4_loop);
3584     ldrw(tmp0, Address(post(buf, 4)));
3585     subs(len, len, 4);
3586     crc32cw(crc, crc, tmp0);
3587     br(Assembler::GE, CRC_by4_loop);
3588     adds(len, len, 4);
3589     br(Assembler::LE, L_exit);
3590   BIND(CRC_by1_loop);
3591     ldrb(tmp0, Address(post(buf, 1)));
3592     subs(len, len, 1);
3593     crc32cb(crc, crc, tmp0);
3594     br(Assembler::GT, CRC_by1_loop);
3595     b(L_exit);
3596 
3597   BIND(CRC_by64_pre);
3598     sub(buf, buf, 8);
3599     ldp(tmp0, tmp1, Address(buf, 8));
3600     crc32cx(crc, crc, tmp0);
3601     ldr(tmp2, Address(buf, 24));
3602     crc32cx(crc, crc, tmp1);
3603     ldr(tmp3, Address(buf, 32));
3604     crc32cx(crc, crc, tmp2);
3605     ldr(tmp0, Address(buf, 40));
3606     crc32cx(crc, crc, tmp3);
3607     ldr(tmp1, Address(buf, 48));
3608     crc32cx(crc, crc, tmp0);
3609     ldr(tmp2, Address(buf, 56));
3610     crc32cx(crc, crc, tmp1);
3611     ldr(tmp3, Address(pre(buf, 64)));
3612 
3613     b(CRC_by64_loop);
3614 
3615     align(CodeEntryAlignment);
3616   BIND(CRC_by64_loop);
3617     subs(len, len, 64);
3618     crc32cx(crc, crc, tmp2);
3619     ldr(tmp0, Address(buf, 8));
3620     crc32cx(crc, crc, tmp3);
3621     ldr(tmp1, Address(buf, 16));
3622     crc32cx(crc, crc, tmp0);
3623     ldr(tmp2, Address(buf, 24));
3624     crc32cx(crc, crc, tmp1);
3625     ldr(tmp3, Address(buf, 32));
3626     crc32cx(crc, crc, tmp2);
3627     ldr(tmp0, Address(buf, 40));
3628     crc32cx(crc, crc, tmp3);
3629     ldr(tmp1, Address(buf, 48));
3630     crc32cx(crc, crc, tmp0);
3631     ldr(tmp2, Address(buf, 56));
3632     crc32cx(crc, crc, tmp1);
3633     ldr(tmp3, Address(pre(buf, 64)));
3634     br(Assembler::GE, CRC_by64_loop);
3635 
3636     // post-loop
3637     crc32cx(crc, crc, tmp2);
3638     crc32cx(crc, crc, tmp3);
3639 
3640     sub(len, len, 64);
3641     add(buf, buf, 8);
3642     cmn(len, 128);
3643     br(Assembler::NE, CRC_less64);
3644   BIND(L_exit);
3645 }
3646 
3647 /**
3648  * @param crc   register containing existing CRC (32-bit)
3649  * @param buf   register pointing to input byte buffer (byte*)
3650  * @param len   register containing number of bytes
3651  * @param table0..table3 registers that will contain the addresses of the CRC tables
3652  * @param tmp, tmp2, tmp3  scratch registers
3653  */
3654 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3655         Register table0, Register table1, Register table2, Register table3,
3656         Register tmp, Register tmp2, Register tmp3) {
3657   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3658 }
3659 
3660 
3661 SkipIfEqual::SkipIfEqual(
3662     MacroAssembler* masm, const bool* flag_addr, bool value) {
3663   _masm = masm;
3664   unsigned long offset;
3665   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3666   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3667   _masm->cbzw(rscratch1, _label);
3668 }
3669 
3670 SkipIfEqual::~SkipIfEqual() {
3671   _masm->bind(_label);
3672 }
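     // A hypothetical usage sketch: the code emitted between construction and
     // destruction is branched over when the byte at flag_addr is zero.  (As
     // written here, 'value' is not consulted; the cbzw skips when the flag is
     // zero, which matches the usual value == false usage.)
     //
     //   { SkipIfEqual skip(masm, &SomeBoolFlag, false);   // SomeBoolFlag is illustrative
     //     ... instructions emitted only when the flag is set at run time ...
     //   }   // ~SkipIfEqual binds the skip target here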
3673 
3674 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3675   Address adr;
3676   switch(dst.getMode()) {
3677   case Address::base_plus_offset:
3678     // This is the expected mode, although we allow all the other
3679     // forms below.
3680     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3681     break;
3682   default:
3683     lea(rscratch2, dst);
3684     adr = Address(rscratch2);
3685     break;
3686   }
3687   ldr(rscratch1, adr);
3688   add(rscratch1, rscratch1, src);
3689   str(rscratch1, adr);
3690 }
3691 
3692 void MacroAssembler::cmpptr(Register src1, Address src2) {
3693   unsigned long offset;
3694   adrp(rscratch1, src2, offset);
3695   ldr(rscratch1, Address(rscratch1, offset));
3696   cmp(src1, rscratch1);
3697 }
3698 
3699 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3700   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3701   bs->obj_equals(this, obj1, obj2);
3702 }
3703 
3704 void MacroAssembler::load_method_holder(Register holder, Register method) {
3705   ldr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
3706   ldr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
3707   ldr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
3708 }
3709 
3710 void MacroAssembler::load_klass(Register dst, Register src) {
3711   if (UseCompressedClassPointers) {
3712     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3713     decode_klass_not_null(dst);
3714   } else {
3715     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3716   }
3717 }
3718 
3719 // ((OopHandle)result).resolve();
3720 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3721   // OopHandle::resolve is an indirection.
3722   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3723 }
3724 
3725 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3726   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3727   ldr(dst, Address(rmethod, Method::const_offset()));
3728   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3729   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3730   ldr(dst, Address(dst, mirror_offset));
3731   resolve_oop_handle(dst, tmp);
3732 }
3733 
3734 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3735   if (UseCompressedClassPointers) {
3736     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3737     if (CompressedKlassPointers::base() == NULL) {
3738       cmp(trial_klass, tmp, LSL, CompressedKlassPointers::shift());
3739       return;
3740     } else if (((uint64_t)CompressedKlassPointers::base() & 0xffffffff) == 0
3741                && CompressedKlassPointers::shift() == 0) {
3742       // Only the bottom 32 bits matter
3743       cmpw(trial_klass, tmp);
3744       return;
3745     }
3746     decode_klass_not_null(tmp);
3747   } else {
3748     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3749   }
3750   cmp(trial_klass, tmp);
3751 }
3752 
3753 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3754   load_klass(dst, src);
3755   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3756 }
3757 
3758 void MacroAssembler::store_klass(Register dst, Register src) {
3759   // FIXME: Should this be a store release?  Concurrent GCs assume the
3760   // klass length is valid if the klass field is not null.
3761   if (UseCompressedClassPointers) {
3762     encode_klass_not_null(src);
3763     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3764   } else {
3765     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3766   }
3767 }
3768 
3769 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3770   if (UseCompressedClassPointers) {
3771     // Store to klass gap in destination
3772     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3773   }
3774 }
3775 
3776 // Algorithm must match CompressedOops::encode.
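     // In the base != NULL case below this computes, in effect,
     //   narrow = (s >= rheapbase) ? (s - rheapbase) >> LogMinObjAlignmentInBytes : 0
     // so NULL (and anything below the heap base) encodes to 0.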
3777 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3778 #ifdef ASSERT
3779   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3780 #endif
3781   verify_oop(s, "broken oop in encode_heap_oop");
3782   if (CompressedOops::base() == NULL) {
3783     if (CompressedOops::shift() != 0) {
3784       assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3785       lsr(d, s, LogMinObjAlignmentInBytes);
3786     } else {
3787       mov(d, s);
3788     }
3789   } else {
3790     subs(d, s, rheapbase);
3791     csel(d, d, zr, Assembler::HS);
3792     lsr(d, d, LogMinObjAlignmentInBytes);
3793 
3794     /*  Old algorithm: is this any worse?
3795     Label nonnull;
3796     cbnz(r, nonnull);
3797     sub(r, r, rheapbase);
3798     bind(nonnull);
3799     lsr(r, r, LogMinObjAlignmentInBytes);
3800     */
3801   }
3802 }
3803 
3804 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3805 #ifdef ASSERT
3806   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3807   if (CheckCompressedOops) {
3808     Label ok;
3809     cbnz(r, ok);
3810     stop("null oop passed to encode_heap_oop_not_null");
3811     bind(ok);
3812   }
3813 #endif
3814   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3815   if (CompressedOops::base() != NULL) {
3816     sub(r, r, rheapbase);
3817   }
3818   if (CompressedOops::shift() != 0) {
3819     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3820     lsr(r, r, LogMinObjAlignmentInBytes);
3821   }
3822 }
3823 
3824 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3825 #ifdef ASSERT
3826   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3827   if (CheckCompressedOops) {
3828     Label ok;
3829     cbnz(src, ok);
3830     stop("null oop passed to encode_heap_oop_not_null2");
3831     bind(ok);
3832   }
3833 #endif
3834   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3835 
3836   Register data = src;
3837   if (CompressedOops::base() != NULL) {
3838     sub(dst, src, rheapbase);
3839     data = dst;
3840   }
3841   if (CompressedOops::shift() != 0) {
3842     assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3843     lsr(dst, data, LogMinObjAlignmentInBytes);
3844     data = dst;
3845   }
3846   if (data == src)
3847     mov(dst, src);
3848 }
3849 
3850 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3851 #ifdef ASSERT
3852   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3853 #endif
3854   if (CompressedOops::base() == NULL) {
3855     if (CompressedOops::shift() != 0 || d != s) {
3856       lsl(d, s, CompressedOops::shift());
3857     }
3858   } else {
3859     Label done;
3860     if (d != s)
3861       mov(d, s);
3862     cbz(s, done);
3863     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3864     bind(done);
3865   }
3866   verify_oop(d, "broken oop in decode_heap_oop");
3867 }
3868 
3869 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3870   assert (UseCompressedOops, "should only be used for compressed headers");
3871   assert (Universe::heap() != NULL, "java heap should be initialized");
3872   // Cannot assert, unverified entry point counts instructions (see .ad file)
3873   // vtableStubs also counts instructions in pd_code_size_limit.
3874   // Also do not verify_oop as this is called by verify_oop.
3875   if (CompressedOops::shift() != 0) {
3876     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3877     if (CompressedOops::base() != NULL) {
3878       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3879     } else {
3880       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3881     }
3882   } else {
3883     assert (CompressedOops::base() == NULL, "sanity");
3884   }
3885 }
3886 
3887 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3888   assert (UseCompressedOops, "should only be used for compressed headers");
3889   assert (Universe::heap() != NULL, "java heap should be initialized");
3890   // Cannot assert, unverified entry point counts instructions (see .ad file)
3891   // vtableStubs also counts instructions in pd_code_size_limit.
3892   // Also do not verify_oop as this is called by verify_oop.
3893   if (CompressedOops::shift() != 0) {
3894     assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
3895     if (CompressedOops::base() != NULL) {
3896       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3897     } else {
3898       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3899     }
3900   } else {
3901     assert (CompressedOops::base() == NULL, "sanity");
3902     if (dst != src) {
3903       mov(dst, src);
3904     }
3905   }
3906 }
3907 
3908 MacroAssembler::KlassDecodeMode MacroAssembler::_klass_decode_mode(KlassDecodeNone);
3909 
3910 MacroAssembler::KlassDecodeMode MacroAssembler::klass_decode_mode() {
3911   assert(UseCompressedClassPointers, "not using compressed class pointers");
3912   assert(Metaspace::initialized(), "metaspace not initialized yet");
3913 
3914   if (_klass_decode_mode != KlassDecodeNone) {
3915     return _klass_decode_mode;
3916   }
3917 
3918   assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift()
3919          || 0 == CompressedKlassPointers::shift(), "decode alg wrong");
3920 
3921   if (CompressedKlassPointers::base() == NULL) {
3922     return (_klass_decode_mode = KlassDecodeZero);
3923   }
3924 
3925   if (operand_valid_for_logical_immediate(
3926         /*is32*/false, (uint64_t)CompressedKlassPointers::base())) {
3927     const uint64_t range_mask =
3928       (1UL << log2_intptr(CompressedKlassPointers::range())) - 1;
3929     if (((uint64_t)CompressedKlassPointers::base() & range_mask) == 0) {
3930       return (_klass_decode_mode = KlassDecodeXor);
3931     }
3932   }
3933 
3934   const uint64_t shifted_base =
3935     (uint64_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift();
3936   guarantee((shifted_base & 0xffff0000ffffffff) == 0,
3937             "compressed class base bad alignment");
3938 
3939   return (_klass_decode_mode = KlassDecodeMovk);
3940 }
3941 
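     // A brief summary of the modes selected above (descriptive only):
     //   KlassDecodeZero - base is NULL; encode/decode is just a shift (or a move).
     //   KlassDecodeXor  - base is a valid logical immediate with no bits set
     //                     inside the compressed klass range, so an eor with the
     //                     base plus a shift suffices.
     //   KlassDecodeMovk - the shifted base has only bits 32..47 set, so decoding
     //                     inserts those bits with a single movk at position 32
     //                     and then shifts.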
3942 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3943   switch (klass_decode_mode()) {
3944   case KlassDecodeZero:
3945     if (CompressedKlassPointers::shift() != 0) {
3946       lsr(dst, src, LogKlassAlignmentInBytes);
3947     } else {
3948       if (dst != src) mov(dst, src);
3949     }
3950     break;
3951 
3952   case KlassDecodeXor:
3953     if (CompressedKlassPointers::shift() != 0) {
3954       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
3955       lsr(dst, dst, LogKlassAlignmentInBytes);
3956     } else {
3957       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
3958     }
3959     break;
3960 
3961   case KlassDecodeMovk:
3962     if (CompressedKlassPointers::shift() != 0) {
3963       ubfx(dst, src, LogKlassAlignmentInBytes, 32);
3964     } else {
3965       movw(dst, src);
3966     }
3967     break;
3968 
3969   case KlassDecodeNone:
3970     ShouldNotReachHere();
3971     break;
3972   }
3973 }
3974 
3975 void MacroAssembler::encode_klass_not_null(Register r) {
3976   encode_klass_not_null(r, r);
3977 }
3978 
3979 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3980   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3981 
3982   switch (klass_decode_mode()) {
3983   case KlassDecodeZero:
3984     if (CompressedKlassPointers::shift() != 0) {
3985       lsl(dst, src, LogKlassAlignmentInBytes);
3986     } else {
3987       if (dst != src) mov(dst, src);
3988     }
3989     break;
3990 
3991   case KlassDecodeXor:
3992     if (CompressedKlassPointers::shift() != 0) {
3993       lsl(dst, src, LogKlassAlignmentInBytes);
3994       eor(dst, dst, (uint64_t)CompressedKlassPointers::base());
3995     } else {
3996       eor(dst, src, (uint64_t)CompressedKlassPointers::base());
3997     }
3998     break;
3999 
4000   case KlassDecodeMovk: {
4001     const uint64_t shifted_base =
4002       (uint64_t)CompressedKlassPointers::base() >> CompressedKlassPointers::shift();
4003 
4004     if (dst != src) movw(dst, src);
4005     movk(dst, shifted_base >> 32, 32);
4006 
4007     if (CompressedKlassPointers::shift() != 0) {
4008       lsl(dst, dst, LogKlassAlignmentInBytes);
4009     }
4010 
4011     break;
4012   }
4013 
4014   case KlassDecodeNone:
4015     ShouldNotReachHere();
4016     break;
4017   }
4018 }
4019 
4020 void  MacroAssembler::decode_klass_not_null(Register r) {
4021   decode_klass_not_null(r, r);
4022 }
4023 
4024 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4025 #ifdef ASSERT
4026   {
4027     ThreadInVMfromUnknown tiv;
4028     assert (UseCompressedOops, "should only be used for compressed oops");
4029     assert (Universe::heap() != NULL, "java heap should be initialized");
4030     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4031     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4032   }
4033 #endif
4034   int oop_index = oop_recorder()->find_index(obj);
4035   InstructionMark im(this);
4036   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4037   code_section()->relocate(inst_mark(), rspec);
4038   movz(dst, 0xDEAD, 16);
4039   movk(dst, 0xBEEF);
4040 }
4041 
4042 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4043   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4044   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4045   int index = oop_recorder()->find_index(k);
4046   assert(! Universe::heap()->is_in(k), "should not be an oop");
4047 
4048   InstructionMark im(this);
4049   RelocationHolder rspec = metadata_Relocation::spec(index);
4050   code_section()->relocate(inst_mark(), rspec);
4051   narrowKlass nk = CompressedKlassPointers::encode(k);
4052   movz(dst, (nk >> 16), 16);
4053   movk(dst, nk & 0xffff);
4054 }
4055 
4056 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4057                                     Register dst, Address src,
4058                                     Register tmp1, Register thread_tmp) {
4059   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4060   decorators = AccessInternal::decorator_fixup(decorators);
4061   bool as_raw = (decorators & AS_RAW) != 0;
4062   if (as_raw) {
4063     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4064   } else {
4065     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4066   }
4067 }
4068 
4069 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4070                                      Address dst, Register src,
4071                                      Register tmp1, Register thread_tmp) {
4072   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4073   decorators = AccessInternal::decorator_fixup(decorators);
4074   bool as_raw = (decorators & AS_RAW) != 0;
4075   if (as_raw) {
4076     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4077   } else {
4078     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4079   }
4080 }
4081 
4082 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
4083   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
4084   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
4085     decorators |= ACCESS_READ | ACCESS_WRITE;
4086   }
4087   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4088   return bs->resolve(this, decorators, obj);
4089 }
4090 
4091 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4092                                    Register thread_tmp, DecoratorSet decorators) {
4093   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4094 }
4095 
4096 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4097                                             Register thread_tmp, DecoratorSet decorators) {
4098   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4099 }
4100 
4101 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4102                                     Register thread_tmp, DecoratorSet decorators) {
4103   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4104 }
4105 
4106 // Used for storing NULLs.
4107 void MacroAssembler::store_heap_oop_null(Address dst) {
4108   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4109 }
4110 
4111 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4112   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4113   int index = oop_recorder()->allocate_metadata_index(obj);
4114   RelocationHolder rspec = metadata_Relocation::spec(index);
4115   return Address((address)obj, rspec);
4116 }
4117 
4118 // Move an oop into a register.  immediate is true if we want
4119 // immediate instructions, i.e. we are not going to patch this
4120 // instruction while the code is being executed by another thread.  In
4121 // that case we can use move immediates rather than the constant pool.
4122 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4123   int oop_index;
4124   if (obj == NULL) {
4125     oop_index = oop_recorder()->allocate_oop_index(obj);
4126   } else {
4127 #ifdef ASSERT
4128     {
4129       ThreadInVMfromUnknown tiv;
4130       assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "should be real oop");
4131     }
4132 #endif
4133     oop_index = oop_recorder()->find_index(obj);
4134   }
4135   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4136   if (! immediate) {
4137     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4138     ldr_constant(dst, Address(dummy, rspec));
4139   } else
4140     mov(dst, Address((address)obj, rspec));
4141 }
4142 
4143 // Move a metadata address into a register.
4144 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4145   int oop_index;
4146   if (obj == NULL) {
4147     oop_index = oop_recorder()->allocate_metadata_index(obj);
4148   } else {
4149     oop_index = oop_recorder()->find_index(obj);
4150   }
4151   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4152   mov(dst, Address((address)obj, rspec));
4153 }
4154 
4155 Address MacroAssembler::constant_oop_address(jobject obj) {
4156 #ifdef ASSERT
4157   {
4158     ThreadInVMfromUnknown tiv;
4159     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4160     assert(Universe::heap()->is_in(JNIHandles::resolve(obj)), "not an oop");
4161   }
4162 #endif
4163   int oop_index = oop_recorder()->find_index(obj);
4164   return Address((address)obj, oop_Relocation::spec(oop_index));
4165 }
4166 
4167 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4168 void MacroAssembler::tlab_allocate(Register obj,
4169                                    Register var_size_in_bytes,
4170                                    int con_size_in_bytes,
4171                                    Register t1,
4172                                    Register t2,
4173                                    Label& slow_case) {
4174   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4175   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4176 }
4177 
4178 // Defines obj, preserves var_size_in_bytes
4179 void MacroAssembler::eden_allocate(Register obj,
4180                                    Register var_size_in_bytes,
4181                                    int con_size_in_bytes,
4182                                    Register t1,
4183                                    Label& slow_case) {
4184   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4185   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4186 }
4187 
4188 // Zero words; len is in bytes
4189 // Destroys all registers except addr
4190 // len must be a nonzero multiple of wordSize
4191 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4192   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4193 
4194 #ifdef ASSERT
4195   { Label L;
4196     tst(len, BytesPerWord - 1);
4197     br(Assembler::EQ, L);
4198     stop("len is not a multiple of BytesPerWord");
4199     bind(L);
4200   }
4201 #endif
4202 
4203 #ifndef PRODUCT
4204   block_comment("zero memory");
4205 #endif
4206 
4207   Label loop;
4208   Label entry;
4209 
4210 //  Algorithm:
4211 //
4212 //    scratch1 = cnt & 7;
4213 //    cnt -= scratch1;
4214 //    p += scratch1;
4215 //    switch (scratch1) {
4216 //      do {
4217 //        cnt -= 8;
4218 //          p[-8] = 0;
4219 //        case 7:
4220 //          p[-7] = 0;
4221 //        case 6:
4222 //          p[-6] = 0;
4223 //          // ...
4224 //        case 1:
4225 //          p[-1] = 0;
4226 //        case 0:
4227 //          p += 8;
4228 //      } while (cnt);
4229 //    }
4230 
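     //  The adr/sub/br sequence below implements the computed jump into the
     //  unrolled store sequence (a Duff's-device-style entry): rscratch1 holds
     //  cnt % unroll, and branching to (entry - rscratch1 * 4) executes exactly
     //  the last rscratch1 of the unrolled str(zr, ...) instructions before
     //  falling through 'entry' into the main loop.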
4231   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4232 
4233   lsr(len, len, LogBytesPerWord);
4234   andr(rscratch1, len, unroll - 1);  // tmp1 = cnt % unroll
4235   sub(len, len, rscratch1);      // cnt -= unroll
4236   // t1 always points to the end of the region we're about to zero
4237   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
4238   adr(rscratch2, entry);
4239   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4240   br(rscratch2);
4241   bind(loop);
4242   sub(len, len, unroll);
4243   for (int i = -unroll; i < 0; i++)
4244     Assembler::str(zr, Address(t1, i * wordSize));
4245   bind(entry);
4246   add(t1, t1, unroll * wordSize);
4247   cbnz(len, loop);
4248 }
4249 
4250 void MacroAssembler::verify_tlab() {
4251 #ifdef ASSERT
4252   if (UseTLAB && VerifyOops) {
4253     Label next, ok;
4254 
4255     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4256 
4257     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4258     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4259     cmp(rscratch2, rscratch1);
4260     br(Assembler::HS, next);
4261     STOP("assert(top >= start)");
4262     should_not_reach_here();
4263 
4264     bind(next);
4265     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4266     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4267     cmp(rscratch2, rscratch1);
4268     br(Assembler::HS, ok);
4269     STOP("assert(top <= end)");
4270     should_not_reach_here();
4271 
4272     bind(ok);
4273     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4274   }
4275 #endif
4276 }
4277 
4278 // Writes to stack successive pages until offset reached to check for
4279 // stack overflow + shadow pages.  This clobbers tmp.
4280 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4281   assert_different_registers(tmp, size, rscratch1);
4282   mov(tmp, sp);
4283   // Bang stack for total size given plus shadow page size.
4284   // Bang one page at a time because large size can bang beyond yellow and
4285   // red zones.
4286   Label loop;
4287   mov(rscratch1, os::vm_page_size());
4288   bind(loop);
4289   lea(tmp, Address(tmp, -os::vm_page_size()));
4290   subsw(size, size, rscratch1);
4291   str(size, Address(tmp));
4292   br(Assembler::GT, loop);
4293 
4294   // Bang down shadow pages too.
4295   // At this point, (tmp-0) is the last address touched, so don't
4296   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4297   // was post-decremented.)  Skip this address by starting at i=1, and
4298   // touch a few more pages below.  N.B.  It is important to touch all
4299   // the way down to and including i=StackShadowPages.
4300   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
4301     // This could be any sized move, but it can serve as a debugging crumb,
4302     // so the bigger the better.
4303     lea(tmp, Address(tmp, -os::vm_page_size()));
4304     str(size, Address(tmp));
4305   }
4306 }
4307 
4308 
4309 // Move the address of the polling page into dest.
4310 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4311   if (SafepointMechanism::uses_thread_local_poll()) {
4312     ldr(dest, Address(rthread, Thread::polling_page_offset()));
4313   } else {
4314     unsigned long off;
4315     adrp(dest, Address(page, rtype), off);
4316     assert(off == 0, "polling page must be page aligned");
4317   }
4318 }
4319 
4320 // Move the address of the polling page into r, then read the polling
4321 // page.
4322 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4323   get_polling_page(r, page, rtype);
4324   return read_polling_page(r, rtype);
4325 }
4326 
4327 // Read the polling page.  The address of the polling page must
4328 // already be in r.
4329 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4330   InstructionMark im(this);
4331   code_section()->relocate(inst_mark(), rtype);
4332   ldrw(zr, Address(r, 0));
4333   return inst_mark();
4334 }
4335 
4336 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4337   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4338   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4339   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4340   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4341   long offset_low = dest_page - low_page;
4342   long offset_high = dest_page - high_page;
4343 
4344   assert(is_valid_AArch64_address(dest.target()), "bad address");
4345   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4346 
4347   InstructionMark im(this);
4348   code_section()->relocate(inst_mark(), dest.rspec());
4349   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4350   // the code cache, so that if it is relocated we know it will still be able to reach the target
4351   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4352     _adrp(reg1, dest.target());
4353   } else {
4354     unsigned long target = (unsigned long)dest.target();
4355     unsigned long adrp_target
4356       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4357 
4358     _adrp(reg1, (address)adrp_target);
4359     movk(reg1, target >> 32, 32);
4360   }
4361   byte_offset = (unsigned long)dest.target() & 0xfff;
4362 }
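// The reachability test above relies on ADRP having a +/- 2^20 page
// (+/- 4 GiB) range.  Roughly (illustrative C; pages are addresses >> 12):
//
//   bool reachable_from_whole_code_cache =
//       (dest_page - high_page) >= -(1L << 20) &&
//       (dest_page - low_page)  <   (1L << 20);
//
// If that fails, an adrp to a page whose low 32 bits match the target is
// emitted, followed by a movk that installs bits 32-47 of the target, as in
// the else branch above.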
4363 
4364 void MacroAssembler::load_byte_map_base(Register reg) {
4365   CardTable::CardValue* byte_map_base =
4366     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4367 
4368   if (is_valid_AArch64_address((address)byte_map_base)) {
4369     // Strictly speaking the byte_map_base isn't an address at all,
4370     // and it might even be negative.
4371     unsigned long offset;
4372     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4373     // We expect offset to be zero with most collectors.
4374     if (offset != 0) {
4375       add(reg, reg, offset);
4376     }
4377   } else {
4378     mov(reg, (uint64_t)byte_map_base);
4379   }
4380 }
4381 
4382 void MacroAssembler::build_frame(int framesize) {
4383   assert(framesize > 0, "framesize must be > 0");
4384   if (framesize < ((1 << 9) + 2 * wordSize)) {
4385     sub(sp, sp, framesize);
4386     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4387     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4388   } else {
4389     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4390     if (PreserveFramePointer) mov(rfp, sp);
4391     if (framesize < ((1 << 12) + 2 * wordSize))
4392       sub(sp, sp, framesize - 2 * wordSize);
4393     else {
4394       mov(rscratch1, framesize - 2 * wordSize);
4395       sub(sp, sp, rscratch1);
4396     }
4397   }
4398 }
4399 
4400 void MacroAssembler::remove_frame(int framesize) {
4401   assert(framesize > 0, "framesize must be > 0");
4402   if (framesize < ((1 << 9) + 2 * wordSize)) {
4403     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4404     add(sp, sp, framesize);
4405   } else {
4406     if (framesize < ((1 << 12) + 2 * wordSize))
4407       add(sp, sp, framesize - 2 * wordSize);
4408     else {
4409       mov(rscratch1, framesize - 2 * wordSize);
4410       add(sp, sp, rscratch1);
4411     }
4412     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4413   }
4414 }
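// Note on the thresholds used by build_frame/remove_frame above (an
// observation, not part of the original comments): (1 << 9) + 2 * wordSize
// keeps (framesize - 2 * wordSize) within the signed, scaled 7-bit immediate
// offset of stp/ldp, and (1 << 12) + 2 * wordSize keeps it within the 12-bit
// unsigned immediate of sub/add; anything larger goes through rscratch1.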
4415 
4416 #ifdef COMPILER2
4417 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4418 
4419 // Search for str1 in str2 and return index or -1
4420 void MacroAssembler::string_indexof(Register str2, Register str1,
4421                                     Register cnt2, Register cnt1,
4422                                     Register tmp1, Register tmp2,
4423                                     Register tmp3, Register tmp4,
4424                                     Register tmp5, Register tmp6,
4425                                     int icnt1, Register result, int ae) {
4426   // NOTE: tmp5, tmp6 can be zr depending on specific method version
4427   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4428 
4429   Register ch1 = rscratch1;
4430   Register ch2 = rscratch2;
4431   Register cnt1tmp = tmp1;
4432   Register cnt2tmp = tmp2;
4433   Register cnt1_neg = cnt1;
4434   Register cnt2_neg = cnt2;
4435   Register result_tmp = tmp4;
4436 
4437   bool isL = ae == StrIntrinsicNode::LL;
4438 
4439   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4440   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4441   int str1_chr_shift = str1_isL ? 0:1;
4442   int str2_chr_shift = str2_isL ? 0:1;
4443   int str1_chr_size = str1_isL ? 1:2;
4444   int str2_chr_size = str2_isL ? 1:2;
4445   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4446                                       (chr_insn)&MacroAssembler::ldrh;
4447   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4448                                       (chr_insn)&MacroAssembler::ldrh;
4449   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4450   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4451 
4452   // Note, inline_string_indexOf() generates checks:
4453   // if (substr.count > string.count) return -1;
4454   // if (substr.count == 0) return 0;
4455 
4456   // We have two strings, a source string in str2, cnt2 and a pattern string
4457   // in str1, cnt1. Find the first occurrence of the pattern in the source, or return -1.
4458 
4459   // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
4460   // With a small pattern and source we use linear scan.
4461 
4462   if (icnt1 == -1) {
4463     sub(result_tmp, cnt2, cnt1);
4464     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4465     br(LT, LINEARSEARCH);
4466     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
4467     subs(zr, cnt1, 256);
4468     lsr(tmp1, cnt2, 2);
4469     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
4470     br(GE, LINEARSTUB);
4471   }
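// The dispatch above, roughly (illustrative C):
//
//   if (cnt1 < 8)                          goto LINEARSEARCH;
//   if (cnt1 >= 256 || cnt1 >= cnt2 / 4)   goto LINEARSTUB;
//   /* otherwise fall through to the Boyer-Moore-Horspool code below */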
4472 
4473 // The Boyer-Moore algorithm is based on the description here:-
4474 //
4475 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4476 //
4477 // This describes an algorithm with 2 shift rules: the 'Bad Character' rule
4478 // and the 'Good Suffix' rule.
4479 //
4480 // These rules are essentially heuristics for how far we can shift the
4481 // pattern along the search string.
4482 //
4483 // The implementation here uses the 'Bad Character' rule only because of the
4484 // complexity of initialisation for the 'Good Suffix' rule.
4485 //
4486 // This is also known as the Boyer-Moore-Horspool algorithm:-
4487 //
4488 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4489 //
4490 // This particular implementation has a few Java-specific optimizations.
4491 //
4492 // #define ASIZE 256
4493 //
4494 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4495 //       int i, j;
4496 //       unsigned c;
4497 //       unsigned char bc[ASIZE];
4498 //
4499 //       /* Preprocessing */
4500 //       for (i = 0; i < ASIZE; ++i)
4501 //          bc[i] = m;
4502 //       for (i = 0; i < m - 1; ) {
4503 //          c = x[i];
4504 //          ++i;
4505 //          // c < 256 for Latin1 string, so, no need for branch
4506 //          #ifdef PATTERN_STRING_IS_LATIN1
4507 //          bc[c] = m - i;
4508 //          #else
4509 //          if (c < ASIZE) bc[c] = m - i;
4510 //          #endif
4511 //       }
4512 //
4513 //       /* Searching */
4514 //       j = 0;
4515 //       while (j <= n - m) {
4516 //          c = y[j+m-1];
4517 //          if (x[m-1] == c) {
4518 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
4519 //            if (i < 0) return j; }
4520 //          // c < 256 for Latin1 string, so, no need for branch
4521 //          #ifdef SOURCE_STRING_IS_LATIN1
4522 //          // LL case: (c< 256) always true. Remove branch
4523 //          j += bc[y[j+m-1]];
4524 //          #endif
4525 //          #ifndef PATTERN_STRING_IS_UTF
4526 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
4527 //          if (c < ASIZE)
4528 //            j += bc[y[j+m-1]];
4529 //          else
4530 //            j += 1;
4531 //          #endif
4532 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
4533 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
4534 //          if (c < ASIZE)
4535 //            j += bc[y[j+m-1]];
4536 //          else
4537 //            j += m;
4538 //          #endif
4539 //       }
//       return -1;
4540 //    }
4541 
4542   if (icnt1 == -1) {
4543     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4544         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4545     Register cnt1end = tmp2;
4546     Register str2end = cnt2;
4547     Register skipch = tmp2;
4548 
4549     // str1 length is >= 8, so we can read at least 1 register for the cases when
4550     // UTF->Latin1 conversion is not needed (8 chars for LL or 4 for UU) and half a
4551     // register for the UL case. We'll re-read the last character in the inner
4552     // pre-loop code so that there is a single outer pre-loop load
4553     const int firstStep = isL ? 7 : 3;
4554 
4555     const int ASIZE = 256;
4556     const int STORED_BYTES = 32; // amount of bytes stored per instruction
4557     sub(sp, sp, ASIZE);
4558     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4559     mov(ch1, sp);
4560     BIND(BM_INIT_LOOP);
4561       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4562       subs(tmp5, tmp5, 1);
4563       br(GT, BM_INIT_LOOP);
4564 
4565       sub(cnt1tmp, cnt1, 1);
4566       mov(tmp5, str2);
4567       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4568       sub(ch2, cnt1, 1);
4569       mov(tmp3, str1);
4570     BIND(BCLOOP);
4571       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4572       if (!str1_isL) {
4573         subs(zr, ch1, ASIZE);
4574         br(HS, BCSKIP);
4575       }
4576       strb(ch2, Address(sp, ch1));
4577     BIND(BCSKIP);
4578       subs(ch2, ch2, 1);
4579       br(GT, BCLOOP);
4580 
4581       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4582       if (str1_isL == str2_isL) {
4583         // load last 8 bytes (8LL/4UU symbols)
4584         ldr(tmp6, Address(tmp6, -wordSize));
4585       } else {
4586         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
4587         // convert Latin1 to UTF. We'll have to wait until load completed, but
4588         // it's still faster than per-character loads+checks
4589         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4590         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4591         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4592         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4593         orr(ch2, ch1, ch2, LSL, 16);
4594         orr(tmp6, tmp6, tmp3, LSL, 48);
4595         orr(tmp6, tmp6, ch2, LSL, 16);
4596       }
4597     BIND(BMLOOPSTR2);
4598       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4599       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4600       if (str1_isL == str2_isL) {
4601         // re-init tmp3. It's for free because it's executed in parallel with
4602         // load above. Alternative is to initialize it before loop, but it'll
4603         // affect performance on in-order systems with 2 or more ld/st pipelines
4604         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4605       }
4606       if (!isL) { // UU/UL case
4607         lsl(ch2, cnt1tmp, 1); // offset in bytes
4608       }
4609       cmp(tmp3, skipch);
4610       br(NE, BMSKIP);
4611       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4612       mov(ch1, tmp6);
4613       if (isL) {
4614         b(BMLOOPSTR1_AFTER_LOAD);
4615       } else {
4616         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4617         b(BMLOOPSTR1_CMP);
4618       }
4619     BIND(BMLOOPSTR1);
4620       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4621       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4622     BIND(BMLOOPSTR1_AFTER_LOAD);
4623       subs(cnt1tmp, cnt1tmp, 1);
4624       br(LT, BMLOOPSTR1_LASTCMP);
4625     BIND(BMLOOPSTR1_CMP);
4626       cmp(ch1, ch2);
4627       br(EQ, BMLOOPSTR1);
4628     BIND(BMSKIP);
4629       if (!isL) {
4630         // if we've met a UTF symbol while searching the Latin1 pattern, then we can
4631         // skip cnt1 symbols
4632         if (str1_isL != str2_isL) {
4633           mov(result_tmp, cnt1);
4634         } else {
4635           mov(result_tmp, 1);
4636         }
4637         subs(zr, skipch, ASIZE);
4638         br(HS, BMADV);
4639       }
4640       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4641     BIND(BMADV);
4642       sub(cnt1tmp, cnt1, 1);
4643       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4644       cmp(str2, str2end);
4645       br(LE, BMLOOPSTR2);
4646       add(sp, sp, ASIZE);
4647       b(NOMATCH);
4648     BIND(BMLOOPSTR1_LASTCMP);
4649       cmp(ch1, ch2);
4650       br(NE, BMSKIP);
4651     BIND(BMMATCH);
4652       sub(result, str2, tmp5);
4653       if (!str2_isL) lsr(result, result, 1);
4654       add(sp, sp, ASIZE);
4655       b(DONE);
4656 
4657     BIND(LINEARSTUB);
4658     cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
4659     br(LT, LINEAR_MEDIUM);
4660     mov(result, zr);
4661     RuntimeAddress stub = NULL;
4662     if (isL) {
4663       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4664       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4665     } else if (str1_isL) {
4666       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
4667        assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4668     } else {
4669       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4670       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4671     }
4672     trampoline_call(stub);
4673     b(DONE);
4674   }
4675 
4676   BIND(LINEARSEARCH);
4677   {
4678     Label DO1, DO2, DO3;
4679 
4680     Register str2tmp = tmp2;
4681     Register first = tmp3;
4682 
4683     if (icnt1 == -1)
4684     {
4685         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4686 
4687         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
4688         br(LT, DOSHORT);
4689       BIND(LINEAR_MEDIUM);
4690         (this->*str1_load_1chr)(first, Address(str1));
4691         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4692         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4693         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4694         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4695 
4696       BIND(FIRST_LOOP);
4697         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4698         cmp(first, ch2);
4699         br(EQ, STR1_LOOP);
4700       BIND(STR2_NEXT);
4701         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4702         br(LE, FIRST_LOOP);
4703         b(NOMATCH);
4704 
4705       BIND(STR1_LOOP);
4706         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4707         add(cnt2tmp, cnt2_neg, str2_chr_size);
4708         br(GE, MATCH);
4709 
4710       BIND(STR1_NEXT);
4711         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4712         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4713         cmp(ch1, ch2);
4714         br(NE, STR2_NEXT);
4715         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4716         add(cnt2tmp, cnt2tmp, str2_chr_size);
4717         br(LT, STR1_NEXT);
4718         b(MATCH);
4719 
4720       BIND(DOSHORT);
4721       if (str1_isL == str2_isL) {
4722         cmp(cnt1, (u1)2);
4723         br(LT, DO1);
4724         br(GT, DO3);
4725       }
4726     }
4727 
4728     if (icnt1 == 4) {
4729       Label CH1_LOOP;
4730 
4731         (this->*load_4chr)(ch1, str1);
4732         sub(result_tmp, cnt2, 4);
4733         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4734         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4735 
4736       BIND(CH1_LOOP);
4737         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4738         cmp(ch1, ch2);
4739         br(EQ, MATCH);
4740         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4741         br(LE, CH1_LOOP);
4742         b(NOMATCH);
4743       }
4744 
4745     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4746       Label CH1_LOOP;
4747 
4748       BIND(DO2);
4749         (this->*load_2chr)(ch1, str1);
4750         if (icnt1 == 2) {
4751           sub(result_tmp, cnt2, 2);
4752         }
4753         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4754         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4755       BIND(CH1_LOOP);
4756         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4757         cmp(ch1, ch2);
4758         br(EQ, MATCH);
4759         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4760         br(LE, CH1_LOOP);
4761         b(NOMATCH);
4762     }
4763 
4764     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4765       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4766 
4767       BIND(DO3);
4768         (this->*load_2chr)(first, str1);
4769         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4770         if (icnt1 == 3) {
4771           sub(result_tmp, cnt2, 3);
4772         }
4773         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4774         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4775       BIND(FIRST_LOOP);
4776         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4777         cmpw(first, ch2);
4778         br(EQ, STR1_LOOP);
4779       BIND(STR2_NEXT);
4780         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4781         br(LE, FIRST_LOOP);
4782         b(NOMATCH);
4783 
4784       BIND(STR1_LOOP);
4785         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4786         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4787         cmp(ch1, ch2);
4788         br(NE, STR2_NEXT);
4789         b(MATCH);
4790     }
4791 
4792     if (icnt1 == -1 || icnt1 == 1) {
4793       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4794 
4795       BIND(DO1);
4796         (this->*str1_load_1chr)(ch1, str1);
4797         cmp(cnt2, (u1)8);
4798         br(LT, DO1_SHORT);
4799 
4800         sub(result_tmp, cnt2, 8/str2_chr_size);
4801         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4802         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4803         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4804 
4805         if (str2_isL) {
4806           orr(ch1, ch1, ch1, LSL, 8);
4807         }
4808         orr(ch1, ch1, ch1, LSL, 16);
4809         orr(ch1, ch1, ch1, LSL, 32);
4810       BIND(CH1_LOOP);
4811         ldr(ch2, Address(str2, cnt2_neg));
4812         eor(ch2, ch1, ch2);
4813         sub(tmp1, ch2, tmp3);
4814         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4815         bics(tmp1, tmp1, tmp2);
4816         br(NE, HAS_ZERO);
4817         adds(cnt2_neg, cnt2_neg, 8);
4818         br(LT, CH1_LOOP);
4819 
4820         cmp(cnt2_neg, (u1)8);
4821         mov(cnt2_neg, 0);
4822         br(LT, CH1_LOOP);
4823         b(NOMATCH);
4824 
4825       BIND(HAS_ZERO);
4826         rev(tmp1, tmp1);
4827         clz(tmp1, tmp1);
4828         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4829         b(MATCH);
4830 
4831       BIND(DO1_SHORT);
4832         mov(result_tmp, cnt2);
4833         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4834         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4835       BIND(DO1_LOOP);
4836         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4837         cmpw(ch1, ch2);
4838         br(EQ, MATCH);
4839         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4840         br(LT, DO1_LOOP);
4841     }
4842   }
4843   BIND(NOMATCH);
4844     mov(result, -1);
4845     b(DONE);
4846   BIND(MATCH);
4847     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4848   BIND(DONE);
4849 }
4850 
4851 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4852 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4853 
4854 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4855                                          Register ch, Register result,
4856                                          Register tmp1, Register tmp2, Register tmp3)
4857 {
4858   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4859   Register cnt1_neg = cnt1;
4860   Register ch1 = rscratch1;
4861   Register result_tmp = rscratch2;
4862 
4863   cmp(cnt1, (u1)4);
4864   br(LT, DO1_SHORT);
4865 
4866   orr(ch, ch, ch, LSL, 16);
4867   orr(ch, ch, ch, LSL, 32);
4868 
4869   sub(cnt1, cnt1, 4);
4870   mov(result_tmp, cnt1);
4871   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4872   sub(cnt1_neg, zr, cnt1, LSL, 1);
4873 
4874   mov(tmp3, 0x0001000100010001);
4875 
4876   BIND(CH1_LOOP);
4877     ldr(ch1, Address(str1, cnt1_neg));
4878     eor(ch1, ch, ch1);
4879     sub(tmp1, ch1, tmp3);
4880     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4881     bics(tmp1, tmp1, tmp2);
4882     br(NE, HAS_ZERO);
4883     adds(cnt1_neg, cnt1_neg, 8);
4884     br(LT, CH1_LOOP);
4885 
4886     cmp(cnt1_neg, (u1)8);
4887     mov(cnt1_neg, 0);
4888     br(LT, CH1_LOOP);
4889     b(NOMATCH);
4890 
4891   BIND(HAS_ZERO);
4892     rev(tmp1, tmp1);
4893     clz(tmp1, tmp1);
4894     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4895     b(MATCH);
4896 
4897   BIND(DO1_SHORT);
4898     mov(result_tmp, cnt1);
4899     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4900     sub(cnt1_neg, zr, cnt1, LSL, 1);
4901   BIND(DO1_LOOP);
4902     ldrh(ch1, Address(str1, cnt1_neg));
4903     cmpw(ch, ch1);
4904     br(EQ, MATCH);
4905     adds(cnt1_neg, cnt1_neg, 2);
4906     br(LT, DO1_LOOP);
4907   BIND(NOMATCH);
4908     mov(result, -1);
4909     b(DONE);
4910   BIND(MATCH);
4911     add(result, result_tmp, cnt1_neg, ASR, 1);
4912   BIND(DONE);
4913 }
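// The 8-byte probes in the loops above use the classic SWAR "zero halfword"
// test (illustrative C for the UTF-16 case; pattern8 is the search char
// replicated into every 16-bit lane):
//
//   uint64_t x = chunk ^ pattern8;           // a lane is 0 iff it matched
//   uint64_t t = (x - 0x0001000100010001ULL) & ~(x | 0x7fff7fff7fff7fffULL);
//   // t != 0 iff some 16-bit lane of x is zero; rev + clz then locate the
//   // first (lowest-addressed) matching lane.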
4914 
4915 // Compare strings.
4916 void MacroAssembler::string_compare(Register str1, Register str2,
4917     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4918     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4919   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4920       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4921       SHORT_LOOP_START, TAIL_CHECK;
4922 
4923   bool isLL = ae == StrIntrinsicNode::LL;
4924   bool isLU = ae == StrIntrinsicNode::LU;
4925   bool isUL = ae == StrIntrinsicNode::UL;
4926 
4927   // The stub threshold for LL strings is: 72 (64 + 8) chars
4928   // UU: 36 chars, or 72 bytes (valid for the 64-byte large loop with prefetch)
4929   // LU/UL: 24 chars, or 48 bytes (valid for the 16-character loop at least)
4930   const u1 stub_threshold = isLL ? 72 : ((isLU || isUL) ? 24 : 36);
4931 
4932   bool str1_isL = isLL || isLU;
4933   bool str2_isL = isLL || isUL;
4934 
4935   int str1_chr_shift = str1_isL ? 0 : 1;
4936   int str2_chr_shift = str2_isL ? 0 : 1;
4937   int str1_chr_size = str1_isL ? 1 : 2;
4938   int str2_chr_size = str2_isL ? 1 : 2;
4939   int minCharsInWord = isLL ? wordSize : wordSize/2;
4940 
4941   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4942   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4943                                       (chr_insn)&MacroAssembler::ldrh;
4944   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4945                                       (chr_insn)&MacroAssembler::ldrh;
4946   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4947                             (uxt_insn)&MacroAssembler::uxthw;
4948 
4949   BLOCK_COMMENT("string_compare {");
4950 
4951   // Bizarrely, the counts are passed in bytes, regardless of whether they
4952   // are L or U strings, however the result is always in characters.
4953   if (!str1_isL) asrw(cnt1, cnt1, 1);
4954   if (!str2_isL) asrw(cnt2, cnt2, 1);
4955 
4956   // Compute the minimum of the string lengths and save the difference.
4957   subsw(result, cnt1, cnt2);
4958   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4959 
4960   // A very short string
4961   cmpw(cnt2, minCharsInWord);
4962   br(Assembler::LE, SHORT_STRING);
4963 
4964   // Compare longwords
4965   // load first parts of strings and finish initialization while loading
4966   {
4967     if (str1_isL == str2_isL) { // LL or UU
4968       ldr(tmp1, Address(str1));
4969       cmp(str1, str2);
4970       br(Assembler::EQ, DONE);
4971       ldr(tmp2, Address(str2));
4972       cmp(cnt2, stub_threshold);
4973       br(GE, STUB);
4974       subsw(cnt2, cnt2, minCharsInWord);
4975       br(EQ, TAIL_CHECK);
4976       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4977       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4978       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4979     } else if (isLU) {
4980       ldrs(vtmp, Address(str1));
4981       ldr(tmp2, Address(str2));
4982       cmp(cnt2, stub_threshold);
4983       br(GE, STUB);
4984       subw(cnt2, cnt2, 4);
4985       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4986       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4987       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4988       zip1(vtmp, T8B, vtmp, vtmpZ);
4989       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4990       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4991       add(cnt1, cnt1, 4);
4992       fmovd(tmp1, vtmp);
4993     } else { // UL case
4994       ldr(tmp1, Address(str1));
4995       ldrs(vtmp, Address(str2));
4996       cmp(cnt2, stub_threshold);
4997       br(GE, STUB);
4998       subw(cnt2, cnt2, 4);
4999       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
5000       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
5001       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
5002       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
5003       zip1(vtmp, T8B, vtmp, vtmpZ);
5004       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
5005       add(cnt1, cnt1, 8);
5006       fmovd(tmp2, vtmp);
5007     }
5008     adds(cnt2, cnt2, isUL ? 4 : 8);
5009     br(GE, TAIL);
5010     eor(rscratch2, tmp1, tmp2);
5011     cbnz(rscratch2, DIFFERENCE);
5012     // main loop
5013     bind(NEXT_WORD);
5014     if (str1_isL == str2_isL) {
5015       ldr(tmp1, Address(str1, cnt2));
5016       ldr(tmp2, Address(str2, cnt2));
5017       adds(cnt2, cnt2, 8);
5018     } else if (isLU) {
5019       ldrs(vtmp, Address(str1, cnt1));
5020       ldr(tmp2, Address(str2, cnt2));
5021       add(cnt1, cnt1, 4);
5022       zip1(vtmp, T8B, vtmp, vtmpZ);
5023       fmovd(tmp1, vtmp);
5024       adds(cnt2, cnt2, 8);
5025     } else { // UL
5026       ldrs(vtmp, Address(str2, cnt2));
5027       ldr(tmp1, Address(str1, cnt1));
5028       zip1(vtmp, T8B, vtmp, vtmpZ);
5029       add(cnt1, cnt1, 8);
5030       fmovd(tmp2, vtmp);
5031       adds(cnt2, cnt2, 4);
5032     }
5033     br(GE, TAIL);
5034 
5035     eor(rscratch2, tmp1, tmp2);
5036     cbz(rscratch2, NEXT_WORD);
5037     b(DIFFERENCE);
5038     bind(TAIL);
5039     eor(rscratch2, tmp1, tmp2);
5040     cbnz(rscratch2, DIFFERENCE);
5041     // Last longword.  In the case where length == 4 we compare the
5042     // same longword twice, but that's still faster than another
5043     // conditional branch.
5044     if (str1_isL == str2_isL) {
5045       ldr(tmp1, Address(str1));
5046       ldr(tmp2, Address(str2));
5047     } else if (isLU) {
5048       ldrs(vtmp, Address(str1));
5049       ldr(tmp2, Address(str2));
5050       zip1(vtmp, T8B, vtmp, vtmpZ);
5051       fmovd(tmp1, vtmp);
5052     } else { // UL
5053       ldrs(vtmp, Address(str2));
5054       ldr(tmp1, Address(str1));
5055       zip1(vtmp, T8B, vtmp, vtmpZ);
5056       fmovd(tmp2, vtmp);
5057     }
5058     bind(TAIL_CHECK);
5059     eor(rscratch2, tmp1, tmp2);
5060     cbz(rscratch2, DONE);
5061 
5062     // Find the first different characters in the longwords and
5063     // compute their difference.
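    // Roughly, for the LL case (illustrative C; byte_reverse stands in for the
    // rev instruction below):
    //   uint64_t x   = tmp1 ^ tmp2;                      // some byte differs
    //   int      bit = __builtin_clzll(byte_reverse(x)) & ~7;
    //   result = (int)((tmp1 >> bit) & 0xff) - (int)((tmp2 >> bit) & 0xff);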
5064     bind(DIFFERENCE);
5065     rev(rscratch2, rscratch2);
5066     clz(rscratch2, rscratch2);
5067     andr(rscratch2, rscratch2, isLL ? -8 : -16);
5068     lsrv(tmp1, tmp1, rscratch2);
5069     (this->*ext_chr)(tmp1, tmp1);
5070     lsrv(tmp2, tmp2, rscratch2);
5071     (this->*ext_chr)(tmp2, tmp2);
5072     subw(result, tmp1, tmp2);
5073     b(DONE);
5074   }
5075 
5076   bind(STUB);
5077     RuntimeAddress stub = NULL;
5078     switch(ae) {
5079       case StrIntrinsicNode::LL:
5080         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
5081         break;
5082       case StrIntrinsicNode::UU:
5083         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
5084         break;
5085       case StrIntrinsicNode::LU:
5086         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
5087         break;
5088       case StrIntrinsicNode::UL:
5089         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
5090         break;
5091       default:
5092         ShouldNotReachHere();
5093      }
5094     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
5095     trampoline_call(stub);
5096     b(DONE);
5097 
5098   bind(SHORT_STRING);
5099   // Is the minimum length zero?
5100   cbz(cnt2, DONE);
5101   // arrange the code so that most branches are taken while loading, and the next
5102   // characters are loaded while the previous ones are being compared
5103   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5104   subs(cnt2, cnt2, 1);
5105   br(EQ, SHORT_LAST_INIT);
5106   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5107   b(SHORT_LOOP_START);
5108   bind(SHORT_LOOP);
5109   subs(cnt2, cnt2, 1);
5110   br(EQ, SHORT_LAST);
5111   bind(SHORT_LOOP_START);
5112   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
5113   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
5114   cmp(tmp1, cnt1);
5115   br(NE, SHORT_LOOP_TAIL);
5116   subs(cnt2, cnt2, 1);
5117   br(EQ, SHORT_LAST2);
5118   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5119   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5120   cmp(tmp2, rscratch1);
5121   br(EQ, SHORT_LOOP);
5122   sub(result, tmp2, rscratch1);
5123   b(DONE);
5124   bind(SHORT_LOOP_TAIL);
5125   sub(result, tmp1, cnt1);
5126   b(DONE);
5127   bind(SHORT_LAST2);
5128   cmp(tmp2, rscratch1);
5129   br(EQ, DONE);
5130   sub(result, tmp2, rscratch1);
5131 
5132   b(DONE);
5133   bind(SHORT_LAST_INIT);
5134   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5135   bind(SHORT_LAST);
5136   cmp(tmp1, cnt1);
5137   br(EQ, DONE);
5138   sub(result, tmp1, cnt1);
5139 
5140   bind(DONE);
5141 
5142   BLOCK_COMMENT("} string_compare");
5143 }
5144 #endif // COMPILER2
5145 
5146 // This method checks whether the provided byte array contains a byte with the highest bit set.
5147 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
5148     // The simple and most common case, a small aligned array that is not at the
5149     // end of a memory page, is handled here. All other cases are handled in the stub.
5150     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
5151     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5152     assert_different_registers(ary1, len, result);
5153 
5154     cmpw(len, 0);
5155     br(LE, SET_RESULT);
5156     cmpw(len, 4 * wordSize);
5157     br(GE, STUB_LONG); // size > 32 then go to stub
5158 
5159     int shift = 64 - exact_log2(os::vm_page_size());
5160     lsl(rscratch1, ary1, shift);
5161     mov(rscratch2, (size_t)(4 * wordSize) << shift);
5162     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
5163     br(CS, STUB); // at the end of page then go to stub
5164     subs(len, len, wordSize);
5165     br(LT, END);
5166 
5167   BIND(LOOP);
5168     ldr(rscratch1, Address(post(ary1, wordSize)));
5169     tst(rscratch1, UPPER_BIT_MASK);
5170     br(NE, SET_RESULT);
5171     subs(len, len, wordSize);
5172     br(GE, LOOP);
5173     cmpw(len, -wordSize);
5174     br(EQ, SET_RESULT);
5175 
5176   BIND(END);
5177     ldr(result, Address(ary1));
5178     sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5179     lslv(result, result, len);
5180     tst(result, UPPER_BIT_MASK);
5181     b(SET_RESULT);
5182 
5183   BIND(STUB);
5184     RuntimeAddress has_neg =  RuntimeAddress(StubRoutines::aarch64::has_negatives());
5185     assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5186     trampoline_call(has_neg);
5187     b(DONE);
5188 
5189   BIND(STUB_LONG);
5190     RuntimeAddress has_neg_long =  RuntimeAddress(
5191             StubRoutines::aarch64::has_negatives_long());
5192     assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
5193     trampoline_call(has_neg_long);
5194     b(DONE);
5195 
5196   BIND(SET_RESULT);
5197     cset(result, NE); // set true or false
5198 
5199   BIND(DONE);
5200 }
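// Scalar semantics of has_negatives (illustrative C; the code above tests a
// whole 8-byte word at a time against UPPER_BIT_MASK):
//
//   bool has_negatives(const int8_t *ary1, int len) {
//     for (int i = 0; i < len; i++)
//       if (ary1[i] < 0) return true;      // i.e. the high bit is set
//     return false;
//   }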
5201 
5202 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5203                                    Register tmp4, Register tmp5, Register result,
5204                                    Register cnt1, int elem_size) {
5205   Label DONE, SAME;
5206   Register tmp1 = rscratch1;
5207   Register tmp2 = rscratch2;
5208   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5209   int elem_per_word = wordSize/elem_size;
5210   int log_elem_size = exact_log2(elem_size);
5211   int length_offset = arrayOopDesc::length_offset_in_bytes();
5212   int base_offset
5213     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5214   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5215 
5216   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5217   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5218 
5219 #ifndef PRODUCT
5220   {
5221     const char kind = (elem_size == 2) ? 'U' : 'L';
5222     char comment[64];
5223     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5224     BLOCK_COMMENT(comment);
5225   }
5226 #endif
5227 
5228   // if (a1 == a2)
5229   //     return true;
5230   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5231   br(EQ, SAME);
5232 
5233   if (UseSimpleArrayEquals) {
5234     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5235     // if (a1 == null || a2 == null)
5236     //     return false;
5237     // a1 & a2 == 0 means (some-pointer is null) or
5238     // (very-rare-or-even-probably-impossible-pointer-values)
5239     // so, we can save one branch in most cases
5240     tst(a1, a2);
5241     mov(result, false);
5242     br(EQ, A_MIGHT_BE_NULL);
5243     // if (a1.length != a2.length)
5244     //      return false;
5245     bind(A_IS_NOT_NULL);
5246     ldrw(cnt1, Address(a1, length_offset));
5247     ldrw(cnt2, Address(a2, length_offset));
5248     eorw(tmp5, cnt1, cnt2);
5249     cbnzw(tmp5, DONE);
5250     lea(a1, Address(a1, base_offset));
5251     lea(a2, Address(a2, base_offset));
5252     // Check for short strings, i.e. smaller than wordSize.
5253     subs(cnt1, cnt1, elem_per_word);
5254     br(Assembler::LT, SHORT);
5255     // Main 8 byte comparison loop.
5256     bind(NEXT_WORD); {
5257       ldr(tmp1, Address(post(a1, wordSize)));
5258       ldr(tmp2, Address(post(a2, wordSize)));
5259       subs(cnt1, cnt1, elem_per_word);
5260       eor(tmp5, tmp1, tmp2);
5261       cbnz(tmp5, DONE);
5262     } br(GT, NEXT_WORD);
5263     // Last longword.  In the case where length == 4 we compare the
5264     // same longword twice, but that's still faster than another
5265     // conditional branch.
5266     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5267     // length == 4.
5268     if (log_elem_size > 0)
5269       lsl(cnt1, cnt1, log_elem_size);
5270     ldr(tmp3, Address(a1, cnt1));
5271     ldr(tmp4, Address(a2, cnt1));
5272     eor(tmp5, tmp3, tmp4);
5273     cbnz(tmp5, DONE);
5274     b(SAME);
5275     bind(A_MIGHT_BE_NULL);
5276     // in case both a1 and a2 are not-null, proceed with loads
5277     cbz(a1, DONE);
5278     cbz(a2, DONE);
5279     b(A_IS_NOT_NULL);
5280     bind(SHORT);
5281 
5282     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5283     {
5284       ldrw(tmp1, Address(post(a1, 4)));
5285       ldrw(tmp2, Address(post(a2, 4)));
5286       eorw(tmp5, tmp1, tmp2);
5287       cbnzw(tmp5, DONE);
5288     }
5289     bind(TAIL03);
5290     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5291     {
5292       ldrh(tmp3, Address(post(a1, 2)));
5293       ldrh(tmp4, Address(post(a2, 2)));
5294       eorw(tmp5, tmp3, tmp4);
5295       cbnzw(tmp5, DONE);
5296     }
5297     bind(TAIL01);
5298     if (elem_size == 1) { // Only needed when comparing byte arrays.
5299       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5300       {
5301         ldrb(tmp1, a1);
5302         ldrb(tmp2, a2);
5303         eorw(tmp5, tmp1, tmp2);
5304         cbnzw(tmp5, DONE);
5305       }
5306     }
5307   } else {
5308     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
5309         CSET_EQ, LAST_CHECK;
5310     mov(result, false);
5311     cbz(a1, DONE);
5312     ldrw(cnt1, Address(a1, length_offset));
5313     cbz(a2, DONE);
5314     ldrw(cnt2, Address(a2, length_offset));
5315     // on most CPUs a2 is still "locked" (surprisingly) in ldrw, and it's
5316     // faster to perform another branch before comparing a1 and a2
5317     cmp(cnt1, (u1)elem_per_word);
5318     br(LE, SHORT); // short or same
5319     ldr(tmp3, Address(pre(a1, base_offset)));
5320     subs(zr, cnt1, stubBytesThreshold);
5321     br(GE, STUB);
5322     ldr(tmp4, Address(pre(a2, base_offset)));
5323     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5324     cmp(cnt2, cnt1);
5325     br(NE, DONE);
5326 
5327     // Main 16 byte comparison loop with 2 exits
5328     bind(NEXT_DWORD); {
5329       ldr(tmp1, Address(pre(a1, wordSize)));
5330       ldr(tmp2, Address(pre(a2, wordSize)));
5331       subs(cnt1, cnt1, 2 * elem_per_word);
5332       br(LE, TAIL);
5333       eor(tmp4, tmp3, tmp4);
5334       cbnz(tmp4, DONE);
5335       ldr(tmp3, Address(pre(a1, wordSize)));
5336       ldr(tmp4, Address(pre(a2, wordSize)));
5337       cmp(cnt1, (u1)elem_per_word);
5338       br(LE, TAIL2);
5339       cmp(tmp1, tmp2);
5340     } br(EQ, NEXT_DWORD);
5341     b(DONE);
5342 
5343     bind(TAIL);
5344     eor(tmp4, tmp3, tmp4);
5345     eor(tmp2, tmp1, tmp2);
5346     lslv(tmp2, tmp2, tmp5);
5347     orr(tmp5, tmp4, tmp2);
5348     cmp(tmp5, zr);
5349     b(CSET_EQ);
5350 
5351     bind(TAIL2);
5352     eor(tmp2, tmp1, tmp2);
5353     cbnz(tmp2, DONE);
5354     b(LAST_CHECK);
5355 
5356     bind(STUB);
5357     ldr(tmp4, Address(pre(a2, base_offset)));
5358     cmp(cnt2, cnt1);
5359     br(NE, DONE);
5360     if (elem_size == 2) { // convert to byte counter
5361       lsl(cnt1, cnt1, 1);
5362     }
5363     eor(tmp5, tmp3, tmp4);
5364     cbnz(tmp5, DONE);
5365     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5366     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5367     trampoline_call(stub);
5368     b(DONE);
5369 
5370     bind(EARLY_OUT);
5371     // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
5372     // so, if a2 == null => return false(0), else return true, so we can return a2
5373     mov(result, a2);
5374     b(DONE);
5375     bind(SHORT);
5376     cmp(cnt2, cnt1);
5377     br(NE, DONE);
5378     cbz(cnt1, SAME);
5379     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5380     ldr(tmp3, Address(a1, base_offset));
5381     ldr(tmp4, Address(a2, base_offset));
5382     bind(LAST_CHECK);
5383     eor(tmp4, tmp3, tmp4);
5384     lslv(tmp5, tmp4, tmp5);
5385     cmp(tmp5, zr);
5386     bind(CSET_EQ);
5387     cset(result, EQ);
5388     b(DONE);
5389   }
5390 
5391   bind(SAME);
5392   mov(result, true);
5393   // That's it.
5394   bind(DONE);
5395 
5396   BLOCK_COMMENT("} array_equals");
5397 }
5398 
5399 // Compare Strings
5400 
5401 // For Strings we're passed the address of the first characters in a1
5402 // and a2 and the length in cnt1.
5403 // elem_size is the element size in bytes: either 1 or 2.
5404 // There are two implementations.  For arrays >= 8 bytes, all
5405 // comparisons (including the final one, which may overlap) are
5406 // performed 8 bytes at a time.  For strings < 8 bytes, we compare a
5407 // halfword, then a short, and then a byte.
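// Illustrative C for the >= 8 byte case (load8 is a stand-in for an unaligned
// 8-byte load): the final load deliberately overlaps the previous one instead
// of falling back to a byte-at-a-time tail loop.
//
//   // assuming cnt >= 8
//   size_t i;
//   for (i = 0; cnt - i > 8; i += 8)
//     if (load8(a1 + i) != load8(a2 + i)) return false;
//   return load8(a1 + cnt - 8) == load8(a2 + cnt - 8);  // possibly overlapping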
5408 
5409 void MacroAssembler::string_equals(Register a1, Register a2,
5410                                    Register result, Register cnt1, int elem_size)
5411 {
5412   Label SAME, DONE, SHORT, NEXT_WORD;
5413   Register tmp1 = rscratch1;
5414   Register tmp2 = rscratch2;
5415   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5416 
5417   assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
5418   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5419 
5420 #ifndef PRODUCT
5421   {
5422     const char kind = (elem_size == 2) ? 'U' : 'L';
5423     char comment[64];
5424     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5425     BLOCK_COMMENT(comment);
5426   }
5427 #endif
5428 
5429   mov(result, false);
5430 
5431   // Check for short strings, i.e. smaller than wordSize.
5432   subs(cnt1, cnt1, wordSize);
5433   br(Assembler::LT, SHORT);
5434   // Main 8 byte comparison loop.
5435   bind(NEXT_WORD); {
5436     ldr(tmp1, Address(post(a1, wordSize)));
5437     ldr(tmp2, Address(post(a2, wordSize)));
5438     subs(cnt1, cnt1, wordSize);
5439     eor(tmp1, tmp1, tmp2);
5440     cbnz(tmp1, DONE);
5441   } br(GT, NEXT_WORD);
5442   // Last longword.  In the case where length == 4 we compare the
5443   // same longword twice, but that's still faster than another
5444   // conditional branch.
5445   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5446   // length == 4.
5447   ldr(tmp1, Address(a1, cnt1));
5448   ldr(tmp2, Address(a2, cnt1));
5449   eor(tmp2, tmp1, tmp2);
5450   cbnz(tmp2, DONE);
5451   b(SAME);
5452 
5453   bind(SHORT);
5454   Label TAIL03, TAIL01;
5455 
5456   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5457   {
5458     ldrw(tmp1, Address(post(a1, 4)));
5459     ldrw(tmp2, Address(post(a2, 4)));
5460     eorw(tmp1, tmp1, tmp2);
5461     cbnzw(tmp1, DONE);
5462   }
5463   bind(TAIL03);
5464   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5465   {
5466     ldrh(tmp1, Address(post(a1, 2)));
5467     ldrh(tmp2, Address(post(a2, 2)));
5468     eorw(tmp1, tmp1, tmp2);
5469     cbnzw(tmp1, DONE);
5470   }
5471   bind(TAIL01);
5472   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5473     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5474     {
5475       ldrb(tmp1, a1);
5476       ldrb(tmp2, a2);
5477       eorw(tmp1, tmp1, tmp2);
5478       cbnzw(tmp1, DONE);
5479     }
5480   }
5481   // Arrays are equal.
5482   bind(SAME);
5483   mov(result, true);
5484 
5485   // That's it.
5486   bind(DONE);
5487   BLOCK_COMMENT("} string_equals");
5488 }
5489 
5490 
5491 // The size of the blocks erased by the zero_blocks stub.  We must
5492 // handle anything smaller than this ourselves in zero_words().
5493 const int MacroAssembler::zero_words_block_size = 8;
5494 
5495 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5496 // possible, handling small word counts locally and delegating
5497 // anything larger to the zero_blocks stub.  It is expanded many times
5498 // in compiled code, so it is important to keep it short.
5499 
5500 // ptr:   Address of a buffer to be zeroed.
5501 // cnt:   Count in HeapWords.
5502 //
5503 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5504 void MacroAssembler::zero_words(Register ptr, Register cnt)
5505 {
5506   assert(is_power_of_2(zero_words_block_size), "adjust this");
5507   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5508 
5509   BLOCK_COMMENT("zero_words {");
5510   cmp(cnt, (u1)zero_words_block_size);
5511   Label around;
5512   br(LO, around);
5513   {
5514     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5515     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5516     if (StubRoutines::aarch64::complete()) {
5517       trampoline_call(zero_blocks);
5518     } else {
5519       bl(zero_blocks);
5520     }
5521   }
5522   bind(around);
5523   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5524     Label l;
5525     tbz(cnt, exact_log2(i), l);
5526     for (int j = 0; j < i; j += 2) {
5527       stp(zr, zr, post(ptr, 16));
5528     }
5529     bind(l);
5530   }
5531   {
5532     Label l;
5533     tbz(cnt, 0, l);
5534     str(zr, Address(ptr));
5535     bind(l);
5536   }
5537   BLOCK_COMMENT("} zero_words");
5538 }
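// The tail handled inline above, roughly (illustrative C; after the
// zero_blocks call fewer than zero_words_block_size words remain at p):
//
//   if (cnt & 4) { p[0] = p[1] = p[2] = p[3] = 0; p += 4; }
//   if (cnt & 2) { p[0] = p[1] = 0; p += 2; }
//   if (cnt & 1) { p[0] = 0; }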
5539 
5540 // base:         Address of a buffer to be zeroed, 8 bytes aligned.
5541 // cnt:          Immediate count in HeapWords.
5542 #define SmallArraySize (18 * BytesPerLong)
5543 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
5544 {
5545   BLOCK_COMMENT("zero_words {");
5546   int i = cnt & 1;  // store any odd word to start
5547   if (i) str(zr, Address(base));
5548 
5549   if (cnt <= SmallArraySize / BytesPerLong) {
5550     for (; i < (int)cnt; i += 2)
5551       stp(zr, zr, Address(base, i * wordSize));
5552   } else {
5553     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
5554     int remainder = cnt % (2 * unroll);
5555     for (; i < remainder; i += 2)
5556       stp(zr, zr, Address(base, i * wordSize));
5557 
5558     Label loop;
5559     Register cnt_reg = rscratch1;
5560     Register loop_base = rscratch2;
5561     cnt = cnt - remainder;
5562     mov(cnt_reg, cnt);
5563     // adjust base and prebias by -2 * wordSize so we can pre-increment
5564     add(loop_base, base, (remainder - 2) * wordSize);
5565     bind(loop);
5566     sub(cnt_reg, cnt_reg, 2 * unroll);
5567     for (i = 1; i < unroll; i++)
5568       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
5569     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
5570     cbnz(cnt_reg, loop);
5571   }
5572   BLOCK_COMMENT("} zero_words");
5573 }
5574 
5575 // Zero blocks of memory by using DC ZVA.
5576 //
5577 // Aligns the base address first sufficiently for DC ZVA, then uses
5578 // DC ZVA repeatedly for every full block.  cnt is the size to be
5579 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5580 // in cnt.
5581 //
5582 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5583 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
5584 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5585   Register tmp = rscratch1;
5586   Register tmp2 = rscratch2;
5587   int zva_length = VM_Version::zva_length();
5588   Label initial_table_end, loop_zva;
5589   Label fini;
5590 
5591   // Base must be 16-byte aligned. If not, just return and let the caller handle it.
5592   tst(base, 0x0f);
5593   br(Assembler::NE, fini);
5594   // Align base with ZVA length.
5595   neg(tmp, base);
5596   andr(tmp, tmp, zva_length - 1);
5597 
5598   // tmp: the number of bytes to be filled to align the base with ZVA length.
5599   add(base, base, tmp);
5600   sub(cnt, cnt, tmp, Assembler::ASR, 3);
5601   adr(tmp2, initial_table_end);
5602   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5603   br(tmp2);
5604 
5605   for (int i = -zva_length + 16; i < 0; i += 16)
5606     stp(zr, zr, Address(base, i));
5607   bind(initial_table_end);
5608 
5609   sub(cnt, cnt, zva_length >> 3);
5610   bind(loop_zva);
5611   dc(Assembler::ZVA, base);
5612   subs(cnt, cnt, zva_length >> 3);
5613   add(base, base, zva_length);
5614   br(Assembler::GE, loop_zva);
5615   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5616   bind(fini);
5617 }
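// Outline of zero_dcache_blocks in C (illustrative; zva stands for
// VM_Version::zva_length() in bytes, cnt is in words):
//
//   if ((uintptr_t)base & 0xf) return;            // caller handles this case
//   size_t fill = -(uintptr_t)base & (zva - 1);   // bytes up to ZVA alignment
//   /* zero those fill bytes with stp */  base += fill;  cnt -= fill / 8;
//   while (cnt >= zva / 8) { dc_zva(base); base += zva; cnt -= zva / 8; }
//   // whatever is left in cnt is zeroed by the caller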
5618 
5619 // base:   Address of a buffer to be filled, 8 bytes aligned.
5620 // cnt:    Count in 8-byte unit.
5621 // value:  Value to be filled with.
5622 // base will point to the end of the buffer after filling.
5623 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5624 {
5625 //  Algorithm:
5626 //
5627 //    scratch1 = cnt & 7;
5628 //    cnt -= scratch1;
5629 //    p += scratch1;
5630 //    switch (scratch1) {
5631 //      do {
5632 //        cnt -= 8;
5633 //          p[-8] = v;
5634 //        case 7:
5635 //          p[-7] = v;
5636 //        case 6:
5637 //          p[-6] = v;
5638 //          // ...
5639 //        case 1:
5640 //          p[-1] = v;
5641 //        case 0:
5642 //          p += 8;
5643 //      } while (cnt);
5644 //    }
5645 
5646   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5647 
5648   Label fini, skip, entry, loop;
5649   const int unroll = 8; // Number of stp instructions we'll unroll
5650 
5651   cbz(cnt, fini);
5652   tbz(base, 3, skip);
5653   str(value, Address(post(base, 8)));
5654   sub(cnt, cnt, 1);
5655   bind(skip);
5656 
5657   andr(rscratch1, cnt, (unroll-1) * 2);
5658   sub(cnt, cnt, rscratch1);
5659   add(base, base, rscratch1, Assembler::LSL, 3);
5660   adr(rscratch2, entry);
5661   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5662   br(rscratch2);
5663 
5664   bind(loop);
5665   add(base, base, unroll * 16);
5666   for (int i = -unroll; i < 0; i++)
5667     stp(value, value, Address(base, i * 16));
5668   bind(entry);
5669   subs(cnt, cnt, unroll * 2);
5670   br(Assembler::GE, loop);
5671 
5672   tbz(cnt, 0, fini);
5673   str(value, Address(post(base, 8)));
5674   bind(fini);
5675 }
5676 
5677 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5678 // java/lang/StringUTF16.compress.
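// Scalar semantics of the intrinsic (illustrative C; the code below vectorizes
// this with uzp1/uzp2): copy characters until one does not fit in a byte and
// return how many were written.
//
//   int encode_iso_array(const jchar *src, jbyte *dst, int len) {
//     int i;
//     for (i = 0; i < len; i++) {
//       jchar c = src[i];
//       if (c > 0xff) break;               // not encodable in ISO-8859-1
//       dst[i] = (jbyte)c;
//     }
//     return i;
//   }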
5679 void MacroAssembler::encode_iso_array(Register src, Register dst,
5680                       Register len, Register result,
5681                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5682                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5683 {
5684     Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
5685         NEXT_32_START, NEXT_32_PRFM_START;
5686     Register tmp1 = rscratch1, tmp2 = rscratch2;
5687 
5688       mov(result, len); // Save initial len
5689 
5690       cmp(len, (u1)8); // handle shortest strings first
5691       br(LT, LOOP_1);
5692       cmp(len, (u1)32);
5693       br(LT, NEXT_8);
5694       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
5695       // to convert chars to bytes
5696       if (SoftwarePrefetchHintDistance >= 0) {
5697         ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5698         subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5699         br(LE, NEXT_32_START);
5700         b(NEXT_32_PRFM_START);
5701         BIND(NEXT_32_PRFM);
5702           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5703         BIND(NEXT_32_PRFM_START);
5704           prfm(Address(src, SoftwarePrefetchHintDistance));
5705           orr(v4, T16B, Vtmp1, Vtmp2);
5706           orr(v5, T16B, Vtmp3, Vtmp4);
5707           uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
5708           uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
5709           uzp2(v5, T16B, v4, v5); // high bytes
5710           umov(tmp2, v5, D, 1);
5711           fmovd(tmp1, v5);
5712           orr(tmp1, tmp1, tmp2);
5713           cbnz(tmp1, LOOP_8);
5714           stpq(Vtmp1, Vtmp3, dst);
5715           sub(len, len, 32);
5716           add(dst, dst, 32);
5717           add(src, src, 64);
5718           subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5719           br(GE, NEXT_32_PRFM);
5720           cmp(len, (u1)32);
5721           br(LT, LOOP_8);
5722         BIND(NEXT_32);
5723           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5724         BIND(NEXT_32_START);
5725       } else {
5726         BIND(NEXT_32);
5727           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5728       }
5729       prfm(Address(src, SoftwarePrefetchHintDistance));
5730       uzp1(v4, T16B, Vtmp1, Vtmp2);
5731       uzp1(v5, T16B, Vtmp3, Vtmp4);
5732       orr(Vtmp1, T16B, Vtmp1, Vtmp2);
5733       orr(Vtmp3, T16B, Vtmp3, Vtmp4);
5734       uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
5735       umov(tmp2, Vtmp1, D, 1);
5736       fmovd(tmp1, Vtmp1);
5737       orr(tmp1, tmp1, tmp2);
5738       cbnz(tmp1, LOOP_8);
5739       stpq(v4, v5, dst);
5740       sub(len, len, 32);
5741       add(dst, dst, 32);
5742       add(src, src, 64);
5743       cmp(len, (u1)32);
5744       br(GE, NEXT_32);
5745       cbz(len, DONE);
5746 
5747     BIND(LOOP_8);
5748       cmp(len, (u1)8);
5749       br(LT, LOOP_1);
5750     BIND(NEXT_8);
5751       ld1(Vtmp1, T8H, src);
5752       uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
5753       uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
5754       fmovd(tmp1, Vtmp3);
5755       cbnz(tmp1, NEXT_1);
5756       strd(Vtmp2, dst);
5757 
5758       sub(len, len, 8);
5759       add(dst, dst, 8);
5760       add(src, src, 16);
5761       cmp(len, (u1)8);
5762       br(GE, NEXT_8);
5763 
5764     BIND(LOOP_1);
5765 
5766     cbz(len, DONE);
5767     BIND(NEXT_1);
5768       ldrh(tmp1, Address(post(src, 2)));
5769       tst(tmp1, 0xff00);
5770       br(NE, SET_RESULT);
5771       strb(tmp1, Address(post(dst, 1)));
5772       subs(len, len, 1);
5773       br(GT, NEXT_1);
5774 
5775     BIND(SET_RESULT);
5776       sub(result, result, len); // Return index where we stopped
5777                                 // Return len == 0 if we processed all
5778                                 // characters
5779     BIND(DONE);
5780 }
5781 
5782 
5783 // Inflate byte[] array to char[].
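// Scalar semantics (illustrative C): zero-extend each byte to a 16-bit char;
// the SIMD path below does this 8 bytes at a time by interleaving with a zero
// register (zip1).
//
//   for (int i = 0; i < len; i++)
//     dst[i] = (jchar)(src[i] & 0xff);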
5784 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5785                                         FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
5786                                         Register tmp4) {
5787   Label big, done, after_init, to_stub;
5788 
5789   assert_different_registers(src, dst, len, tmp4, rscratch1);
5790 
5791   fmovd(vtmp1, zr);
5792   lsrw(tmp4, len, 3);
5793   bind(after_init);
5794   cbnzw(tmp4, big);
5795   // Short string: less than 8 bytes.
5796   {
5797     Label loop, tiny;
5798 
5799     cmpw(len, 4);
5800     br(LT, tiny);
5801     // Use SIMD to do 4 bytes.
5802     ldrs(vtmp2, post(src, 4));
5803     zip1(vtmp3, T8B, vtmp2, vtmp1);
5804     subw(len, len, 4);
5805     strd(vtmp3, post(dst, 8));
5806 
5807     cbzw(len, done);
5808 
5809     // Do the remaining bytes by steam.
5810     bind(loop);
5811     ldrb(tmp4, post(src, 1));
5812     strh(tmp4, post(dst, 2));
5813     subw(len, len, 1);
5814 
5815     bind(tiny);
5816     cbnz(len, loop);
5817 
5818     b(done);
5819   }
5820 
5821   if (SoftwarePrefetchHintDistance >= 0) {
5822     bind(to_stub);
5823       RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5824       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5825       trampoline_call(stub);
5826       b(after_init);
5827   }
5828 
5829   // Unpack the bytes 8 at a time.
5830   bind(big);
5831   {
5832     Label loop, around, loop_last, loop_start;
5833 
5834     if (SoftwarePrefetchHintDistance >= 0) {
5835       const int large_loop_threshold = (64 + 16)/8;
5836       ldrd(vtmp2, post(src, 8));
5837       andw(len, len, 7);
5838       cmp(tmp4, (u1)large_loop_threshold);
5839       br(GE, to_stub);
5840       b(loop_start);
5841 
5842       bind(loop);
5843       ldrd(vtmp2, post(src, 8));
5844       bind(loop_start);
5845       subs(tmp4, tmp4, 1);
5846       br(EQ, loop_last);
5847       zip1(vtmp2, T16B, vtmp2, vtmp1);
5848       ldrd(vtmp3, post(src, 8));
5849       st1(vtmp2, T8H, post(dst, 16));
5850       subs(tmp4, tmp4, 1);
5851       zip1(vtmp3, T16B, vtmp3, vtmp1);
5852       st1(vtmp3, T8H, post(dst, 16));
5853       br(NE, loop);
5854       b(around);
5855       bind(loop_last);
5856       zip1(vtmp2, T16B, vtmp2, vtmp1);
5857       st1(vtmp2, T8H, post(dst, 16));
5858       bind(around);
5859       cbz(len, done);
5860     } else {
5861       andw(len, len, 7);
5862       bind(loop);
5863       ldrd(vtmp2, post(src, 8));
5864       sub(tmp4, tmp4, 1);
5865       zip1(vtmp3, T16B, vtmp2, vtmp1);
5866       st1(vtmp3, T8H, post(dst, 16));
5867       cbnz(tmp4, loop);
5868     }
5869   }
5870 
5871   // Do the tail of up to 8 bytes.
5872   add(src, src, len);
5873   ldrd(vtmp3, Address(src, -8));
5874   add(dst, dst, len, ext::uxtw, 1);
5875   zip1(vtmp3, T16B, vtmp3, vtmp1);
5876   strq(vtmp3, Address(dst, -16));
5877 
5878   bind(done);
5879 }
5880 
5881 // Compress char[] array to byte[].
5882 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5883                                          FloatRegister tmp1Reg, FloatRegister tmp2Reg,
5884                                          FloatRegister tmp3Reg, FloatRegister tmp4Reg,
5885                                          Register result) {
5886   encode_iso_array(src, dst, len, result,
5887                    tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
5888   cmp(len, zr);
5889   csel(result, result, zr, EQ);
5890 }
5891 
5892 // get_thread() can be called anywhere inside generated code so we
5893 // need to save whatever non-callee save context might get clobbered
5894 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5895 // the call setup code.
5896 //
5897 // aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5898 //
5899 void MacroAssembler::get_thread(Register dst) {
5900   RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
5901   push(saved_regs, sp);
5902 
5903   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5904   blr(lr);
5905   if (dst != c_rarg0) {
5906     mov(dst, c_rarg0);
5907   }
5908 
5909   pop(saved_regs, sp);
5910 }
5911 
5912 void MacroAssembler::cache_wb(Address line) {
5913   assert(line.getMode() == Address::base_plus_offset, "mode should be base_plus_offset");
5914   assert(line.index() == noreg, "index should be noreg");
5915   assert(line.offset() == 0, "offset should be 0");
5916   // would like to assert this
5917   // assert(line._ext.shift == 0, "shift should be zero");
5918   if (VM_Version::supports_dcpop()) {
5919     // writeback using clear virtual address to point of persistence
5920     dc(Assembler::CVAP, line.base());
5921   } else {
5922     // no need to generate anything as Unsafe.writebackMemory should
5923     // never invoke this stub
5924   }
5925 }
5926 
5927 void MacroAssembler::cache_wbsync(bool is_pre) {
5928   // we only need a barrier post sync
5929   if (!is_pre) {
5930     membar(Assembler::AnyAny);
5931   }
5932 }