/*
 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"
#ifdef COMPILER1
#include "c1/c1_LIRAssembler.hpp"
#endif
#ifdef COMPILER2
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; the patch may span several instructions.
// Return the total length (in bytes) of the patched instructions.
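//
// A worked example (illustrative values, not from any real compile): an
// unconditional branch at address 0x4000 being retargeted to 0x5000 gets
// offset = (0x5000 - 0x4000) >> 2 = 0x400 written into the 26-bit signed
// immediate field (bits 25..0) of the B instruction.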
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target - branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
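      // An illustrative example with made-up addresses: for an adrp at
      // PC 0x100000abc whose target is 0x100234567,
      //   pc_page   = 0x100000abc >> 12 = 0x100000,
      //   adr_page  = 0x100234567 >> 12 = 0x100234,
      //   offset    = 0x234 pages (goes into the adrp immediate below), and
      //   offset_lo = 0x567 (goes into the dependent ldr/str or add, if any).
      //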
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                    21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                   Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}
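
// As a concrete illustration (invented value): patching a wide OOP equal to
// 0x123456789abc rewrites the three-instruction sequence to the equivalent of
//   movz Rd, #0x9abc
//   movk Rd, #0x5678, lsl #16
//   movk Rd, #0x1234, lsl #32
// which is why 48 bits of address are all this encoding can carry.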

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
               Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp and sp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  if (last_java_pc != NULL) {
    adr(scratch, last_java_pc);
  } else {
    // FIXME: This is almost never correct.  We should delete all
    // cases of set_last_Java_frame with last_java_pc=NULL and use the
    // correct return address instead.
    adr(scratch, pc());
  }

  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb (see the assert above).
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb (see the assert above).
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
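  //
  // For reference, a summary of the mark word layout documented in
  // markOop.hpp: a biased mark word has the shape
  //   [JavaThread* | epoch | age | 1 | 01]
  // so its low three bits equal biased_lock_pattern (0b101), which is what
  // the andr/cmp pair below tests for.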
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                   + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call target
  // - branch to it
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: we must only look at the least-significant byte of x,
  //       since C-style booleans are stored in a single byte only
  //       (this was once a bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


void MacroAssembler::notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler::notify(type);
    // reset_last_Java_frame(true);
  } else {
    Assembler::notify(type);
  }
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
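  //
  // The loop below is emitted peeled once: the first comparison checks
  // entry 0 and branches to found_method on a hit, while the loop proper
  // inverts the test so that a hit on a later entry simply falls through
  // to found_method without a back-branch.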
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                           Register super_klass,
                           Register temp_reg,
                           Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).
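  //
  // Concretely (summarizing the Klass layout rather than anything local to
  // this file): for a primary supertype, super_check_offset points at the
  // appropriate slot of the fixed-depth primary-supers display, while for a
  // secondary supertype it equals sc_offset, so the same load above reads
  // the one-element secondary_super_cache instead.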

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// scans count 4 byte words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}
1302 
1303 
1304 void MacroAssembler::verify_oop(Register reg, const char* s) {
1305   if (!VerifyOops || VerifyAdapterSharing) {
    // The address of the code string below confuses VerifyAdapterSharing
    // because it may differ between otherwise equivalent adapters.
1308     return;
1309   }
1310 
1311   // Pass register number to verify_oop_subroutine
1312   const char* b = NULL;
1313   {
1314     ResourceMark rm;
1315     stringStream ss;
1316     ss.print("verify_oop: %s: %s", reg->name(), s);
1317     b = code_string(ss.as_string());
1318   }
1319   BLOCK_COMMENT("verify_oop {");
1320 
1321   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1322   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1323 
1324   mov(r0, reg);
1325   mov(rscratch1, (address)b);
1326 
1327   // call indirectly to solve generation ordering problem
1328   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1329   ldr(rscratch2, Address(rscratch2));
1330   blr(rscratch2);
1331 
1332   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1333   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1334 
1335   BLOCK_COMMENT("} verify_oop");
1336 }
1337 
1338 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1339   if (!VerifyOops || VerifyAdapterSharing) {
    // The address of the code string below confuses VerifyAdapterSharing
    // because it may differ between otherwise equivalent adapters.
1342     return;
1343   }
1344 
1345   const char* b = NULL;
1346   {
1347     ResourceMark rm;
1348     stringStream ss;
1349     ss.print("verify_oop_addr: %s", s);
1350     b = code_string(ss.as_string());
1351   }
1352   BLOCK_COMMENT("verify_oop_addr {");
1353 
1354   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1355   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1356 
1357   // addr may contain sp so we will have to adjust it based on the
1358   // pushes that we just did.
1359   if (addr.uses(sp)) {
1360     lea(r0, addr);
1361     ldr(r0, Address(r0, 4 * wordSize));
1362   } else {
1363     ldr(r0, addr);
1364   }
1365   mov(rscratch1, (address)b);
1366 
1367   // call indirectly to solve generation ordering problem
1368   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1369   ldr(rscratch2, Address(rscratch2));
1370   blr(rscratch2);
1371 
1372   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1373   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1374 
1375   BLOCK_COMMENT("} verify_oop_addr");
1376 }
1377 
1378 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1379                                          int extra_slot_offset) {
1380   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1381   int stackElementSize = Interpreter::stackElementSize;
1382   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1383 #ifdef ASSERT
1384   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1385   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1386 #endif
1387   if (arg_slot.is_constant()) {
1388     return Address(esp, arg_slot.as_constant() * stackElementSize
1389                    + offset);
1390   } else {
1391     add(rscratch1, esp, arg_slot.as_register(),
1392         ext::uxtx, exact_log2(stackElementSize));
1393     return Address(rscratch1, offset);
1394   }
1395 }
1396 
1397 void MacroAssembler::call_VM_leaf_base(address entry_point,
1398                                        int number_of_arguments,
1399                                        Label *retaddr) {
1400   call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
1401 }
1402 
1403 void MacroAssembler::call_VM_leaf_base1(address entry_point,
1404                                         int number_of_gp_arguments,
1405                                         int number_of_fp_arguments,
1406                                         ret_type type,
1407                                         Label *retaddr) {
1408   Label E, L;
1409 
1410   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1411 
  // We add 1 to number_of_gp_arguments because the thread in arg0 is
  // not counted.
1414   mov(rscratch1, entry_point);
1415   blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1416   if (retaddr)
1417     bind(*retaddr);
1418 
1419   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1420   maybe_isb();
1421 }
1422 
1423 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1424   call_VM_leaf_base(entry_point, number_of_arguments);
1425 }
1426 
1427 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1428   pass_arg0(this, arg_0);
1429   call_VM_leaf_base(entry_point, 1);
1430 }
1431 
1432 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1433   pass_arg0(this, arg_0);
1434   pass_arg1(this, arg_1);
1435   call_VM_leaf_base(entry_point, 2);
1436 }
1437 
1438 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1439                                   Register arg_1, Register arg_2) {
1440   pass_arg0(this, arg_0);
1441   pass_arg1(this, arg_1);
1442   pass_arg2(this, arg_2);
1443   call_VM_leaf_base(entry_point, 3);
1444 }
1445 
1446 void MacroAssembler::super_call_VM_leaf(address entry_point) {
1447   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1448 }
1449 
1450 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1451   pass_arg0(this, arg_0);
1452   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1453 }
1454 
1455 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1456 
1457   assert(arg_0 != c_rarg1, "smashed arg");
1458   pass_arg1(this, arg_1);
1459   pass_arg0(this, arg_0);
1460   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1461 }
1462 
1463 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1464   assert(arg_0 != c_rarg2, "smashed arg");
1465   assert(arg_1 != c_rarg2, "smashed arg");
1466   pass_arg2(this, arg_2);
1467   assert(arg_0 != c_rarg1, "smashed arg");
1468   pass_arg1(this, arg_1);
1469   pass_arg0(this, arg_0);
1470   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1471 }
1472 
1473 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1474   assert(arg_0 != c_rarg3, "smashed arg");
1475   assert(arg_1 != c_rarg3, "smashed arg");
1476   assert(arg_2 != c_rarg3, "smashed arg");
1477   pass_arg3(this, arg_3);
1478   assert(arg_0 != c_rarg2, "smashed arg");
1479   assert(arg_1 != c_rarg2, "smashed arg");
1480   pass_arg2(this, arg_2);
1481   assert(arg_0 != c_rarg1, "smashed arg");
1482   pass_arg1(this, arg_1);
1483   pass_arg0(this, arg_0);
1484   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1485 }
1486 
1487 void MacroAssembler::null_check(Register reg, int offset) {
1488   if (needs_explicit_null_check(offset)) {
1489     // provoke OS NULL exception if reg = NULL by
1490     // accessing M[reg] w/o changing any registers
1491     // NOTE: this is plenty to provoke a segv
1492     ldr(zr, Address(reg));
1493   } else {
1494     // nothing to do, (later) access of M[reg + offset]
1495     // will provoke OS NULL exception if reg = NULL
1496   }
1497 }
1498 
1499 void MacroAssembler::test_klass_is_value(Register klass, Register temp_reg, Label& is_value) {
1500   ldrw(temp_reg, Address(klass, Klass::access_flags_offset()));
1501   andr(temp_reg, temp_reg, JVM_ACC_VALUE);
1502   cbnz(temp_reg, is_value); 
1503 }
1504 
1505 void MacroAssembler::test_field_is_flattenable(Register flags, Register temp_reg, Label& is_flattenable) {
1506   (void) temp_reg; // keep signature uniform with x86
1507   tbnz(flags, ConstantPoolCacheEntry::is_flattenable_field_shift, is_flattenable);
1508 }
1509 
1510 void MacroAssembler::test_field_is_not_flattenable(Register flags, Register temp_reg, Label& not_flattenable) {
1511   (void) temp_reg; // keep signature uniform with x86
1512   tbz(flags, ConstantPoolCacheEntry::is_flattenable_field_shift, not_flattenable);
1513 }
1514 
1515 void MacroAssembler::test_field_is_flattened(Register flags, Register temp_reg, Label& is_flattened) {
1516   (void) temp_reg; // keep signature uniform with x86
1517   tbnz(flags, ConstantPoolCacheEntry::is_flattened_field_shift, is_flattened);
1518 }
1519 
1520 void MacroAssembler::test_flat_array_klass(Register klass, Register temp_reg, Label& is_flattened) {
1521   ldrw(temp_reg, Address(klass, Klass::layout_helper_offset()));
1522   asrw(temp_reg, temp_reg, Klass::_lh_array_tag_shift);
1523   cmpw(temp_reg, Klass::_lh_array_tag_vt_value);
1524   br(Assembler::EQ, is_flattened);
1525 }
1526 
1527 void MacroAssembler::test_flat_array_oop(Register oop, Register temp_reg, Label& is_flattened) {
1528   load_klass(temp_reg, oop);
1529   test_flat_array_klass(temp_reg, temp_reg, is_flattened);
1530 }
1531 
1532 // MacroAssembler protected routines needed to implement
1533 // public methods
1534 
1535 void MacroAssembler::mov(Register r, Address dest) {
1536   code_section()->relocate(pc(), dest.rspec());
1537   u_int64_t imm64 = (u_int64_t)dest.target();
1538   movptr(r, imm64);
1539 }
1540 
1541 // Move a constant pointer into r.  In AArch64 mode the virtual
1542 // address space is 48 bits in size, so we only need three
1543 // instructions to create a patchable instruction sequence that can
1544 // reach anywhere.
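//
// A hedged illustration (the exact encoding is up to the assembler):
// movptr(r0, 0x123456789abc) would expand to
//   movz x0, #0x9abc
//   movk x0, #0x5678, lsl #16
//   movk x0, #0x1234, lsl #32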
1545 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1546 #ifndef PRODUCT
1547   {
1548     char buffer[64];
1549     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1550     block_comment(buffer);
1551   }
1552 #endif
1553   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1554   movz(r, imm64 & 0xffff);
1555   imm64 >>= 16;
1556   movk(r, imm64 & 0xffff, 16);
1557   imm64 >>= 16;
1558   movk(r, imm64 & 0xffff, 32);
1559 }
1560 
1561 // Macro to mov replicated immediate to vector register.
1562 //  Vd will get the following values for different arrangements in T
1563 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1564 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1565 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1566 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1567 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1568 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1569 //   T1D/T2D: invalid
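//
// A worked example (a sketch, not emitted verbatim): mov(v0, T4S, 0x00ff0000)
// has a single non-zero byte, so it reduces to one instruction:
//   movi v0.4s, #0xff, lsl #16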
1570 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1571   assert(T != T1D && T != T2D, "invalid arrangement");
1572   if (T == T8B || T == T16B) {
1573     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1574     movi(Vd, T, imm32 & 0xff, 0);
1575     return;
1576   }
1577   u_int32_t nimm32 = ~imm32;
1578   if (T == T4H || T == T8H) {
1579     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1580     imm32 &= 0xffff;
1581     nimm32 &= 0xffff;
1582   }
1583   u_int32_t x = imm32;
1584   int movi_cnt = 0;
1585   int movn_cnt = 0;
1586   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1587   x = nimm32;
1588   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1589   if (movn_cnt < movi_cnt) imm32 = nimm32;
1590   unsigned lsl = 0;
1591   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1592   if (movn_cnt < movi_cnt)
1593     mvni(Vd, T, imm32 & 0xff, lsl);
1594   else
1595     movi(Vd, T, imm32 & 0xff, lsl);
1596   imm32 >>= 8; lsl += 8;
1597   while (imm32) {
1598     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1599     if (movn_cnt < movi_cnt)
1600       bici(Vd, T, imm32 & 0xff, lsl);
1601     else
1602       orri(Vd, T, imm32 & 0xff, lsl);
1603     lsl += 8; imm32 >>= 8;
1604   }
1605 }
1606 
1607 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1608 {
1609 #ifndef PRODUCT
1610   {
1611     char buffer[64];
1612     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1613     block_comment(buffer);
1614   }
1615 #endif
1616   if (operand_valid_for_logical_immediate(false, imm64)) {
1617     orr(dst, zr, imm64);
1618   } else {
1619     // we can use a combination of MOVZ or MOVN with
1620     // MOVK to build up the constant
1621     u_int64_t imm_h[4];
1622     int zero_count = 0;
1623     int neg_count = 0;
1624     int i;
1625     for (i = 0; i < 4; i++) {
1626       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1627       if (imm_h[i] == 0) {
1628         zero_count++;
1629       } else if (imm_h[i] == 0xffffL) {
1630         neg_count++;
1631       }
1632     }
1633     if (zero_count == 4) {
1634       // one MOVZ will do
1635       movz(dst, 0);
1636     } else if (neg_count == 4) {
1637       // one MOVN will do
1638       movn(dst, 0);
1639     } else if (zero_count == 3) {
1640       for (i = 0; i < 4; i++) {
1641         if (imm_h[i] != 0L) {
1642           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1643           break;
1644         }
1645       }
1646     } else if (neg_count == 3) {
1647       // one MOVN will do
1648       for (int i = 0; i < 4; i++) {
1649         if (imm_h[i] != 0xffffL) {
1650           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1651           break;
1652         }
1653       }
1654     } else if (zero_count == 2) {
1655       // one MOVZ and one MOVK will do
1656       for (i = 0; i < 3; i++) {
1657         if (imm_h[i] != 0L) {
1658           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1659           i++;
1660           break;
1661         }
1662       }
1663       for (;i < 4; i++) {
1664         if (imm_h[i] != 0L) {
1665           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1666         }
1667       }
1668     } else if (neg_count == 2) {
1669       // one MOVN and one MOVK will do
1670       for (i = 0; i < 4; i++) {
1671         if (imm_h[i] != 0xffffL) {
1672           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1673           i++;
1674           break;
1675         }
1676       }
1677       for (;i < 4; i++) {
1678         if (imm_h[i] != 0xffffL) {
1679           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1680         }
1681       }
1682     } else if (zero_count == 1) {
1683       // one MOVZ and two MOVKs will do
1684       for (i = 0; i < 4; i++) {
1685         if (imm_h[i] != 0L) {
1686           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1687           i++;
1688           break;
1689         }
1690       }
1691       for (;i < 4; i++) {
1692         if (imm_h[i] != 0x0L) {
1693           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1694         }
1695       }
1696     } else if (neg_count == 1) {
1697       // one MOVN and two MOVKs will do
1698       for (i = 0; i < 4; i++) {
1699         if (imm_h[i] != 0xffffL) {
1700           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1701           i++;
1702           break;
1703         }
1704       }
1705       for (;i < 4; i++) {
1706         if (imm_h[i] != 0xffffL) {
1707           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1708         }
1709       }
1710     } else {
1711       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1712       movz(dst, (u_int32_t)imm_h[0], 0);
1713       for (i = 1; i < 4; i++) {
1714         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1715       }
1716     }
1717   }
1718 }
1719 
1720 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1721 {
1722 #ifndef PRODUCT
1723     {
1724       char buffer[64];
1725       snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1726       block_comment(buffer);
1727     }
1728 #endif
1729   if (operand_valid_for_logical_immediate(true, imm32)) {
1730     orrw(dst, zr, imm32);
1731   } else {
1732     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1733     // constant
1734     u_int32_t imm_h[2];
1735     imm_h[0] = imm32 & 0xffff;
1736     imm_h[1] = ((imm32 >> 16) & 0xffff);
1737     if (imm_h[0] == 0) {
1738       movzw(dst, imm_h[1], 16);
1739     } else if (imm_h[0] == 0xffff) {
1740       movnw(dst, imm_h[1] ^ 0xffff, 16);
1741     } else if (imm_h[1] == 0) {
1742       movzw(dst, imm_h[0], 0);
1743     } else if (imm_h[1] == 0xffff) {
1744       movnw(dst, imm_h[0] ^ 0xffff, 0);
1745     } else {
1746       // use a MOVZ and MOVK (makes it easier to debug)
1747       movzw(dst, imm_h[0], 0);
1748       movkw(dst, imm_h[1], 16);
1749     }
1750   }
1751 }
1752 
1753 // Form an address from base + offset in Rd.  Rd may or may
1754 // not actually be used: you must use the Address that is returned.
1755 // It is up to you to ensure that the shift provided matches the size
1756 // of your data.
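//
// A hedged illustration: form_address(r10, r1, 0x40010, 3) does not fit a
// scaled 12-bit immediate but splits cleanly into two 12-bit pieces, so it
// emits "add x10, x1, #0x40000" and returns Address(r10, 0x10).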
1757 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1758   if (Address::offset_ok_for_immed(byte_offset, shift))
1759     // It fits; no need for any heroics
1760     return Address(base, byte_offset);
1761 
1762   // Don't do anything clever with negative or misaligned offsets
1763   unsigned mask = (1 << shift) - 1;
1764   if (byte_offset < 0 || byte_offset & mask) {
1765     mov(Rd, byte_offset);
1766     add(Rd, base, Rd);
1767     return Address(Rd);
1768   }
1769 
1770   // See if we can do this with two 12-bit offsets
1771   {
1772     unsigned long word_offset = byte_offset >> shift;
1773     unsigned long masked_offset = word_offset & 0xfff000;
1774     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1775         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1776       add(Rd, base, masked_offset << shift);
1777       word_offset -= masked_offset;
1778       return Address(Rd, word_offset << shift);
1779     }
1780   }
1781 
1782   // Do it the hard way
1783   mov(Rd, byte_offset);
1784   add(Rd, base, Rd);
1785   return Address(Rd);
1786 }
1787 
1788 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1789   if (UseLSE) {
1790     mov(tmp, 1);
1791     ldadd(Assembler::word, tmp, zr, counter_addr);
1792     return;
1793   }
1794   Label retry_load;
1795   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1796     prfm(Address(counter_addr), PSTL1STRM);
1797   bind(retry_load);
1798   // flush and load exclusive from the memory location
1799   ldxrw(tmp, counter_addr);
1800   addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp will be zero
1802   stxrw(tmp2, tmp, counter_addr);
1803   cbnzw(tmp2, retry_load);
1804 }
1805 
1806 
1807 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1808                                     bool want_remainder, Register scratch)
1809 {
1810   // Full implementation of Java idiv and irem.  The function
1811   // returns the (pc) offset of the div instruction - may be needed
1812   // for implicit exceptions.
1813   //
1814   // constraint : ra/rb =/= scratch
1815   //         normal case
1816   //
1817   // input : ra: dividend
1818   //         rb: divisor
1819   //
1820   // result: either
1821   //         quotient  (= ra idiv rb)
1822   //         remainder (= ra irem rb)
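  //
  // A hedged illustration: corrected_idivl(r0, r1, r2, /*want_remainder*/true, r3)
  // emits "sdivw w3, w1, w2" followed by "msubw w0, w3, w2, w1",
  // i.e. w0 = w1 - (w1 / w2) * w2.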
1823 
1824   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1825 
1826   int idivl_offset = offset();
1827   if (! want_remainder) {
1828     sdivw(result, ra, rb);
1829   } else {
1830     sdivw(scratch, ra, rb);
1831     Assembler::msubw(result, scratch, rb, ra);
1832   }
1833 
1834   return idivl_offset;
1835 }
1836 
1837 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1838                                     bool want_remainder, Register scratch)
1839 {
1840   // Full implementation of Java ldiv and lrem.  The function
1841   // returns the (pc) offset of the div instruction - may be needed
1842   // for implicit exceptions.
1843   //
1844   // constraint : ra/rb =/= scratch
1845   //         normal case
1846   //
1847   // input : ra: dividend
1848   //         rb: divisor
1849   //
1850   // result: either
1851   //         quotient  (= ra idiv rb)
1852   //         remainder (= ra irem rb)
1853 
1854   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1855 
1856   int idivq_offset = offset();
1857   if (! want_remainder) {
1858     sdiv(result, ra, rb);
1859   } else {
1860     sdiv(scratch, ra, rb);
1861     Assembler::msub(result, scratch, rb, ra);
1862   }
1863 
1864   return idivq_offset;
1865 }
1866 
1867 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1868   address prev = pc() - NativeMembar::instruction_size;
1869   address last = code()->last_insn();
1870   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1871     NativeMembar *bar = NativeMembar_at(prev);
1872     // We are merging two memory barrier instructions.  On AArch64 we
1873     // can do this simply by ORing them together.
1874     bar->set_kind(bar->get_kind() | order_constraint);
1875     BLOCK_COMMENT("merged membar");
1876   } else {
1877     code()->set_last_insn(pc());
1878     dmb(Assembler::barrier(order_constraint));
1879   }
1880 }
1881 
1882 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1883   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1884     merge_ldst(rt, adr, size_in_bytes, is_store);
1885     code()->clear_last_insn();
1886     return true;
1887   } else {
1888     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
1889     const unsigned mask = size_in_bytes - 1;
1890     if (adr.getMode() == Address::base_plus_offset &&
1891         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1892       code()->set_last_insn(pc());
1893     }
1894     return false;
1895   }
1896 }
1897 
1898 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1899   // We always try to merge two adjacent loads into one ldp.
1900   if (!try_merge_ldst(Rx, adr, 8, false)) {
1901     Assembler::ldr(Rx, adr);
1902   }
1903 }
1904 
1905 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1906   // We always try to merge two adjacent loads into one ldp.
1907   if (!try_merge_ldst(Rw, adr, 4, false)) {
1908     Assembler::ldrw(Rw, adr);
1909   }
1910 }
1911 
1912 void MacroAssembler::str(Register Rx, const Address &adr) {
1913   // We always try to merge two adjacent stores into one stp.
1914   if (!try_merge_ldst(Rx, adr, 8, true)) {
1915     Assembler::str(Rx, adr);
1916   }
1917 }
1918 
1919 void MacroAssembler::strw(Register Rw, const Address &adr) {
1920   // We always try to merge two adjacent stores into one stp.
1921   if (!try_merge_ldst(Rw, adr, 4, true)) {
1922     Assembler::strw(Rw, adr);
1923   }
1924 }
1925 
1926 // MacroAssembler routines found actually to be needed
1927 
1928 void MacroAssembler::push(Register src)
1929 {
1930   str(src, Address(pre(esp, -1 * wordSize)));
1931 }
1932 
1933 void MacroAssembler::pop(Register dst)
1934 {
1935   ldr(dst, Address(post(esp, 1 * wordSize)));
1936 }
1937 
1938 // Note: load_unsigned_short used to be called load_unsigned_word.
1939 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1940   int off = offset();
1941   ldrh(dst, src);
1942   return off;
1943 }
1944 
1945 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1946   int off = offset();
1947   ldrb(dst, src);
1948   return off;
1949 }
1950 
1951 int MacroAssembler::load_signed_short(Register dst, Address src) {
1952   int off = offset();
1953   ldrsh(dst, src);
1954   return off;
1955 }
1956 
1957 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1958   int off = offset();
1959   ldrsb(dst, src);
1960   return off;
1961 }
1962 
1963 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1964   int off = offset();
1965   ldrshw(dst, src);
1966   return off;
1967 }
1968 
1969 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1970   int off = offset();
1971   ldrsbw(dst, src);
1972   return off;
1973 }
1974 
1975 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1976   switch (size_in_bytes) {
1977   case  8:  ldr(dst, src); break;
1978   case  4:  ldrw(dst, src); break;
1979   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1980   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1981   default:  ShouldNotReachHere();
1982   }
1983 }
1984 
1985 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1986   switch (size_in_bytes) {
1987   case  8:  str(src, dst); break;
1988   case  4:  strw(src, dst); break;
1989   case  2:  strh(src, dst); break;
1990   case  1:  strb(src, dst); break;
1991   default:  ShouldNotReachHere();
1992   }
1993 }
1994 
1995 void MacroAssembler::decrementw(Register reg, int value)
1996 {
1997   if (value < 0)  { incrementw(reg, -value);      return; }
1998   if (value == 0) {                               return; }
1999   if (value < (1 << 12)) { subw(reg, reg, value); return; }
2000   /* else */ {
2001     guarantee(reg != rscratch2, "invalid dst for register decrement");
2002     movw(rscratch2, (unsigned)value);
2003     subw(reg, reg, rscratch2);
2004   }
2005 }
2006 
2007 void MacroAssembler::decrement(Register reg, int value)
2008 {
2009   if (value < 0)  { increment(reg, -value);      return; }
2010   if (value == 0) {                              return; }
2011   if (value < (1 << 12)) { sub(reg, reg, value); return; }
2012   /* else */ {
2013     assert(reg != rscratch2, "invalid dst for register decrement");
2014     mov(rscratch2, (unsigned long)value);
2015     sub(reg, reg, rscratch2);
2016   }
2017 }
2018 
2019 void MacroAssembler::decrementw(Address dst, int value)
2020 {
2021   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
2022   if (dst.getMode() == Address::literal) {
2023     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2024     lea(rscratch2, dst);
2025     dst = Address(rscratch2);
2026   }
2027   ldrw(rscratch1, dst);
2028   decrementw(rscratch1, value);
2029   strw(rscratch1, dst);
2030 }
2031 
2032 void MacroAssembler::decrement(Address dst, int value)
2033 {
2034   assert(!dst.uses(rscratch1), "invalid address for decrement");
2035   if (dst.getMode() == Address::literal) {
2036     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2037     lea(rscratch2, dst);
2038     dst = Address(rscratch2);
2039   }
2040   ldr(rscratch1, dst);
2041   decrement(rscratch1, value);
2042   str(rscratch1, dst);
2043 }
2044 
2045 void MacroAssembler::incrementw(Register reg, int value)
2046 {
2047   if (value < 0)  { decrementw(reg, -value);      return; }
2048   if (value == 0) {                               return; }
2049   if (value < (1 << 12)) { addw(reg, reg, value); return; }
2050   /* else */ {
2051     assert(reg != rscratch2, "invalid dst for register increment");
2052     movw(rscratch2, (unsigned)value);
2053     addw(reg, reg, rscratch2);
2054   }
2055 }
2056 
2057 void MacroAssembler::increment(Register reg, int value)
2058 {
2059   if (value < 0)  { decrement(reg, -value);      return; }
2060   if (value == 0) {                              return; }
2061   if (value < (1 << 12)) { add(reg, reg, value); return; }
2062   /* else */ {
2063     assert(reg != rscratch2, "invalid dst for register increment");
2064     movw(rscratch2, (unsigned)value);
2065     add(reg, reg, rscratch2);
2066   }
2067 }
2068 
2069 void MacroAssembler::incrementw(Address dst, int value)
2070 {
2071   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2072   if (dst.getMode() == Address::literal) {
2073     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2074     lea(rscratch2, dst);
2075     dst = Address(rscratch2);
2076   }
2077   ldrw(rscratch1, dst);
2078   incrementw(rscratch1, value);
2079   strw(rscratch1, dst);
2080 }
2081 
2082 void MacroAssembler::increment(Address dst, int value)
2083 {
2084   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2085   if (dst.getMode() == Address::literal) {
2086     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2087     lea(rscratch2, dst);
2088     dst = Address(rscratch2);
2089   }
2090   ldr(rscratch1, dst);
2091   increment(rscratch1, value);
2092   str(rscratch1, dst);
2093 }
2094 
2095 
2096 void MacroAssembler::pusha() {
2097   push(0x7fffffff, sp);
2098 }
2099 
2100 void MacroAssembler::popa() {
2101   pop(0x7fffffff, sp);
2102 }
2103 
2104 // Push lots of registers in the bit set supplied.  Don't push sp.
2105 // Return the number of words pushed
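// A hedged illustration: a bitset naming r0, r1 and r2 is padded with zr to
// an even count, so two pairs go out:
//   stp x0, x1, [sp, #-32]!
//   stp x2, xzr, [sp, #16]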
2106 int MacroAssembler::push(unsigned int bitset, Register stack) {
2107   int words_pushed = 0;
2108 
2109   // Scan bitset to accumulate register pairs
2110   unsigned char regs[32];
2111   int count = 0;
2112   for (int reg = 0; reg <= 30; reg++) {
2113     if (1 & bitset)
2114       regs[count++] = reg;
2115     bitset >>= 1;
2116   }
2117   regs[count++] = zr->encoding_nocheck();
  count &= ~1;  // Only push an even number of regs
2119 
2120   if (count) {
2121     stp(as_Register(regs[0]), as_Register(regs[1]),
2122        Address(pre(stack, -count * wordSize)));
2123     words_pushed += 2;
2124   }
2125   for (int i = 2; i < count; i += 2) {
2126     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2127        Address(stack, i * wordSize));
2128     words_pushed += 2;
2129   }
2130 
2131   assert(words_pushed == count, "oops, pushed != count");
2132 
2133   return count;
2134 }
2135 
2136 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2137   int words_pushed = 0;
2138 
2139   // Scan bitset to accumulate register pairs
2140   unsigned char regs[32];
2141   int count = 0;
2142   for (int reg = 0; reg <= 30; reg++) {
2143     if (1 & bitset)
2144       regs[count++] = reg;
2145     bitset >>= 1;
2146   }
2147   regs[count++] = zr->encoding_nocheck();
2148   count &= ~1;
2149 
2150   for (int i = 2; i < count; i += 2) {
2151     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2152        Address(stack, i * wordSize));
2153     words_pushed += 2;
2154   }
2155   if (count) {
2156     ldp(as_Register(regs[0]), as_Register(regs[1]),
2157        Address(post(stack, count * wordSize)));
2158     words_pushed += 2;
2159   }
2160 
2161   assert(words_pushed == count, "oops, pushed != count");
2162 
2163   return count;
2164 }
2165 #ifdef ASSERT
2166 void MacroAssembler::verify_heapbase(const char* msg) {
2167 #if 0
2168   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2169   assert (Universe::heap() != NULL, "java heap should be initialized");
2170   if (CheckCompressedOops) {
2171     Label ok;
2172     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2173     cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2174     br(Assembler::EQ, ok);
2175     stop(msg);
2176     bind(ok);
2177     pop(1 << rscratch1->encoding(), sp);
2178   }
2179 #endif
2180 }
2181 #endif
2182 
2183 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2184   Label done, not_weak;
2185   cbz(value, done);           // Use NULL as-is.
2186 
2187   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
  tbz(value, 0, not_weak);    // Test for jweak tag.
2189 
2190   // Resolve jweak.
2191   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2192                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2193   verify_oop(value);
2194   b(done);
2195 
2196   bind(not_weak);
2197   // Resolve (untagged) jobject.
2198   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2199   verify_oop(value);
2200   bind(done);
2201 }
2202 
2203 void MacroAssembler::stop(const char* msg) {
2204   address ip = pc();
2205   pusha();
2206   mov(c_rarg0, (address)msg);
2207   mov(c_rarg1, (address)ip);
2208   mov(c_rarg2, sp);
2209   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2210   // call(c_rarg3);
2211   blrt(c_rarg3, 3, 0, 1);
2212   hlt(0);
2213 }
2214 
2215 void MacroAssembler::unimplemented(const char* what) {
2216   const char* buf = NULL;
2217   {
2218     ResourceMark rm;
2219     stringStream ss;
2220     ss.print("unimplemented: %s", what);
2221     buf = code_string(ss.as_string());
2222   }
2223   stop(buf);
2224 }
2225 
2226 // If a constant does not fit in an immediate field, generate some
2227 // number of MOV instructions and then perform the operation.
2228 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2229                                            add_sub_imm_insn insn1,
2230                                            add_sub_reg_insn insn2) {
2231   assert(Rd != zr, "Rd = zr and not setting flags?");
2232   if (operand_valid_for_add_sub_immediate((int)imm)) {
2233     (this->*insn1)(Rd, Rn, imm);
2234   } else {
2235     if (uabs(imm) < (1 << 24)) {
2236        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2237        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2238     } else {
2239        assert_different_registers(Rd, Rn);
2240        mov(Rd, (uint64_t)imm);
2241        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2242     }
2243   }
2244 }
2245 
// Separate version which sets the flags.  Optimisations are more restricted
// because we must set the flags correctly.
2248 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2249                                            add_sub_imm_insn insn1,
2250                                            add_sub_reg_insn insn2) {
2251   if (operand_valid_for_add_sub_immediate((int)imm)) {
2252     (this->*insn1)(Rd, Rn, imm);
2253   } else {
2254     assert_different_registers(Rd, Rn);
2255     assert(Rd != zr, "overflow in immediate operand");
2256     mov(Rd, (uint64_t)imm);
2257     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2258   }
2259 }
2260 
2261 
2262 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2263   if (increment.is_register()) {
2264     add(Rd, Rn, increment.as_register());
2265   } else {
2266     add(Rd, Rn, increment.as_constant());
2267   }
2268 }
2269 
2270 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2271   if (increment.is_register()) {
2272     addw(Rd, Rn, increment.as_register());
2273   } else {
2274     addw(Rd, Rn, increment.as_constant());
2275   }
2276 }
2277 
2278 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2279   if (decrement.is_register()) {
2280     sub(Rd, Rn, decrement.as_register());
2281   } else {
2282     sub(Rd, Rn, decrement.as_constant());
2283   }
2284 }
2285 
2286 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2287   if (decrement.is_register()) {
2288     subw(Rd, Rn, decrement.as_register());
2289   } else {
2290     subw(Rd, Rn, decrement.as_constant());
2291   }
2292 }
2293 
2294 void MacroAssembler::reinit_heapbase()
2295 {
2296   if (UseCompressedOops) {
2297     if (Universe::is_fully_initialized()) {
2298       mov(rheapbase, Universe::narrow_ptrs_base());
2299     } else {
2300       lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2301       ldr(rheapbase, Address(rheapbase));
2302     }
2303   }
2304 }
2305 
2306 // this simulates the behaviour of the x86 cmpxchg instruction using a
2307 // load linked/store conditional pair. we use the acquire/release
2308 // versions of these instructions so that we flush pending writes as
2309 // per Java semantics.
2310 
// n.b. the x86 version assumes the old value to be compared against is
2312 // in rax and updates rax with the value located in memory if the
2313 // cmpxchg fails. we supply a register for the old value explicitly
2314 
2315 // the aarch64 load linked/store conditional instructions do not
2316 // accept an offset. so, unlike x86, we must provide a plain register
2317 // to identify the memory word to be compared/exchanged rather than a
2318 // register+offset Address.
2319 
2320 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2321                                 Label &succeed, Label *fail) {
2322   // oldv holds comparison value
2323   // newv holds value to write in exchange
2324   // addr identifies memory word to compare against/update
2325   if (UseLSE) {
2326     mov(tmp, oldv);
2327     casal(Assembler::xword, oldv, newv, addr);
2328     cmp(tmp, oldv);
2329     br(Assembler::EQ, succeed);
2330     membar(AnyAny);
2331   } else {
2332     Label retry_load, nope;
2333     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2334       prfm(Address(addr), PSTL1STRM);
2335     bind(retry_load);
2336     // flush and load exclusive from the memory location
2337     // and fail if it is not what we expect
2338     ldaxr(tmp, addr);
2339     cmp(tmp, oldv);
2340     br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxr(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare;
    // this ensures we don't return a stale value after a failed write.
2346     b(retry_load);
2347     // if the memory word differs we return it in oldv and signal a fail
2348     bind(nope);
2349     membar(AnyAny);
2350     mov(oldv, tmp);
2351   }
2352   if (fail)
2353     b(*fail);
2354 }
2355 
2356 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2357                                         Label &succeed, Label *fail) {
2358   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2359   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2360 }
2361 
2362 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2363                                 Label &succeed, Label *fail) {
2364   // oldv holds comparison value
2365   // newv holds value to write in exchange
2366   // addr identifies memory word to compare against/update
2367   // tmp returns 0/1 for success/failure
2368   if (UseLSE) {
2369     mov(tmp, oldv);
2370     casal(Assembler::word, oldv, newv, addr);
2371     cmp(tmp, oldv);
2372     br(Assembler::EQ, succeed);
2373     membar(AnyAny);
2374   } else {
2375     Label retry_load, nope;
2376     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2377       prfm(Address(addr), PSTL1STRM);
2378     bind(retry_load);
2379     // flush and load exclusive from the memory location
2380     // and fail if it is not what we expect
2381     ldaxrw(tmp, addr);
2382     cmp(tmp, oldv);
2383     br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
    stlxrw(tmp, newv, addr);
    cbzw(tmp, succeed);
    // retry so we only ever return after a load fails to compare;
    // this ensures we don't return a stale value after a failed write.
2389     b(retry_load);
2390     // if the memory word differs we return it in oldv and signal a fail
2391     bind(nope);
2392     membar(AnyAny);
2393     mov(oldv, tmp);
2394   }
2395   if (fail)
2396     b(*fail);
2397 }
2398 
2399 // A generic CAS; success or failure is in the EQ flag.  A weak CAS
2400 // doesn't retry and may fail spuriously.  If the oldval is wanted,
2401 // Pass a register for the result, otherwise pass noreg.
2402 
2403 // Clobbers rscratch1
2404 void MacroAssembler::cmpxchg(Register addr, Register expected,
2405                              Register new_val,
2406                              enum operand_size size,
2407                              bool acquire, bool release,
2408                              bool weak,
2409                              Register result) {
2410   if (result == noreg)  result = rscratch1;
2411   BLOCK_COMMENT("cmpxchg {");
2412   if (UseLSE) {
2413     mov(result, expected);
2414     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2415     compare_eq(result, expected, size);
2416   } else {
2417     Label retry_load, done;
2418     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2419       prfm(Address(addr), PSTL1STRM);
2420     bind(retry_load);
2421     load_exclusive(result, addr, size, acquire);
2422     compare_eq(result, expected, size);
2423     br(Assembler::NE, done);
2424     store_exclusive(rscratch1, new_val, addr, size, release);
2425     if (weak) {
2426       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2427     } else {
2428       cbnzw(rscratch1, retry_load);
2429     }
2430     bind(done);
2431   }
2432   BLOCK_COMMENT("} cmpxchg");
2433 }
2434 
2435 // A generic comparison. Only compares for equality, clobbers rscratch1.
2436 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2437   if (size == xword) {
2438     cmp(rm, rn);
2439   } else if (size == word) {
2440     cmpw(rm, rn);
2441   } else if (size == halfword) {
2442     eorw(rscratch1, rm, rn);
2443     ands(zr, rscratch1, 0xffff);
2444   } else if (size == byte) {
2445     eorw(rscratch1, rm, rn);
2446     ands(zr, rscratch1, 0xff);
2447   } else {
2448     ShouldNotReachHere();
2449   }
2450 }
2451 
2452 
2453 static bool different(Register a, RegisterOrConstant b, Register c) {
2454   if (b.is_constant())
2455     return a != c;
2456   else
2457     return a != b.as_register() && a != c && b.as_register() != c;
2458 }
2459 
2460 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2461 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2462   if (UseLSE) {                                                         \
2463     prev = prev->is_valid() ? prev : zr;                                \
2464     if (incr.is_register()) {                                           \
2465       AOP(sz, incr.as_register(), prev, addr);                          \
2466     } else {                                                            \
2467       mov(rscratch2, incr.as_constant());                               \
2468       AOP(sz, rscratch2, prev, addr);                                   \
2469     }                                                                   \
2470     return;                                                             \
2471   }                                                                     \
2472   Register result = rscratch2;                                          \
2473   if (prev->is_valid())                                                 \
2474     result = different(prev, incr, addr) ? prev : rscratch2;            \
2475                                                                         \
2476   Label retry_load;                                                     \
2477   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2478     prfm(Address(addr), PSTL1STRM);                                     \
2479   bind(retry_load);                                                     \
2480   LDXR(result, addr);                                                   \
2481   OP(rscratch1, result, incr);                                          \
2482   STXR(rscratch2, rscratch1, addr);                                     \
2483   cbnzw(rscratch2, retry_load);                                         \
2484   if (prev->is_valid() && prev != result) {                             \
2485     IOP(prev, rscratch1, incr);                                         \
2486   }                                                                     \
2487 }
2488 
2489 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2490 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2491 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2492 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2493 
2494 #undef ATOMIC_OP
2495 
2496 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2497 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2498   if (UseLSE) {                                                         \
2499     prev = prev->is_valid() ? prev : zr;                                \
2500     AOP(sz, newv, prev, addr);                                          \
2501     return;                                                             \
2502   }                                                                     \
2503   Register result = rscratch2;                                          \
2504   if (prev->is_valid())                                                 \
2505     result = different(prev, newv, addr) ? prev : rscratch2;            \
2506                                                                         \
2507   Label retry_load;                                                     \
2508   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2509     prfm(Address(addr), PSTL1STRM);                                     \
2510   bind(retry_load);                                                     \
2511   LDXR(result, addr);                                                   \
2512   STXR(rscratch1, newv, addr);                                          \
2513   cbnzw(rscratch1, retry_load);                                         \
2514   if (prev->is_valid() && prev != result)                               \
2515     mov(prev, result);                                                  \
2516 }
2517 
2518 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2519 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2520 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2521 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2522 
2523 #undef ATOMIC_XCHG
2524 
2525 #ifndef PRODUCT
2526 extern "C" void findpc(intptr_t x);
2527 #endif
2528 
2529 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2530 {
  // In order to get locks to work, we need to fake an in_VM state
  if (ShowMessageBoxOnError) {
2533     JavaThread* thread = JavaThread::current();
2534     JavaThreadState saved_state = thread->thread_state();
2535     thread->set_thread_state(_thread_in_vm);
2536 #ifndef PRODUCT
2537     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2538       ttyLocker ttyl;
2539       BytecodeCounter::print();
2540     }
2541 #endif
2542     if (os::message_box(msg, "Execution stopped, print registers?")) {
2543       ttyLocker ttyl;
2544       tty->print_cr(" pc = 0x%016lx", pc);
2545 #ifndef PRODUCT
2546       tty->cr();
2547       findpc(pc);
2548       tty->cr();
2549 #endif
2550       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2551       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2552       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2553       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2554       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2555       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2556       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2557       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2558       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2559       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2560       tty->print_cr("r10 = 0x%016lx", regs[10]);
2561       tty->print_cr("r11 = 0x%016lx", regs[11]);
2562       tty->print_cr("r12 = 0x%016lx", regs[12]);
2563       tty->print_cr("r13 = 0x%016lx", regs[13]);
2564       tty->print_cr("r14 = 0x%016lx", regs[14]);
2565       tty->print_cr("r15 = 0x%016lx", regs[15]);
2566       tty->print_cr("r16 = 0x%016lx", regs[16]);
2567       tty->print_cr("r17 = 0x%016lx", regs[17]);
2568       tty->print_cr("r18 = 0x%016lx", regs[18]);
2569       tty->print_cr("r19 = 0x%016lx", regs[19]);
2570       tty->print_cr("r20 = 0x%016lx", regs[20]);
2571       tty->print_cr("r21 = 0x%016lx", regs[21]);
2572       tty->print_cr("r22 = 0x%016lx", regs[22]);
2573       tty->print_cr("r23 = 0x%016lx", regs[23]);
2574       tty->print_cr("r24 = 0x%016lx", regs[24]);
2575       tty->print_cr("r25 = 0x%016lx", regs[25]);
2576       tty->print_cr("r26 = 0x%016lx", regs[26]);
2577       tty->print_cr("r27 = 0x%016lx", regs[27]);
2578       tty->print_cr("r28 = 0x%016lx", regs[28]);
2579       tty->print_cr("r30 = 0x%016lx", regs[30]);
2580       tty->print_cr("r31 = 0x%016lx", regs[31]);
2581       BREAKPOINT;
2582     }
2583     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2584   } else {
2585     ttyLocker ttyl;
2586     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2587                     msg);
2588     assert(false, "DEBUG MESSAGE: %s", msg);
2589   }
2590 }
2591 
2592 #ifdef BUILTIN_SIM
// routine to generate an x86 prolog for a stub function that
// bootstraps into the generated ARM code directly following the stub
2596 //
2597 // the argument encodes the number of general and fp registers
// passed by the caller and the calling convention (currently just
2599 // the number of general registers and assumes C argument passing)
2600 
2601 extern "C" {
2602 int aarch64_stub_prolog_size();
2603 void aarch64_stub_prolog();
2604 void aarch64_prolog();
2605 }
2606 
2607 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
2608                                    address *prolog_ptr)
2609 {
2610   int calltype = (((ret_type & 0x3) << 8) |
2611                   ((fp_arg_count & 0xf) << 4) |
2612                   (gp_arg_count & 0xf));
2613 
2614   // the addresses for the x86 to ARM entry code we need to use
2615   address start = pc();
2616   // printf("start = %lx\n", start);
2617   int byteCount =  aarch64_stub_prolog_size();
2618   // printf("byteCount = %x\n", byteCount);
2619   int instructionCount = (byteCount + 3)/ 4;
2620   // printf("instructionCount = %x\n", instructionCount);
2621   for (int i = 0; i < instructionCount; i++) {
2622     nop();
2623   }
2624 
2625   memcpy(start, (void*)aarch64_stub_prolog, byteCount);
2626 
2627   // write the address of the setup routine and the call format at the
  // end of the copied code
2629   u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
2630   if (prolog_ptr)
2631     patch_end[-2] = (u_int64_t)prolog_ptr;
2632   patch_end[-1] = calltype;
2633 }
2634 #endif
2635 
2636 void MacroAssembler::push_call_clobbered_registers() {
2637   int step = 4 * wordSize;
2638   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2639   sub(sp, sp, step);
2640   mov(rscratch1, -step);
2641   // Push v0-v7, v16-v31.
2642   for (int i = 31; i>= 4; i -= 4) {
2643     if (i <= v7->encoding() || i >= v16->encoding())
2644       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2645           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2646   }
2647   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2648       as_FloatRegister(3), T1D, Address(sp));
2649 }
2650 
2651 void MacroAssembler::pop_call_clobbered_registers() {
2652   for (int i = 0; i < 32; i += 4) {
2653     if (i <= v7->encoding() || i >= v16->encoding())
2654       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2655           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2656   }
2657 
2658   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2659 }
2660 
2661 void MacroAssembler::push_CPU_state(bool save_vectors) {
2662   int step = (save_vectors ? 8 : 4) * wordSize;
2663   push(0x3fffffff, sp);         // integer registers except lr & sp
2664   mov(rscratch1, -step);
2665   sub(sp, sp, step);
2666   for (int i = 28; i >= 4; i -= 4) {
2667     st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2668         as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2669   }
2670   st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2671 }
2672 
2673 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2674   int step = (restore_vectors ? 8 : 4) * wordSize;
2675   for (int i = 0; i <= 28; i += 4)
2676     ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2677         as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2678   pop(0x3fffffff, sp);         // integer registers except lr & sp
2679 }
2680 
2681 /**
2682  * Helpers for multiply_to_len().
2683  */
2684 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2685                                      Register src1, Register src2) {
2686   adds(dest_lo, dest_lo, src1);
2687   adc(dest_hi, dest_hi, zr);
2688   adds(dest_lo, dest_lo, src2);
2689   adc(final_dest_hi, dest_hi, zr);
2690 }
2691 
2692 // Generate an address from (r + r1 extend offset).  "size" is the
2693 // size of the operand.  The result may be in rscratch2.
2694 Address MacroAssembler::offsetted_address(Register r, Register r1,
2695                                           Address::extend ext, int offset, int size) {
2696   if (offset || (ext.shift() % size != 0)) {
2697     lea(rscratch2, Address(r, r1, ext));
2698     return Address(rscratch2, offset);
2699   } else {
2700     return Address(r, r1, ext);
2701   }
2702 }
2703 
2704 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2705 {
2706   assert(offset >= 0, "spill to negative address?");
2707   // Offset reachable ?
2708   //   Not aligned - 9 bits signed offset
2709   //   Aligned - 12 bits unsigned offset shifted
2710   Register base = sp;
2711   if ((offset & (size-1)) && offset >= (1<<8)) {
2712     add(tmp, base, offset & ((1<<12)-1));
2713     base = tmp;
2714     offset &= -1<<12;
2715   }
2716 
2717   if (offset >= (1<<12) * size) {
2718     add(tmp, base, offset & (((1<<12)-1)<<12));
2719     base = tmp;
2720     offset &= ~(((1<<12)-1)<<12);
2721   }
2722 
2723   return Address(base, offset);
2724 }
2725 
2726 // Checks whether offset is aligned.
2727 // Returns true if it is, else false.
2728 bool MacroAssembler::merge_alignment_check(Register base,
2729                                            size_t size,
2730                                            long cur_offset,
2731                                            long prev_offset) const {
2732   if (AvoidUnalignedAccesses) {
2733     if (base == sp) {
      // Checks whether the low offset is aligned to a pair of registers.
2735       long pair_mask = size * 2 - 1;
2736       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2737       return (offset & pair_mask) == 0;
2738     } else { // If base is not sp, we can't guarantee the access is aligned.
2739       return false;
2740     }
2741   } else {
2742     long mask = size - 1;
    // Load/store pair instructions only support element-size-aligned offsets.
2744     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2745   }
2746 }
2747 
2748 // Checks whether current and previous loads/stores can be merged.
// Returns true if they can be merged, else false.
2750 bool MacroAssembler::ldst_can_merge(Register rt,
2751                                     const Address &adr,
2752                                     size_t cur_size_in_bytes,
2753                                     bool is_store) const {
2754   address prev = pc() - NativeInstruction::instruction_size;
2755   address last = code()->last_insn();
2756 
2757   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2758     return false;
2759   }
2760 
2761   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2762     return false;
2763   }
2764 
2765   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2766   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2767 
2768   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2769   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2770 
2771   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2772     return false;
2773   }
2774 
2775   long max_offset = 63 * prev_size_in_bytes;
2776   long min_offset = -64 * prev_size_in_bytes;
2777 
2778   assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged.");
2779 
  // Only accesses with the same base register can be merged.
2781   if (adr.base() != prev_ldst->base()) {
2782     return false;
2783   }
2784 
2785   long cur_offset = adr.offset();
2786   long prev_offset = prev_ldst->offset();
2787   size_t diff = abs(cur_offset - prev_offset);
2788   if (diff != prev_size_in_bytes) {
2789     return false;
2790   }
2791 
  // The following cases cannot be merged:
2793   // ldr x2, [x2, #8]
2794   // ldr x3, [x2, #16]
2795   // or:
2796   // ldr x2, [x3, #8]
2797   // ldr x2, [x3, #16]
  // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2799   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2800     return false;
2801   }
2802 
2803   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2804   // Offset range must be in ldp/stp instruction's range.
2805   if (low_offset > max_offset || low_offset < min_offset) {
2806     return false;
2807   }
2808 
2809   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2810     return true;
2811   }
2812 
2813   return false;
2814 }
2815 
2816 // Merge current load/store with previous load/store into ldp/stp.
2817 void MacroAssembler::merge_ldst(Register rt,
2818                                 const Address &adr,
2819                                 size_t cur_size_in_bytes,
2820                                 bool is_store) {
2821 
  assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "cur and prev must be able to be merged.");
2823 
2824   Register rt_low, rt_high;
2825   address prev = pc() - NativeInstruction::instruction_size;
2826   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2827 
2828   long offset;
2829 
2830   if (adr.offset() < prev_ldst->offset()) {
2831     offset = adr.offset();
2832     rt_low = rt;
2833     rt_high = prev_ldst->target();
2834   } else {
2835     offset = prev_ldst->offset();
2836     rt_low = prev_ldst->target();
2837     rt_high = rt;
2838   }
2839 
2840   Address adr_p = Address(prev_ldst->base(), offset);
  // Overwrite the previously generated binary.
2842   code_section()->set_end(prev);
2843 
2844   const int sz = prev_ldst->size_in_bytes();
2845   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
2846   if (!is_store) {
2847     BLOCK_COMMENT("merged ldr pair");
2848     if (sz == 8) {
2849       ldp(rt_low, rt_high, adr_p);
2850     } else {
2851       ldpw(rt_low, rt_high, adr_p);
2852     }
2853   } else {
2854     BLOCK_COMMENT("merged str pair");
2855     if (sz == 8) {
2856       stp(rt_low, rt_high, adr_p);
2857     } else {
2858       stpw(rt_low, rt_high, adr_p);
2859     }
2860   }
2861 }
2862 
2863 /**
2864  * Multiply 64 bit by 64 bit first loop.
2865  */
2866 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2867                                            Register y, Register y_idx, Register z,
2868                                            Register carry, Register product,
2869                                            Register idx, Register kdx) {
2870   //
2871   //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2873   //    huge_128 product = y[idx] * x[xstart] + carry;
2874   //    z[kdx] = (jlong)product;
2875   //    carry  = (jlong)(product >>> 64);
2876   //  }
2877   //  z[xstart] = carry;
2878   //
2879 
2880   Label L_first_loop, L_first_loop_exit;
2881   Label L_one_x, L_one_y, L_multiply;
2882 
2883   subsw(xstart, xstart, 1);
2884   br(Assembler::MI, L_one_x);
2885 
2886   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2887   ldr(x_xstart, Address(rscratch1));
2888   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2889 
2890   bind(L_first_loop);
2891   subsw(idx, idx, 1);
2892   br(Assembler::MI, L_first_loop_exit);
2893   subsw(idx, idx, 1);
2894   br(Assembler::MI, L_one_y);
2895   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2896   ldr(y_idx, Address(rscratch1));
2897   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2898   bind(L_multiply);
2899 
2900   // AArch64 has a multiply-accumulate instruction that we can't use
2901   // here because it has no way to process carries, so we have to use
2902   // separate add and adc instructions.  Bah.
2903   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2904   mul(product, x_xstart, y_idx);
2905   adds(product, product, carry);
2906   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2907 
2908   subw(kdx, kdx, 2);
2909   ror(product, product, 32); // back to big-endian
2910   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2911 
2912   b(L_first_loop);
2913 
2914   bind(L_one_y);
2915   ldrw(y_idx, Address(y,  0));
2916   b(L_multiply);
2917 
2918   bind(L_one_x);
2919   ldrw(x_xstart, Address(x,  0));
2920   b(L_first_loop);
2921 
2922   bind(L_first_loop_exit);
2923 }
2924 
2925 /**
 * Multiply 128 bit by 128 bit. Unrolled inner loop.
2927  *
2928  */
2929 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2930                                              Register carry, Register carry2,
2931                                              Register idx, Register jdx,
2932                                              Register yz_idx1, Register yz_idx2,
2933                                              Register tmp, Register tmp3, Register tmp4,
2934                                              Register tmp6, Register product_hi) {
2935 
2936   //   jlong carry, x[], y[], z[];
2937   //   int kdx = ystart+1;
2938   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2939   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2940   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2941   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2942   //     carry  = (jlong)(tmp4 >>> 64);
2943   //     z[kdx+idx+1] = (jlong)tmp3;
2944   //     z[kdx+idx] = (jlong)tmp4;
2945   //   }
2946   //   idx += 2;
2947   //   if (idx > 0) {
2948   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2949   //     z[kdx+idx] = (jlong)yz_idx1;
2950   //     carry  = (jlong)(yz_idx1 >>> 64);
2951   //   }
2952   //
2953 
2954   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2955 
2956   lsrw(jdx, idx, 2);
2957 
2958   bind(L_third_loop);
2959 
2960   subsw(jdx, jdx, 1);
2961   br(Assembler::MI, L_third_loop_exit);
2962   subw(idx, idx, 4);
2963 
2964   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2965 
2966   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2967 
2968   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2969 
2970   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2971   ror(yz_idx2, yz_idx2, 32);
2972 
2973   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2974 
2975   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2976   umulh(tmp4, product_hi, yz_idx1);
2977 
2978   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2979   ror(rscratch2, rscratch2, 32);
2980 
2981   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2982   umulh(carry2, product_hi, yz_idx2);
2983 
2984   // propagate sum of both multiplications into carry:tmp4:tmp3
2985   adds(tmp3, tmp3, carry);
2986   adc(tmp4, tmp4, zr);
2987   adds(tmp3, tmp3, rscratch1);
2988   adcs(tmp4, tmp4, tmp);
2989   adc(carry, carry2, zr);
2990   adds(tmp4, tmp4, rscratch2);
2991   adc(carry, carry, zr);
2992 
2993   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2994   ror(tmp4, tmp4, 32);
2995   stp(tmp4, tmp3, Address(tmp6, 0));
2996 
2997   b(L_third_loop);
2998   bind (L_third_loop_exit);
2999 
3000   andw (idx, idx, 0x3);
3001   cbz(idx, L_post_third_loop_done);
3002 
3003   Label L_check_1;
3004   subsw(idx, idx, 2);
3005   br(Assembler::MI, L_check_1);
3006 
3007   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3008   ldr(yz_idx1, Address(rscratch1, 0));
3009   ror(yz_idx1, yz_idx1, 32);
3010   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
3011   umulh(tmp4, product_hi, yz_idx1);
3012   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3013   ldr(yz_idx2, Address(rscratch1, 0));
3014   ror(yz_idx2, yz_idx2, 32);
3015 
3016   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
3017 
3018   ror(tmp3, tmp3, 32);
3019   str(tmp3, Address(rscratch1, 0));
3020 
3021   bind (L_check_1);
3022 
3023   andw (idx, idx, 0x1);
3024   subsw(idx, idx, 1);
3025   br(Assembler::MI, L_post_third_loop_done);
3026   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3027   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
3028   umulh(carry2, tmp4, product_hi);
3029   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3030 
  add2_with_carry(carry2, carry2, tmp3, tmp4, carry);
3032 
3033   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3034   extr(carry, carry2, tmp3, 32);
3035 
3036   bind(L_post_third_loop_done);
3037 }
3038 
3039 /**
 * Code for the BigInteger::multiplyToLen() intrinsic.
3041  *
3042  * r0: x
3043  * r1: xlen
3044  * r2: y
3045  * r3: ylen
3046  * r4:  z
3047  * r5: zlen
3048  * r10: tmp1
3049  * r11: tmp2
3050  * r12: tmp3
3051  * r13: tmp4
3052  * r14: tmp5
3053  * r15: tmp6
3054  * r16: tmp7
3055  *
3056  */
3057 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3058                                      Register z, Register zlen,
3059                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3060                                      Register tmp5, Register tmp6, Register product_hi) {
3061 
3062   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3063 
3064   const Register idx = tmp1;
3065   const Register kdx = tmp2;
3066   const Register xstart = tmp3;
3067 
3068   const Register y_idx = tmp4;
3069   const Register carry = tmp5;
3070   const Register product  = xlen;
3071   const Register x_xstart = zlen;  // reuse register
3072 
3073   // First Loop.
3074   //
3075   //  final static long LONG_MASK = 0xffffffffL;
3076   //  int xstart = xlen - 1;
3077   //  int ystart = ylen - 1;
3078   //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3080   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3081   //    z[kdx] = (int)product;
3082   //    carry = product >>> 32;
3083   //  }
3084   //  z[xstart] = (int)carry;
3085   //
3086 
3087   movw(idx, ylen);      // idx = ylen;
3088   movw(kdx, zlen);      // kdx = xlen+ylen;
3089   mov(carry, zr);       // carry = 0;
3090 
3091   Label L_done;
3092 
3093   movw(xstart, xlen);
3094   subsw(xstart, xstart, 1);
3095   br(Assembler::MI, L_done);
3096 
3097   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3098 
3099   Label L_second_loop;
3100   cbzw(kdx, L_second_loop);
3101 
3102   Label L_carry;
3103   subw(kdx, kdx, 1);
3104   cbzw(kdx, L_carry);
3105 
3106   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3107   lsr(carry, carry, 32);
3108   subw(kdx, kdx, 1);
3109 
3110   bind(L_carry);
3111   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3112 
3113   // Second and third (nested) loops.
3114   //
3115   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3116   //   carry = 0;
3117   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3118   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3119   //                    (z[k] & LONG_MASK) + carry;
3120   //     z[k] = (int)product;
3121   //     carry = product >>> 32;
3122   //   }
3123   //   z[i] = (int)carry;
3124   // }
3125   //
3126   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3127 
3128   const Register jdx = tmp1;
3129 
3130   bind(L_second_loop);
3131   mov(carry, zr);                // carry = 0;
3132   movw(jdx, ylen);               // j = ystart+1
3133 
3134   subsw(xstart, xstart, 1);      // i = xstart-1;
3135   br(Assembler::MI, L_done);
3136 
3137   str(z, Address(pre(sp, -4 * wordSize)));
3138 
3139   Label L_last_x;
3140   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3141   subsw(xstart, xstart, 1);       // i = xstart-1;
3142   br(Assembler::MI, L_last_x);
3143 
3144   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3145   ldr(product_hi, Address(rscratch1));
3146   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3147 
3148   Label L_third_loop_prologue;
3149   bind(L_third_loop_prologue);
3150 
3151   str(ylen, Address(sp, wordSize));
3152   stp(x, xstart, Address(sp, 2 * wordSize));
3153   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3154                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3155   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3156   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3157 
3158   addw(tmp3, xlen, 1);
3159   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3160   subsw(tmp3, tmp3, 1);
3161   br(Assembler::MI, L_done);
3162 
3163   lsr(carry, carry, 32);
3164   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3165   b(L_second_loop);
3166 
  // The following infrequently executed code is placed outside the loops.
3168   bind(L_last_x);
3169   ldrw(product_hi, Address(x,  0));
3170   b(L_third_loop_prologue);
3171 
3172   bind(L_done);
3173 }
3174 
// Code for the BigInteger::mulAdd intrinsic
3176 // out     = r0
3177 // in      = r1
3178 // offset  = r2  (already out.length-offset)
3179 // len     = r3
3180 // k       = r4
3181 //
3182 // pseudo code from java implementation:
3183 // carry = 0;
3184 // offset = out.length-offset - 1;
3185 // for (int j=len-1; j >= 0; j--) {
3186 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3187 //     out[offset--] = (int)product;
3188 //     carry = product >>> 32;
3189 // }
3190 // return (int)carry;
3191 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3192       Register len, Register k) {
3193     Label LOOP, END;
3194     // pre-loop
    cmp(len, zr); // cmp, not cbz/cbnz: the condition is used twice => fewer branches
3196     csel(out, zr, out, Assembler::EQ);
3197     br(Assembler::EQ, END);
3198     add(in, in, len, LSL, 2); // in[j+1] address
3199     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3200     mov(out, zr); // used to keep carry now
3201     BIND(LOOP);
3202     ldrw(rscratch1, Address(pre(in, -4)));
3203     madd(rscratch1, rscratch1, k, out);
3204     ldrw(rscratch2, Address(pre(offset, -4)));
3205     add(rscratch1, rscratch1, rscratch2);
3206     strw(rscratch1, Address(offset));
3207     lsr(out, rscratch1, 32);
3208     subs(len, len, 1);
3209     br(Assembler::NE, LOOP);
3210     BIND(END);
3211 }
3212 
3213 /**
3214  * Emits code to update CRC-32 with a byte value according to constants in table
3215  *
3216  * @param [in,out]crc   Register containing the crc.
3217  * @param [in]val       Register containing the byte to fold into the CRC.
3218  * @param [in]table     Register containing the table of crc constants.
3219  *
3220  * uint32_t crc;
3221  * val = crc_table[(val ^ crc) & 0xFF];
3222  * crc = val ^ (crc >> 8);
3223  *
3224  */
3225 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3226   eor(val, val, crc);
3227   andr(val, val, 0xff);
3228   ldrw(val, Address(table, val, Address::lsl(2)));
3229   eor(crc, val, crc, Assembler::LSR, 8);
3230 }
3231 
3232 /**
3233  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3234  *
3235  * @param [in,out]crc   Register containing the crc.
 * @param [in]v         Register containing the 32-bit value to fold into the CRC.
3237  * @param [in]table0    Register containing table 0 of crc constants.
3238  * @param [in]table1    Register containing table 1 of crc constants.
3239  * @param [in]table2    Register containing table 2 of crc constants.
3240  * @param [in]table3    Register containing table 3 of crc constants.
3241  *
3242  * uint32_t crc;
3243  *   v = crc ^ v
3244  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3245  *
3246  */
3247 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3248         Register table0, Register table1, Register table2, Register table3,
3249         bool upper) {
3250   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3251   uxtb(tmp, v);
3252   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3253   ubfx(tmp, v, 8, 8);
3254   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3255   eor(crc, crc, tmp);
3256   ubfx(tmp, v, 16, 8);
3257   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3258   eor(crc, crc, tmp);
3259   ubfx(tmp, v, 24, 8);
3260   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3261   eor(crc, crc, tmp);
3262 }
3263 
3264 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3265         Register len, Register tmp0, Register tmp1, Register tmp2,
3266         Register tmp3) {
3267     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3268     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3269 
3270     mvnw(crc, crc);
3271 
3272     subs(len, len, 128);
3273     br(Assembler::GE, CRC_by64_pre);
3274   BIND(CRC_less64);
3275     adds(len, len, 128-32);
3276     br(Assembler::GE, CRC_by32_loop);
3277   BIND(CRC_less32);
3278     adds(len, len, 32-4);
3279     br(Assembler::GE, CRC_by4_loop);
3280     adds(len, len, 4);
3281     br(Assembler::GT, CRC_by1_loop);
3282     b(L_exit);
3283 
3284   BIND(CRC_by32_loop);
3285     ldp(tmp0, tmp1, Address(post(buf, 16)));
3286     subs(len, len, 32);
3287     crc32x(crc, crc, tmp0);
3288     ldr(tmp2, Address(post(buf, 8)));
3289     crc32x(crc, crc, tmp1);
3290     ldr(tmp3, Address(post(buf, 8)));
3291     crc32x(crc, crc, tmp2);
3292     crc32x(crc, crc, tmp3);
3293     br(Assembler::GE, CRC_by32_loop);
3294     cmn(len, 32);
3295     br(Assembler::NE, CRC_less32);
3296     b(L_exit);
3297 
3298   BIND(CRC_by4_loop);
3299     ldrw(tmp0, Address(post(buf, 4)));
3300     subs(len, len, 4);
3301     crc32w(crc, crc, tmp0);
3302     br(Assembler::GE, CRC_by4_loop);
3303     adds(len, len, 4);
3304     br(Assembler::LE, L_exit);
3305   BIND(CRC_by1_loop);
3306     ldrb(tmp0, Address(post(buf, 1)));
3307     subs(len, len, 1);
3308     crc32b(crc, crc, tmp0);
3309     br(Assembler::GT, CRC_by1_loop);
3310     b(L_exit);
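
  // The by-64 loop below is software-pipelined: each ldr fetches data for a
  // later crc32x while the current one executes, hiding load latency; the
  // pre-block primes tmp0..tmp3 before the loop is entered.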
3311 
3312   BIND(CRC_by64_pre);
3313     sub(buf, buf, 8);
3314     ldp(tmp0, tmp1, Address(buf, 8));
3315     crc32x(crc, crc, tmp0);
3316     ldr(tmp2, Address(buf, 24));
3317     crc32x(crc, crc, tmp1);
3318     ldr(tmp3, Address(buf, 32));
3319     crc32x(crc, crc, tmp2);
3320     ldr(tmp0, Address(buf, 40));
3321     crc32x(crc, crc, tmp3);
3322     ldr(tmp1, Address(buf, 48));
3323     crc32x(crc, crc, tmp0);
3324     ldr(tmp2, Address(buf, 56));
3325     crc32x(crc, crc, tmp1);
3326     ldr(tmp3, Address(pre(buf, 64)));
3327 
3328     b(CRC_by64_loop);
3329 
3330     align(CodeEntryAlignment);
3331   BIND(CRC_by64_loop);
3332     subs(len, len, 64);
3333     crc32x(crc, crc, tmp2);
3334     ldr(tmp0, Address(buf, 8));
3335     crc32x(crc, crc, tmp3);
3336     ldr(tmp1, Address(buf, 16));
3337     crc32x(crc, crc, tmp0);
3338     ldr(tmp2, Address(buf, 24));
3339     crc32x(crc, crc, tmp1);
3340     ldr(tmp3, Address(buf, 32));
3341     crc32x(crc, crc, tmp2);
3342     ldr(tmp0, Address(buf, 40));
3343     crc32x(crc, crc, tmp3);
3344     ldr(tmp1, Address(buf, 48));
3345     crc32x(crc, crc, tmp0);
3346     ldr(tmp2, Address(buf, 56));
3347     crc32x(crc, crc, tmp1);
3348     ldr(tmp3, Address(pre(buf, 64)));
3349     br(Assembler::GE, CRC_by64_loop);
3350 
3351     // post-loop
3352     crc32x(crc, crc, tmp2);
3353     crc32x(crc, crc, tmp3);
3354 
3355     sub(len, len, 64);
3356     add(buf, buf, 8);
3357     cmn(len, 128);
3358     br(Assembler::NE, CRC_less64);
3359   BIND(L_exit);
3360     mvnw(crc, crc);
3361 }
3362 
3363 /**
3364  * @param crc   register containing existing CRC (32-bit)
3365  * @param buf   register pointing to input byte buffer (byte*)
3366  * @param len   register containing number of bytes
 * @param table0, table1, table2, table3  registers that will contain the CRC table addresses
 * @param tmp, tmp2, tmp3                 scratch registers
3369  */
3370 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3371         Register table0, Register table1, Register table2, Register table3,
3372         Register tmp, Register tmp2, Register tmp3) {
3373   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3374   unsigned long offset;
3375 
3376   if (UseCRC32) {
3377       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3378       return;
3379   }
3380 
3381     mvnw(crc, crc);
3382 
3383     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3384     if (offset) add(table0, table0, offset);
3385     add(table1, table0, 1*256*sizeof(juint));
3386     add(table2, table0, 2*256*sizeof(juint));
3387     add(table3, table0, 3*256*sizeof(juint));
3388 
3389   if (UseNeon) {
3390       cmp(len, (u1)64);
3391       br(Assembler::LT, L_by16);
3392       eor(v16, T16B, v16, v16);
3393 
3394     Label L_fold;
3395 
3396       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3397 
3398       ld1(v0, v1, T2D, post(buf, 32));
3399       ld1r(v4, T2D, post(tmp, 8));
3400       ld1r(v5, T2D, post(tmp, 8));
3401       ld1r(v6, T2D, post(tmp, 8));
3402       ld1r(v7, T2D, post(tmp, 8));
3403       mov(v16, T4S, 0, crc);
3404 
3405       eor(v0, T16B, v0, v16);
3406       sub(len, len, 64);
3407 
3408     BIND(L_fold);
3409       pmull(v22, T8H, v0, v5, T8B);
3410       pmull(v20, T8H, v0, v7, T8B);
3411       pmull(v23, T8H, v0, v4, T8B);
3412       pmull(v21, T8H, v0, v6, T8B);
3413 
3414       pmull2(v18, T8H, v0, v5, T16B);
3415       pmull2(v16, T8H, v0, v7, T16B);
3416       pmull2(v19, T8H, v0, v4, T16B);
3417       pmull2(v17, T8H, v0, v6, T16B);
3418 
3419       uzp1(v24, T8H, v20, v22);
3420       uzp2(v25, T8H, v20, v22);
3421       eor(v20, T16B, v24, v25);
3422 
3423       uzp1(v26, T8H, v16, v18);
3424       uzp2(v27, T8H, v16, v18);
3425       eor(v16, T16B, v26, v27);
3426 
3427       ushll2(v22, T4S, v20, T8H, 8);
3428       ushll(v20, T4S, v20, T4H, 8);
3429 
3430       ushll2(v18, T4S, v16, T8H, 8);
3431       ushll(v16, T4S, v16, T4H, 8);
3432 
3433       eor(v22, T16B, v23, v22);
3434       eor(v18, T16B, v19, v18);
3435       eor(v20, T16B, v21, v20);
3436       eor(v16, T16B, v17, v16);
3437 
3438       uzp1(v17, T2D, v16, v20);
3439       uzp2(v21, T2D, v16, v20);
3440       eor(v17, T16B, v17, v21);
3441 
3442       ushll2(v20, T2D, v17, T4S, 16);
3443       ushll(v16, T2D, v17, T2S, 16);
3444 
3445       eor(v20, T16B, v20, v22);
3446       eor(v16, T16B, v16, v18);
3447 
3448       uzp1(v17, T2D, v20, v16);
3449       uzp2(v21, T2D, v20, v16);
3450       eor(v28, T16B, v17, v21);
3451 
3452       pmull(v22, T8H, v1, v5, T8B);
3453       pmull(v20, T8H, v1, v7, T8B);
3454       pmull(v23, T8H, v1, v4, T8B);
3455       pmull(v21, T8H, v1, v6, T8B);
3456 
3457       pmull2(v18, T8H, v1, v5, T16B);
3458       pmull2(v16, T8H, v1, v7, T16B);
3459       pmull2(v19, T8H, v1, v4, T16B);
3460       pmull2(v17, T8H, v1, v6, T16B);
3461 
3462       ld1(v0, v1, T2D, post(buf, 32));
3463 
3464       uzp1(v24, T8H, v20, v22);
3465       uzp2(v25, T8H, v20, v22);
3466       eor(v20, T16B, v24, v25);
3467 
3468       uzp1(v26, T8H, v16, v18);
3469       uzp2(v27, T8H, v16, v18);
3470       eor(v16, T16B, v26, v27);
3471 
3472       ushll2(v22, T4S, v20, T8H, 8);
3473       ushll(v20, T4S, v20, T4H, 8);
3474 
3475       ushll2(v18, T4S, v16, T8H, 8);
3476       ushll(v16, T4S, v16, T4H, 8);
3477 
3478       eor(v22, T16B, v23, v22);
3479       eor(v18, T16B, v19, v18);
3480       eor(v20, T16B, v21, v20);
3481       eor(v16, T16B, v17, v16);
3482 
3483       uzp1(v17, T2D, v16, v20);
3484       uzp2(v21, T2D, v16, v20);
3485       eor(v16, T16B, v17, v21);
3486 
3487       ushll2(v20, T2D, v16, T4S, 16);
3488       ushll(v16, T2D, v16, T2S, 16);
3489 
3490       eor(v20, T16B, v22, v20);
3491       eor(v16, T16B, v16, v18);
3492 
3493       uzp1(v17, T2D, v20, v16);
3494       uzp2(v21, T2D, v20, v16);
3495       eor(v20, T16B, v17, v21);
3496 
3497       shl(v16, T2D, v28, 1);
3498       shl(v17, T2D, v20, 1);
3499 
3500       eor(v0, T16B, v0, v16);
3501       eor(v1, T16B, v1, v17);
3502 
3503       subs(len, len, 32);
3504       br(Assembler::GE, L_fold);
3505 
3506       mov(crc, 0);
3507       mov(tmp, v0, T1D, 0);
3508       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3509       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3510       mov(tmp, v0, T1D, 1);
3511       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3512       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3513       mov(tmp, v1, T1D, 0);
3514       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3515       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3516       mov(tmp, v1, T1D, 1);
3517       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3518       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3519 
3520       add(len, len, 32);
3521   }
3522 
3523   BIND(L_by16);
3524     subs(len, len, 16);
3525     br(Assembler::GE, L_by16_loop);
3526     adds(len, len, 16-4);
3527     br(Assembler::GE, L_by4_loop);
3528     adds(len, len, 4);
3529     br(Assembler::GT, L_by1_loop);
3530     b(L_exit);
3531 
3532   BIND(L_by4_loop);
3533     ldrw(tmp, Address(post(buf, 4)));
3534     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3535     subs(len, len, 4);
3536     br(Assembler::GE, L_by4_loop);
3537     adds(len, len, 4);
3538     br(Assembler::LE, L_exit);
3539   BIND(L_by1_loop);
3540     subs(len, len, 1);
3541     ldrb(tmp, Address(post(buf, 1)));
3542     update_byte_crc32(crc, tmp, table0);
3543     br(Assembler::GT, L_by1_loop);
3544     b(L_exit);
3545 
3546     align(CodeEntryAlignment);
3547   BIND(L_by16_loop);
3548     subs(len, len, 16);
3549     ldp(tmp, tmp3, Address(post(buf, 16)));
3550     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3551     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3552     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3553     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3554     br(Assembler::GE, L_by16_loop);
3555     adds(len, len, 16-4);
3556     br(Assembler::GE, L_by4_loop);
3557     adds(len, len, 4);
3558     br(Assembler::GT, L_by1_loop);
3559   BIND(L_exit);
3560     mvnw(crc, crc);
3561 }
3562 
3563 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3564         Register len, Register tmp0, Register tmp1, Register tmp2,
3565         Register tmp3) {
3566     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3567     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3568 
3569     subs(len, len, 128);
3570     br(Assembler::GE, CRC_by64_pre);
3571   BIND(CRC_less64);
3572     adds(len, len, 128-32);
3573     br(Assembler::GE, CRC_by32_loop);
3574   BIND(CRC_less32);
3575     adds(len, len, 32-4);
3576     br(Assembler::GE, CRC_by4_loop);
3577     adds(len, len, 4);
3578     br(Assembler::GT, CRC_by1_loop);
3579     b(L_exit);
3580 
3581   BIND(CRC_by32_loop);
3582     ldp(tmp0, tmp1, Address(post(buf, 16)));
3583     subs(len, len, 32);
3584     crc32cx(crc, crc, tmp0);
3585     ldr(tmp2, Address(post(buf, 8)));
3586     crc32cx(crc, crc, tmp1);
3587     ldr(tmp3, Address(post(buf, 8)));
3588     crc32cx(crc, crc, tmp2);
3589     crc32cx(crc, crc, tmp3);
3590     br(Assembler::GE, CRC_by32_loop);
3591     cmn(len, 32);
3592     br(Assembler::NE, CRC_less32);
3593     b(L_exit);
3594 
3595   BIND(CRC_by4_loop);
3596     ldrw(tmp0, Address(post(buf, 4)));
3597     subs(len, len, 4);
3598     crc32cw(crc, crc, tmp0);
3599     br(Assembler::GE, CRC_by4_loop);
3600     adds(len, len, 4);
3601     br(Assembler::LE, L_exit);
3602   BIND(CRC_by1_loop);
3603     ldrb(tmp0, Address(post(buf, 1)));
3604     subs(len, len, 1);
3605     crc32cb(crc, crc, tmp0);
3606     br(Assembler::GT, CRC_by1_loop);
3607     b(L_exit);
3608 
3609   BIND(CRC_by64_pre);
3610     sub(buf, buf, 8);
3611     ldp(tmp0, tmp1, Address(buf, 8));
3612     crc32cx(crc, crc, tmp0);
3613     ldr(tmp2, Address(buf, 24));
3614     crc32cx(crc, crc, tmp1);
3615     ldr(tmp3, Address(buf, 32));
3616     crc32cx(crc, crc, tmp2);
3617     ldr(tmp0, Address(buf, 40));
3618     crc32cx(crc, crc, tmp3);
3619     ldr(tmp1, Address(buf, 48));
3620     crc32cx(crc, crc, tmp0);
3621     ldr(tmp2, Address(buf, 56));
3622     crc32cx(crc, crc, tmp1);
3623     ldr(tmp3, Address(pre(buf, 64)));
3624 
3625     b(CRC_by64_loop);
3626 
3627     align(CodeEntryAlignment);
3628   BIND(CRC_by64_loop);
3629     subs(len, len, 64);
3630     crc32cx(crc, crc, tmp2);
3631     ldr(tmp0, Address(buf, 8));
3632     crc32cx(crc, crc, tmp3);
3633     ldr(tmp1, Address(buf, 16));
3634     crc32cx(crc, crc, tmp0);
3635     ldr(tmp2, Address(buf, 24));
3636     crc32cx(crc, crc, tmp1);
3637     ldr(tmp3, Address(buf, 32));
3638     crc32cx(crc, crc, tmp2);
3639     ldr(tmp0, Address(buf, 40));
3640     crc32cx(crc, crc, tmp3);
3641     ldr(tmp1, Address(buf, 48));
3642     crc32cx(crc, crc, tmp0);
3643     ldr(tmp2, Address(buf, 56));
3644     crc32cx(crc, crc, tmp1);
3645     ldr(tmp3, Address(pre(buf, 64)));
3646     br(Assembler::GE, CRC_by64_loop);
3647 
3648     // post-loop
3649     crc32cx(crc, crc, tmp2);
3650     crc32cx(crc, crc, tmp3);
3651 
3652     sub(len, len, 64);
3653     add(buf, buf, 8);
3654     cmn(len, 128);
3655     br(Assembler::NE, CRC_less64);
3656   BIND(L_exit);
3657 }
3658 
3659 /**
3660  * @param crc   register containing existing CRC (32-bit)
3661  * @param buf   register pointing to input byte buffer (byte*)
3662  * @param len   register containing number of bytes
 * @param table0, table1, table2, table3  registers that will contain the CRC table addresses
 * @param tmp, tmp2, tmp3                 scratch registers
3665  */
3666 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3667         Register table0, Register table1, Register table2, Register table3,
3668         Register tmp, Register tmp2, Register tmp3) {
3669   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3670 }
3671 
3672 
3673 SkipIfEqual::SkipIfEqual(
3674     MacroAssembler* masm, const bool* flag_addr, bool value) {
3675   _masm = masm;
3676   unsigned long offset;
3677   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3678   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3679   _masm->cbzw(rscratch1, _label);
3680 }
3681 
3682 SkipIfEqual::~SkipIfEqual() {
3683   _masm->bind(_label);
3684 }
3685 
3686 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3687   Address adr;
3688   switch(dst.getMode()) {
3689   case Address::base_plus_offset:
3690     // This is the expected mode, although we allow all the other
3691     // forms below.
3692     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3693     break;
3694   default:
3695     lea(rscratch2, dst);
3696     adr = Address(rscratch2);
3697     break;
3698   }
3699   ldr(rscratch1, adr);
3700   add(rscratch1, rscratch1, src);
3701   str(rscratch1, adr);
3702 }
3703 
3704 void MacroAssembler::cmpptr(Register src1, Address src2) {
3705   unsigned long offset;
3706   adrp(rscratch1, src2, offset);
3707   ldr(rscratch1, Address(rscratch1, offset));
3708   cmp(src1, rscratch1);
3709 }
3710 
3711 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3712   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3713   bs->obj_equals(this, obj1, obj2);
3714 }
3715 
3716 void MacroAssembler::load_klass(Register dst, Register src) {
3717   if (UseCompressedClassPointers) {
3718     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3719     decode_klass_not_null(dst);
3720   } else {
3721     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3722   }
3723 }
3724 
3725 // ((OopHandle)result).resolve();
3726 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3727   // OopHandle::resolve is an indirection.
3728   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3729 }
3730 
3731 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3732   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  ldr(dst, Address(method, Method::const_offset()));
3734   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3735   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3736   ldr(dst, Address(dst, mirror_offset));
3737   resolve_oop_handle(dst, tmp);
3738 }
3739 
3740 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3741   if (UseCompressedClassPointers) {
3742     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3743     if (Universe::narrow_klass_base() == NULL) {
3744       cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
3745       return;
3746     } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3747                && Universe::narrow_klass_shift() == 0) {
3748       // Only the bottom 32 bits matter
3749       cmpw(trial_klass, tmp);
3750       return;
3751     }
3752     decode_klass_not_null(tmp);
3753   } else {
3754     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3755   }
3756   cmp(trial_klass, tmp);
3757 }
3758 
3759 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3760   load_klass(dst, src);
3761   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3762 }
3763 
3764 void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store release?  Concurrent GCs assume the
  // klass length is valid if the klass field is not null.
3767   if (UseCompressedClassPointers) {
3768     encode_klass_not_null(src);
3769     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3770   } else {
3771     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3772   }
3773 }
3774 
3775 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3776   if (UseCompressedClassPointers) {
3777     // Store to klass gap in destination
3778     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3779   }
3780 }
3781 
3782 // Algorithm must match CompressedOops::encode.
3783 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3784 #ifdef ASSERT
3785   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3786 #endif
3787   verify_oop(s, "broken oop in encode_heap_oop");
3788   if (Universe::narrow_oop_base() == NULL) {
3789     if (Universe::narrow_oop_shift() != 0) {
3790       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3791       lsr(d, s, LogMinObjAlignmentInBytes);
3792     } else {
3793       mov(d, s);
3794     }
3795   } else {
3796     subs(d, s, rheapbase);
3797     csel(d, d, zr, Assembler::HS);
3798     lsr(d, d, LogMinObjAlignmentInBytes);
3799 
3800     /*  Old algorithm: is this any worse?
3801     Label nonnull;
3802     cbnz(r, nonnull);
3803     sub(r, r, rheapbase);
3804     bind(nonnull);
3805     lsr(r, r, LogMinObjAlignmentInBytes);
3806     */
3807   }
3808 }
3809 
3810 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3811 #ifdef ASSERT
3812   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3813   if (CheckCompressedOops) {
3814     Label ok;
3815     cbnz(r, ok);
3816     stop("null oop passed to encode_heap_oop_not_null");
3817     bind(ok);
3818   }
3819 #endif
3820   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3821   if (Universe::narrow_oop_base() != NULL) {
3822     sub(r, r, rheapbase);
3823   }
3824   if (Universe::narrow_oop_shift() != 0) {
3825     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3826     lsr(r, r, LogMinObjAlignmentInBytes);
3827   }
3828 }
3829 
3830 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3831 #ifdef ASSERT
3832   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3833   if (CheckCompressedOops) {
3834     Label ok;
3835     cbnz(src, ok);
3836     stop("null oop passed to encode_heap_oop_not_null2");
3837     bind(ok);
3838   }
3839 #endif
3840   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3841 
3842   Register data = src;
3843   if (Universe::narrow_oop_base() != NULL) {
3844     sub(dst, src, rheapbase);
3845     data = dst;
3846   }
3847   if (Universe::narrow_oop_shift() != 0) {
3848     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3849     lsr(dst, data, LogMinObjAlignmentInBytes);
3850     data = dst;
3851   }
3852   if (data == src)
3853     mov(dst, src);
3854 }
3855 
3856 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3857 #ifdef ASSERT
3858   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3859 #endif
3860   if (Universe::narrow_oop_base() == NULL) {
3861     if (Universe::narrow_oop_shift() != 0 || d != s) {
3862       lsl(d, s, Universe::narrow_oop_shift());
3863     }
3864   } else {
3865     Label done;
3866     if (d != s)
3867       mov(d, s);
3868     cbz(s, done);
3869     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3870     bind(done);
3871   }
3872   verify_oop(d, "broken oop in decode_heap_oop");
3873 }
3874 
3875 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3876   assert (UseCompressedOops, "should only be used for compressed headers");
3877   assert (Universe::heap() != NULL, "java heap should be initialized");
3878   // Cannot assert, unverified entry point counts instructions (see .ad file)
3879   // vtableStubs also counts instructions in pd_code_size_limit.
3880   // Also do not verify_oop as this is called by verify_oop.
3881   if (Universe::narrow_oop_shift() != 0) {
3882     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3883     if (Universe::narrow_oop_base() != NULL) {
3884       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3885     } else {
3886       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3887     }
3888   } else {
3889     assert (Universe::narrow_oop_base() == NULL, "sanity");
3890   }
3891 }
3892 
3893 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3894   assert (UseCompressedOops, "should only be used for compressed headers");
3895   assert (Universe::heap() != NULL, "java heap should be initialized");
3896   // Cannot assert, unverified entry point counts instructions (see .ad file)
3897   // vtableStubs also counts instructions in pd_code_size_limit.
3898   // Also do not verify_oop as this is called by verify_oop.
3899   if (Universe::narrow_oop_shift() != 0) {
3900     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3901     if (Universe::narrow_oop_base() != NULL) {
3902       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3903     } else {
3904       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3905     }
3906   } else {
3907     assert (Universe::narrow_oop_base() == NULL, "sanity");
3908     if (dst != src) {
3909       mov(dst, src);
3910     }
3911   }
3912 }
3913 
3914 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3915   if (Universe::narrow_klass_base() == NULL) {
3916     if (Universe::narrow_klass_shift() != 0) {
3917       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3918       lsr(dst, src, LogKlassAlignmentInBytes);
3919     } else {
3920       if (dst != src) mov(dst, src);
3921     }
3922     return;
3923   }
3924 
3925   if (use_XOR_for_compressed_class_base) {
3926     if (Universe::narrow_klass_shift() != 0) {
3927       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3928       lsr(dst, dst, LogKlassAlignmentInBytes);
3929     } else {
3930       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3931     }
3932     return;
3933   }
3934 
3935   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3936       && Universe::narrow_klass_shift() == 0) {
3937     movw(dst, src);
3938     return;
3939   }
3940 
3941 #ifdef ASSERT
3942   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3943 #endif
3944 
3945   Register rbase = dst;
3946   if (dst == src) rbase = rheapbase;
3947   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3948   sub(dst, src, rbase);
3949   if (Universe::narrow_klass_shift() != 0) {
3950     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3951     lsr(dst, dst, LogKlassAlignmentInBytes);
3952   }
3953   if (dst == src) reinit_heapbase();
3954 }
3955 
3956 void MacroAssembler::encode_klass_not_null(Register r) {
3957   encode_klass_not_null(r, r);
3958 }
3959 
3960 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3961   Register rbase = dst;
3962   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3963 
3964   if (Universe::narrow_klass_base() == NULL) {
3965     if (Universe::narrow_klass_shift() != 0) {
3966       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3967       lsl(dst, src, LogKlassAlignmentInBytes);
3968     } else {
3969       if (dst != src) mov(dst, src);
3970     }
3971     return;
3972   }
3973 
3974   if (use_XOR_for_compressed_class_base) {
3975     if (Universe::narrow_klass_shift() != 0) {
3976       lsl(dst, src, LogKlassAlignmentInBytes);
3977       eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
3978     } else {
3979       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3980     }
3981     return;
3982   }
3983 
3984   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3985       && Universe::narrow_klass_shift() == 0) {
3986     if (dst != src)
3987       movw(dst, src);
3988     movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
3989     return;
3990   }
3991 
3992   // Cannot assert, unverified entry point counts instructions (see .ad file)
3993   // vtableStubs also counts instructions in pd_code_size_limit.
3994   // Also do not verify_oop as this is called by verify_oop.
3995   if (dst == src) rbase = rheapbase;
3996   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3997   if (Universe::narrow_klass_shift() != 0) {
3998     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3999     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
4000   } else {
4001     add(dst, rbase, src);
4002   }
4003   if (dst == src) reinit_heapbase();
4004 }
4005 
4006 void  MacroAssembler::decode_klass_not_null(Register r) {
4007   decode_klass_not_null(r, r);
4008 }
4009 
4010 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
4011 #ifdef ASSERT
4012   {
4013     ThreadInVMfromUnknown tiv;
4014     assert (UseCompressedOops, "should only be used for compressed oops");
4015     assert (Universe::heap() != NULL, "java heap should be initialized");
4016     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4017     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
4018   }
4019 #endif
4020   int oop_index = oop_recorder()->find_index(obj);
4021   InstructionMark im(this);
4022   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4023   code_section()->relocate(inst_mark(), rspec);
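  // 0xDEADBEEF is only a placeholder bit pattern: the oop relocation recorded
  // above allows these two instructions to be patched later with the real
  // narrow oop value.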
4024   movz(dst, 0xDEAD, 16);
4025   movk(dst, 0xBEEF);
4026 }
4027 
4028 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4029   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4030   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4031   int index = oop_recorder()->find_index(k);
4032   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
4033 
4034   InstructionMark im(this);
4035   RelocationHolder rspec = metadata_Relocation::spec(index);
4036   code_section()->relocate(inst_mark(), rspec);
4037   narrowKlass nk = Klass::encode_klass(k);
4038   movz(dst, (nk >> 16), 16);
4039   movk(dst, nk & 0xffff);
4040 }
4041 
4042 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4043                                     Register dst, Address src,
4044                                     Register tmp1, Register thread_tmp) {
4045   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4046   decorators = AccessInternal::decorator_fixup(decorators);
4047   bool as_raw = (decorators & AS_RAW) != 0;
4048   if (as_raw) {
4049     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4050   } else {
4051     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4052   }
4053 }
4054 
4055 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4056                                      Address dst, Register src,
4057                                      Register tmp1, Register thread_tmp) {
4058   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4059   decorators = AccessInternal::decorator_fixup(decorators);
4060   bool as_raw = (decorators & AS_RAW) != 0;
4061   if (as_raw) {
4062     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4063   } else {
4064     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4065   }
4066 }
4067 
4068 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
4069   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
4070   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
4071     decorators |= ACCESS_READ | ACCESS_WRITE;
4072   }
4073   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4074   return bs->resolve(this, decorators, obj);
4075 }
4076 
4077 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4078                                    Register thread_tmp, DecoratorSet decorators) {
4079   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4080 }
4081 
4082 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4083                                             Register thread_tmp, DecoratorSet decorators) {
4084   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4085 }
4086 
4087 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4088                                     Register thread_tmp, DecoratorSet decorators) {
4089   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4090 }
4091 
4092 // Used for storing NULLs.
4093 void MacroAssembler::store_heap_oop_null(Address dst) {
4094   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4095 }
4096 
4097 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4098   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4099   int index = oop_recorder()->allocate_metadata_index(obj);
4100   RelocationHolder rspec = metadata_Relocation::spec(index);
4101   return Address((address)obj, rspec);
4102 }
4103 
// Move an oop into a register.  immediate is true if we want immediate
// instructions, i.e. we are not going to patch this instruction while the
// code is being executed by another thread.  In that case we can use move
// immediates rather than the constant pool.
4108 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4109   int oop_index;
4110   if (obj == NULL) {
4111     oop_index = oop_recorder()->allocate_oop_index(obj);
4112   } else {
4113 #ifdef ASSERT
4114     {
4115       ThreadInVMfromUnknown tiv;
4116       assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
4117     }
4118 #endif
4119     oop_index = oop_recorder()->find_index(obj);
4120   }
4121   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4122   if (! immediate) {
4123     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4124     ldr_constant(dst, Address(dummy, rspec));
4125   } else
4126     mov(dst, Address((address)obj, rspec));
4127 }
4128 
4129 // Move a metadata address into a register.
4130 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4131   int oop_index;
4132   if (obj == NULL) {
4133     oop_index = oop_recorder()->allocate_metadata_index(obj);
4134   } else {
4135     oop_index = oop_recorder()->find_index(obj);
4136   }
4137   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4138   mov(dst, Address((address)obj, rspec));
4139 }
4140 
4141 Address MacroAssembler::constant_oop_address(jobject obj) {
4142 #ifdef ASSERT
4143   {
4144     ThreadInVMfromUnknown tiv;
4145     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4146     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
4147   }
4148 #endif
4149   int oop_index = oop_recorder()->find_index(obj);
4150   return Address((address)obj, oop_Relocation::spec(oop_index));
4151 }
4152 
4153 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4154 void MacroAssembler::tlab_allocate(Register obj,
4155                                    Register var_size_in_bytes,
4156                                    int con_size_in_bytes,
4157                                    Register t1,
4158                                    Register t2,
4159                                    Label& slow_case) {
4160   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4161   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4162 }
4163 
4164 // Defines obj, preserves var_size_in_bytes
4165 void MacroAssembler::eden_allocate(Register obj,
4166                                    Register var_size_in_bytes,
4167                                    int con_size_in_bytes,
4168                                    Register t1,
4169                                    Label& slow_case) {
4170   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4171   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4172 }
4173 
4174 // Zero words; len is in bytes
4175 // Destroys all registers except addr
4176 // len must be a nonzero multiple of wordSize
4177 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4178   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4179 
4180 #ifdef ASSERT
4181   { Label L;
4182     tst(len, BytesPerWord - 1);
4183     br(Assembler::EQ, L);
4184     stop("len is not a multiple of BytesPerWord");
4185     bind(L);
4186   }
4187 #endif
4188 
4189 #ifndef PRODUCT
4190   block_comment("zero memory");
4191 #endif
4192 
4193   Label loop;
4194   Label entry;
4195 
4196 //  Algorithm:
4197 //
4198 //    scratch1 = cnt & 7;
4199 //    cnt -= scratch1;
4200 //    p += scratch1;
4201 //    switch (scratch1) {
4202 //      do {
4203 //        cnt -= 8;
4204 //          p[-8] = 0;
4205 //        case 7:
4206 //          p[-7] = 0;
4207 //        case 6:
4208 //          p[-6] = 0;
4209 //          // ...
4210 //        case 1:
4211 //          p[-1] = 0;
4212 //        case 0:
4213 //          p += 8;
4214 //      } while (cnt);
4215 //    }
4216 
4217   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4218 
4219   lsr(len, len, LogBytesPerWord);
4220   andr(rscratch1, len, unroll - 1);  // tmp1 = cnt % unroll
4221   sub(len, len, rscratch1);      // cnt -= unroll
4222   // t1 always points to the end of the region we're about to zero
4223   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
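  // Each str below is 4 bytes, so branching rscratch1 * 4 bytes before
  // 'entry' executes exactly the last (cnt % unroll) stores on the first
  // pass through the loop body.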
4224   adr(rscratch2, entry);
4225   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4226   br(rscratch2);
4227   bind(loop);
4228   sub(len, len, unroll);
4229   for (int i = -unroll; i < 0; i++)
4230     Assembler::str(zr, Address(t1, i * wordSize));
4231   bind(entry);
4232   add(t1, t1, unroll * wordSize);
4233   cbnz(len, loop);
4234 }
4235 
4236 void MacroAssembler::verify_tlab() {
4237 #ifdef ASSERT
4238   if (UseTLAB && VerifyOops) {
4239     Label next, ok;
4240 
4241     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4242 
4243     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4244     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4245     cmp(rscratch2, rscratch1);
4246     br(Assembler::HS, next);
4247     STOP("assert(top >= start)");
4248     should_not_reach_here();
4249 
4250     bind(next);
4251     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4252     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4253     cmp(rscratch2, rscratch1);
4254     br(Assembler::HS, ok);
4255     STOP("assert(top <= end)");
4256     should_not_reach_here();
4257 
4258     bind(ok);
4259     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4260   }
4261 #endif
4262 }
4263 
// Writes to successive stack pages until the given size is reached, to check
// for stack overflow + shadow pages.  This clobbers tmp.
4266 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4267   assert_different_registers(tmp, size, rscratch1);
4268   mov(tmp, sp);
4269   // Bang stack for total size given plus shadow page size.
4270   // Bang one page at a time because large size can bang beyond yellow and
4271   // red zones.
4272   Label loop;
4273   mov(rscratch1, os::vm_page_size());
4274   bind(loop);
4275   lea(tmp, Address(tmp, -os::vm_page_size()));
4276   subsw(size, size, rscratch1);
4277   str(size, Address(tmp));
4278   br(Assembler::GT, loop);
4279 
4280   // Bang down shadow pages too.
4281   // At this point, (tmp-0) is the last address touched, so don't
4282   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4283   // was post-decremented.)  Skip this address by starting at i=1, and
4284   // touch a few more pages below.  N.B.  It is important to touch all
4285   // the way down to and including i=StackShadowPages.
4286   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
    // This could be any sized move, but since it can serve as a debugging
    // crumb, the bigger the better.
4289     lea(tmp, Address(tmp, -os::vm_page_size()));
4290     str(size, Address(tmp));
4291   }
4292 }
4293 
4294 
4295 // Move the address of the polling page into dest.
4296 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4297   if (SafepointMechanism::uses_thread_local_poll()) {
4298     ldr(dest, Address(rthread, Thread::polling_page_offset()));
4299   } else {
4300     unsigned long off;
4301     adrp(dest, Address(page, rtype), off);
4302     assert(off == 0, "polling page must be page aligned");
4303   }
4304 }
4305 
4306 // Move the address of the polling page into r, then read the polling
4307 // page.
4308 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4309   get_polling_page(r, page, rtype);
4310   return read_polling_page(r, rtype);
4311 }
4312 
4313 // Read the polling page.  The address of the polling page must
4314 // already be in r.
4315 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4316   InstructionMark im(this);
4317   code_section()->relocate(inst_mark(), rtype);
4318   ldrw(zr, Address(r, 0));
4319   return inst_mark();
4320 }
4321 
4322 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4323   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4324   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4325   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4326   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4327   long offset_low = dest_page - low_page;
4328   long offset_high = dest_page - high_page;
4329 
4330   assert(is_valid_AArch64_address(dest.target()), "bad address");
4331   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4332 
4333   InstructionMark im(this);
4334   code_section()->relocate(inst_mark(), dest.rspec());
4335   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4336   // the code cache so that if it is relocated we know it will still reach
4337   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4338     _adrp(reg1, dest.target());
4339   } else {
4340     unsigned long target = (unsigned long)dest.target();
4341     unsigned long adrp_target
4342       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
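    // adrp reaches only +/- 4GB, so fabricate a target that is reachable
    // because it shares bits 32..47 with the current pc, then overwrite
    // those bits with the real value using movk.  Targets fit in 48 bits,
    // so a single movk suffices.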
4343 
4344     _adrp(reg1, (address)adrp_target);
4345     movk(reg1, target >> 32, 32);
4346   }
4347   byte_offset = (unsigned long)dest.target() & 0xfff;
4348 }
4349 
4350 void MacroAssembler::load_byte_map_base(Register reg) {
4351   jbyte *byte_map_base =
4352     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4353 
4354   if (is_valid_AArch64_address((address)byte_map_base)) {
4355     // Strictly speaking the byte_map_base isn't an address at all,
4356     // and it might even be negative.
4357     unsigned long offset;
4358     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4359     // We expect offset to be zero with most collectors.
4360     if (offset != 0) {
4361       add(reg, reg, offset);
4362     }
4363   } else {
4364     mov(reg, (uint64_t)byte_map_base);
4365   }
4366 }
4367 
4368 void MacroAssembler::build_frame(int framesize) {
4369   assert(framesize > 0, "framesize must be > 0");
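  // The (1 << 9) and (1 << 12) thresholds below mirror encoding limits:
  // stp's scaled signed 7-bit offset reaches just under 512 bytes, and
  // sub's unsigned 12-bit immediate reaches just under 4096 bytes, so
  // anything larger must go through rscratch1.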
4370   if (framesize < ((1 << 9) + 2 * wordSize)) {
4371     sub(sp, sp, framesize);
4372     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4373     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4374   } else {
4375     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4376     if (PreserveFramePointer) mov(rfp, sp);
4377     if (framesize < ((1 << 12) + 2 * wordSize))
4378       sub(sp, sp, framesize - 2 * wordSize);
4379     else {
4380       mov(rscratch1, framesize - 2 * wordSize);
4381       sub(sp, sp, rscratch1);
4382     }
4383   }
4384 }
4385 
4386 void MacroAssembler::remove_frame(int framesize) {
4387   assert(framesize > 0, "framesize must be > 0");
4388   if (framesize < ((1 << 9) + 2 * wordSize)) {
4389     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4390     add(sp, sp, framesize);
4391   } else {
4392     if (framesize < ((1 << 12) + 2 * wordSize))
4393       add(sp, sp, framesize - 2 * wordSize);
4394     else {
4395       mov(rscratch1, framesize - 2 * wordSize);
4396       add(sp, sp, rscratch1);
4397     }
4398     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4399   }
4400 }
4401 
4402 #ifdef COMPILER2
4403 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4404 
4405 // Search for str1 in str2 and return index or -1
4406 void MacroAssembler::string_indexof(Register str2, Register str1,
4407                                     Register cnt2, Register cnt1,
4408                                     Register tmp1, Register tmp2,
4409                                     Register tmp3, Register tmp4,
4410                                     Register tmp5, Register tmp6,
4411                                     int icnt1, Register result, int ae) {
  // NOTE: tmp5 and tmp6 can be zr depending on the specific method version
4413   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4414 
4415   Register ch1 = rscratch1;
4416   Register ch2 = rscratch2;
4417   Register cnt1tmp = tmp1;
4418   Register cnt2tmp = tmp2;
4419   Register cnt1_neg = cnt1;
4420   Register cnt2_neg = cnt2;
4421   Register result_tmp = tmp4;
4422 
4423   bool isL = ae == StrIntrinsicNode::LL;
4424 
4425   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4426   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4427   int str1_chr_shift = str1_isL ? 0:1;
4428   int str2_chr_shift = str2_isL ? 0:1;
4429   int str1_chr_size = str1_isL ? 1:2;
4430   int str2_chr_size = str2_isL ? 1:2;
4431   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4432                                       (chr_insn)&MacroAssembler::ldrh;
4433   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4434                                       (chr_insn)&MacroAssembler::ldrh;
4435   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4436   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4437 
4438   // Note, inline_string_indexOf() generates checks:
4439   // if (substr.count > string.count) return -1;
4440   // if (substr.count == 0) return 0;
4441 
  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the first occurrence of the pattern in the source or
  // return -1.

  // For a larger pattern and source we use a simplified Boyer-Moore
  // algorithm; with a small pattern and source we use a linear scan.
4447 
4448   if (icnt1 == -1) {
4449     sub(result_tmp, cnt2, cnt1);
4450     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4451     br(LT, LINEARSEARCH);
4452     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
4453     subs(zr, cnt1, 256);
4454     lsr(tmp1, cnt2, 2);
4455     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
4456     br(GE, LINEARSTUB);
4457   }
4458 
// The Boyer-Moore algorithm is based on the description here:
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// This particular implementation has a few Java-specific optimizations.
4477 //
4478 // #define ASIZE 256
4479 //
4480 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4481 //       int i, j;
4482 //       unsigned c;
4483 //       unsigned char bc[ASIZE];
4484 //
4485 //       /* Preprocessing */
4486 //       for (i = 0; i < ASIZE; ++i)
4487 //          bc[i] = m;
4488 //       for (i = 0; i < m - 1; ) {
4489 //          c = x[i];
4490 //          ++i;
4491 //          // c < 256 for Latin1 string, so, no need for branch
4492 //          #ifdef PATTERN_STRING_IS_LATIN1
4493 //          bc[c] = m - i;
4494 //          #else
4495 //          if (c < ASIZE) bc[c] = m - i;
4496 //          #endif
4497 //       }
4498 //
4499 //       /* Searching */
4500 //       j = 0;
4501 //       while (j <= n - m) {
//          c = y[j + m - 1];
//          i = m - 1;
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
4506 //          // c < 256 for Latin1 string, so, no need for branch
4507 //          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) always true. Remove branch
4509 //          j += bc[y[j+m-1]];
4510 //          #endif
4511 //          #ifndef PATTERN_STRING_IS_UTF
4512 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
4513 //          if (c < ASIZE)
4514 //            j += bc[y[j+m-1]];
4515 //          else
4516 //            j += 1
4517 //          #endif
4518 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
4519 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
4520 //          if (c < ASIZE)
4521 //            j += bc[y[j+m-1]];
4522 //          else
4523 //            j += m
4524 //          #endif
4525 //       }
4526 //    }
4527 
4528   if (icnt1 == -1) {
4529     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4530         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4531     Register cnt1end = tmp2;
4532     Register str2end = cnt2;
4533     Register skipch = tmp2;
4534 
    // str1 length is >= 8, so we can read at least 1 register for the cases
    // when UTF->Latin1 conversion is not needed (8 chars for LL or 4 for UU)
    // and half a register for the UL case.  We'll re-read the last character
    // in the inner pre-loop code to keep a single load in the outer pre-loop.
4539     const int firstStep = isL ? 7 : 3;
4540 
4541     const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
4543     sub(sp, sp, ASIZE);
4544     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4545     mov(ch1, sp);
4546     BIND(BM_INIT_LOOP);
4547       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4548       subs(tmp5, tmp5, 1);
4549       br(GT, BM_INIT_LOOP);
4550 
4551       sub(cnt1tmp, cnt1, 1);
4552       mov(tmp5, str2);
4553       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4554       sub(ch2, cnt1, 1);
4555       mov(tmp3, str1);
4556     BIND(BCLOOP);
4557       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4558       if (!str1_isL) {
4559         subs(zr, ch1, ASIZE);
4560         br(HS, BCSKIP);
4561       }
4562       strb(ch2, Address(sp, ch1));
4563     BIND(BCSKIP);
4564       subs(ch2, ch2, 1);
4565       br(GT, BCLOOP);
4566 
4567       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4568       if (str1_isL == str2_isL) {
4569         // load last 8 bytes (8LL/4UU symbols)
4570         ldr(tmp6, Address(tmp6, -wordSize));
4571       } else {
4572         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
4573         // convert Latin1 to UTF. We'll have to wait until load completed, but
4574         // it's still faster than per-character loads+checks
4575         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4576         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4577         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4578         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4579         orr(ch2, ch1, ch2, LSL, 16);
4580         orr(tmp6, tmp6, tmp3, LSL, 48);
4581         orr(tmp6, tmp6, ch2, LSL, 16);
4582       }
4583     BIND(BMLOOPSTR2);
4584       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4585       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4586       if (str1_isL == str2_isL) {
        // Re-init tmp3.  This is free because it executes in parallel with
        // the load above.  The alternative is to initialize it before the
        // loop, but that hurts performance on in-order systems with 2 or
        // more ld/st pipelines.
4590         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4591       }
4592       if (!isL) { // UU/UL case
4593         lsl(ch2, cnt1tmp, 1); // offset in bytes
4594       }
4595       cmp(tmp3, skipch);
4596       br(NE, BMSKIP);
4597       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4598       mov(ch1, tmp6);
4599       if (isL) {
4600         b(BMLOOPSTR1_AFTER_LOAD);
4601       } else {
4602         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4603         b(BMLOOPSTR1_CMP);
4604       }
4605     BIND(BMLOOPSTR1);
4606       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4607       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4608     BIND(BMLOOPSTR1_AFTER_LOAD);
4609       subs(cnt1tmp, cnt1tmp, 1);
4610       br(LT, BMLOOPSTR1_LASTCMP);
4611     BIND(BMLOOPSTR1_CMP);
4612       cmp(ch1, ch2);
4613       br(EQ, BMLOOPSTR1);
4614     BIND(BMSKIP);
4615       if (!isL) {
        // If we've encountered a UTF symbol while searching a Latin1
        // pattern, we can skip cnt1 symbols.
4618         if (str1_isL != str2_isL) {
4619           mov(result_tmp, cnt1);
4620         } else {
4621           mov(result_tmp, 1);
4622         }
4623         subs(zr, skipch, ASIZE);
4624         br(HS, BMADV);
4625       }
4626       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4627     BIND(BMADV);
4628       sub(cnt1tmp, cnt1, 1);
4629       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4630       cmp(str2, str2end);
4631       br(LE, BMLOOPSTR2);
4632       add(sp, sp, ASIZE);
4633       b(NOMATCH);
4634     BIND(BMLOOPSTR1_LASTCMP);
4635       cmp(ch1, ch2);
4636       br(NE, BMSKIP);
4637     BIND(BMMATCH);
4638       sub(result, str2, tmp5);
4639       if (!str2_isL) lsr(result, result, 1);
4640       add(sp, sp, ASIZE);
4641       b(DONE);
4642 
4643     BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
4645     br(LT, LINEAR_MEDIUM);
4646     mov(result, zr);
4647     RuntimeAddress stub = NULL;
4648     if (isL) {
4649       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4650       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4651     } else if (str1_isL) {
4652       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4654     } else {
4655       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4656       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4657     }
4658     trampoline_call(stub);
4659     b(DONE);
4660   }
4661 
4662   BIND(LINEARSEARCH);
4663   {
4664     Label DO1, DO2, DO3;
4665 
4666     Register str2tmp = tmp2;
4667     Register first = tmp3;
4668 
4669     if (icnt1 == -1)
4670     {
4671         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4672 
4673         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
4674         br(LT, DOSHORT);
4675       BIND(LINEAR_MEDIUM);
4676         (this->*str1_load_1chr)(first, Address(str1));
4677         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4678         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4679         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4680         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4681 
4682       BIND(FIRST_LOOP);
4683         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4684         cmp(first, ch2);
4685         br(EQ, STR1_LOOP);
4686       BIND(STR2_NEXT);
4687         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4688         br(LE, FIRST_LOOP);
4689         b(NOMATCH);
4690 
4691       BIND(STR1_LOOP);
4692         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4693         add(cnt2tmp, cnt2_neg, str2_chr_size);
4694         br(GE, MATCH);
4695 
4696       BIND(STR1_NEXT);
4697         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4698         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4699         cmp(ch1, ch2);
4700         br(NE, STR2_NEXT);
4701         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4702         add(cnt2tmp, cnt2tmp, str2_chr_size);
4703         br(LT, STR1_NEXT);
4704         b(MATCH);
4705 
4706       BIND(DOSHORT);
4707       if (str1_isL == str2_isL) {
4708         cmp(cnt1, (u1)2);
4709         br(LT, DO1);
4710         br(GT, DO3);
4711       }
4712     }
4713 
4714     if (icnt1 == 4) {
4715       Label CH1_LOOP;
4716 
4717         (this->*load_4chr)(ch1, str1);
4718         sub(result_tmp, cnt2, 4);
4719         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4720         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4721 
4722       BIND(CH1_LOOP);
4723         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4724         cmp(ch1, ch2);
4725         br(EQ, MATCH);
4726         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4727         br(LE, CH1_LOOP);
4728         b(NOMATCH);
4729       }
4730 
4731     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4732       Label CH1_LOOP;
4733 
4734       BIND(DO2);
4735         (this->*load_2chr)(ch1, str1);
4736         if (icnt1 == 2) {
4737           sub(result_tmp, cnt2, 2);
4738         }
4739         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4740         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4741       BIND(CH1_LOOP);
4742         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4743         cmp(ch1, ch2);
4744         br(EQ, MATCH);
4745         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4746         br(LE, CH1_LOOP);
4747         b(NOMATCH);
4748     }
4749 
4750     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4751       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4752 
4753       BIND(DO3);
4754         (this->*load_2chr)(first, str1);
4755         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4756         if (icnt1 == 3) {
4757           sub(result_tmp, cnt2, 3);
4758         }
4759         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4760         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4761       BIND(FIRST_LOOP);
4762         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4763         cmpw(first, ch2);
4764         br(EQ, STR1_LOOP);
4765       BIND(STR2_NEXT);
4766         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4767         br(LE, FIRST_LOOP);
4768         b(NOMATCH);
4769 
4770       BIND(STR1_LOOP);
4771         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4772         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4773         cmp(ch1, ch2);
4774         br(NE, STR2_NEXT);
4775         b(MATCH);
4776     }
4777 
4778     if (icnt1 == -1 || icnt1 == 1) {
4779       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4780 
4781       BIND(DO1);
4782         (this->*str1_load_1chr)(ch1, str1);
4783         cmp(cnt2, (u1)8);
4784         br(LT, DO1_SHORT);
4785 
4786         sub(result_tmp, cnt2, 8/str2_chr_size);
4787         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4788         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4789         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4790 
4791         if (str2_isL) {
4792           orr(ch1, ch1, ch1, LSL, 8);
4793         }
4794         orr(ch1, ch1, ch1, LSL, 16);
4795         orr(ch1, ch1, ch1, LSL, 32);
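        // ch1 now holds the search character replicated across every byte
        // (or halfword) lane.  The loop below xors each loaded longword with
        // it, so a matching lane becomes zero, then applies the SWAR zero
        // test: (x - 0x01..01) & ~x & 0x80..80 is nonzero iff some lane of
        // x is zero; bics computes the final and-not and sets the flags.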
4796       BIND(CH1_LOOP);
4797         ldr(ch2, Address(str2, cnt2_neg));
4798         eor(ch2, ch1, ch2);
4799         sub(tmp1, ch2, tmp3);
4800         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4801         bics(tmp1, tmp1, tmp2);
4802         br(NE, HAS_ZERO);
4803         adds(cnt2_neg, cnt2_neg, 8);
4804         br(LT, CH1_LOOP);
4805 
4806         cmp(cnt2_neg, (u1)8);
4807         mov(cnt2_neg, 0);
4808         br(LT, CH1_LOOP);
4809         b(NOMATCH);
4810 
4811       BIND(HAS_ZERO);
4812         rev(tmp1, tmp1);
4813         clz(tmp1, tmp1);
4814         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4815         b(MATCH);
4816 
4817       BIND(DO1_SHORT);
4818         mov(result_tmp, cnt2);
4819         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4820         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4821       BIND(DO1_LOOP);
4822         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4823         cmpw(ch1, ch2);
4824         br(EQ, MATCH);
4825         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4826         br(LT, DO1_LOOP);
4827     }
4828   }
4829   BIND(NOMATCH);
4830     mov(result, -1);
4831     b(DONE);
4832   BIND(MATCH);
4833     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4834   BIND(DONE);
4835 }
4836 
4837 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4838 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4839 
4840 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4841                                          Register ch, Register result,
4842                                          Register tmp1, Register tmp2, Register tmp3)
4843 {
4844   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4845   Register cnt1_neg = cnt1;
4846   Register ch1 = rscratch1;
4847   Register result_tmp = rscratch2;
4848 
4849   cmp(cnt1, (u1)4);
4850   br(LT, DO1_SHORT);
4851 
4852   orr(ch, ch, ch, LSL, 16);
4853   orr(ch, ch, ch, LSL, 32);
4854 
4855   sub(cnt1, cnt1, 4);
4856   mov(result_tmp, cnt1);
4857   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4858   sub(cnt1_neg, zr, cnt1, LSL, 1);
4859 
4860   mov(tmp3, 0x0001000100010001);
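  // Same SWAR zero-detection idea as in string_indexof: the xor below makes
  // a matching halfword lane zero, and (x - 0x0001..0001) & ~x & 0x8000..8000
  // (computed via sub/orr/bics) is nonzero iff some lane is zero.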
4861 
4862   BIND(CH1_LOOP);
4863     ldr(ch1, Address(str1, cnt1_neg));
4864     eor(ch1, ch, ch1);
4865     sub(tmp1, ch1, tmp3);
4866     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4867     bics(tmp1, tmp1, tmp2);
4868     br(NE, HAS_ZERO);
4869     adds(cnt1_neg, cnt1_neg, 8);
4870     br(LT, CH1_LOOP);
4871 
4872     cmp(cnt1_neg, (u1)8);
4873     mov(cnt1_neg, 0);
4874     br(LT, CH1_LOOP);
4875     b(NOMATCH);
4876 
4877   BIND(HAS_ZERO);
4878     rev(tmp1, tmp1);
4879     clz(tmp1, tmp1);
4880     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4881     b(MATCH);
4882 
4883   BIND(DO1_SHORT);
4884     mov(result_tmp, cnt1);
4885     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4886     sub(cnt1_neg, zr, cnt1, LSL, 1);
4887   BIND(DO1_LOOP);
4888     ldrh(ch1, Address(str1, cnt1_neg));
4889     cmpw(ch, ch1);
4890     br(EQ, MATCH);
4891     adds(cnt1_neg, cnt1_neg, 2);
4892     br(LT, DO1_LOOP);
4893   BIND(NOMATCH);
4894     mov(result, -1);
4895     b(DONE);
4896   BIND(MATCH);
4897     add(result, result_tmp, cnt1_neg, ASR, 1);
4898   BIND(DONE);
4899 }
4900 
4901 // Compare strings.
4902 void MacroAssembler::string_compare(Register str1, Register str2,
4903     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4904     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4905   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4906       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4907       SHORT_LOOP_START, TAIL_CHECK;
4908 
4909   const u1 STUB_THRESHOLD = 64 + 8;
4910   bool isLL = ae == StrIntrinsicNode::LL;
4911   bool isLU = ae == StrIntrinsicNode::LU;
4912   bool isUL = ae == StrIntrinsicNode::UL;
4913 
4914   bool str1_isL = isLL || isLU;
4915   bool str2_isL = isLL || isUL;
4916 
4917   int str1_chr_shift = str1_isL ? 0 : 1;
4918   int str2_chr_shift = str2_isL ? 0 : 1;
4919   int str1_chr_size = str1_isL ? 1 : 2;
4920   int str2_chr_size = str2_isL ? 1 : 2;
4921   int minCharsInWord = isLL ? wordSize : wordSize/2;
4922 
4923   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4924   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4925                                       (chr_insn)&MacroAssembler::ldrh;
4926   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4927                                       (chr_insn)&MacroAssembler::ldrh;
4928   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4929                             (uxt_insn)&MacroAssembler::uxthw;
4930 
4931   BLOCK_COMMENT("string_compare {");
4932 
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
4935   if (!str1_isL) asrw(cnt1, cnt1, 1);
4936   if (!str2_isL) asrw(cnt2, cnt2, 1);
4937 
4938   // Compute the minimum of the string lengths and save the difference.
4939   subsw(result, cnt1, cnt2);
4940   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4941 
4942   // A very short string
4943   cmpw(cnt2, minCharsInWord);
4944   br(Assembler::LE, SHORT_STRING);
4945 
4946   // Compare longwords
4947   // load first parts of strings and finish initialization while loading
4948   {
4949     if (str1_isL == str2_isL) { // LL or UU
4950       ldr(tmp1, Address(str1));
4951       cmp(str1, str2);
4952       br(Assembler::EQ, DONE);
4953       ldr(tmp2, Address(str2));
4954       cmp(cnt2, STUB_THRESHOLD);
4955       br(GE, STUB);
4956       subsw(cnt2, cnt2, minCharsInWord);
4957       br(EQ, TAIL_CHECK);
4958       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4959       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4960       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4961     } else if (isLU) {
4962       ldrs(vtmp, Address(str1));
4963       cmp(str1, str2);
4964       br(Assembler::EQ, DONE);
4965       ldr(tmp2, Address(str2));
4966       cmp(cnt2, STUB_THRESHOLD);
4967       br(GE, STUB);
4968       subw(cnt2, cnt2, 4);
4969       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4970       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4971       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4972       zip1(vtmp, T8B, vtmp, vtmpZ);
4973       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4974       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4975       add(cnt1, cnt1, 4);
4976       fmovd(tmp1, vtmp);
4977     } else { // UL case
4978       ldr(tmp1, Address(str1));
4979       cmp(str1, str2);
4980       br(Assembler::EQ, DONE);
4981       ldrs(vtmp, Address(str2));
4982       cmp(cnt2, STUB_THRESHOLD);
4983       br(GE, STUB);
4984       subw(cnt2, cnt2, 4);
4985       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4986       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4987       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4988       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4989       zip1(vtmp, T8B, vtmp, vtmpZ);
4990       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4991       add(cnt1, cnt1, 8);
4992       fmovd(tmp2, vtmp);
4993     }
4994     adds(cnt2, cnt2, isUL ? 4 : 8);
4995     br(GE, TAIL);
4996     eor(rscratch2, tmp1, tmp2);
4997     cbnz(rscratch2, DIFFERENCE);
4998     // main loop
4999     bind(NEXT_WORD);
5000     if (str1_isL == str2_isL) {
5001       ldr(tmp1, Address(str1, cnt2));
5002       ldr(tmp2, Address(str2, cnt2));
5003       adds(cnt2, cnt2, 8);
5004     } else if (isLU) {
5005       ldrs(vtmp, Address(str1, cnt1));
5006       ldr(tmp2, Address(str2, cnt2));
5007       add(cnt1, cnt1, 4);
5008       zip1(vtmp, T8B, vtmp, vtmpZ);
5009       fmovd(tmp1, vtmp);
5010       adds(cnt2, cnt2, 8);
5011     } else { // UL
5012       ldrs(vtmp, Address(str2, cnt2));
5013       ldr(tmp1, Address(str1, cnt1));
5014       zip1(vtmp, T8B, vtmp, vtmpZ);
5015       add(cnt1, cnt1, 8);
5016       fmovd(tmp2, vtmp);
5017       adds(cnt2, cnt2, 4);
5018     }
5019     br(GE, TAIL);
5020 
5021     eor(rscratch2, tmp1, tmp2);
5022     cbz(rscratch2, NEXT_WORD);
5023     b(DIFFERENCE);
5024     bind(TAIL);
5025     eor(rscratch2, tmp1, tmp2);
5026     cbnz(rscratch2, DIFFERENCE);
5027     // Last longword.  In the case where length == 4 we compare the
5028     // same longword twice, but that's still faster than another
5029     // conditional branch.
5030     if (str1_isL == str2_isL) {
5031       ldr(tmp1, Address(str1));
5032       ldr(tmp2, Address(str2));
5033     } else if (isLU) {
5034       ldrs(vtmp, Address(str1));
5035       ldr(tmp2, Address(str2));
5036       zip1(vtmp, T8B, vtmp, vtmpZ);
5037       fmovd(tmp1, vtmp);
5038     } else { // UL
5039       ldrs(vtmp, Address(str2));
5040       ldr(tmp1, Address(str1));
5041       zip1(vtmp, T8B, vtmp, vtmpZ);
5042       fmovd(tmp2, vtmp);
5043     }
5044     bind(TAIL_CHECK);
5045     eor(rscratch2, tmp1, tmp2);
5046     cbz(rscratch2, DONE);
5047 
5048     // Find the first different characters in the longwords and
5049     // compute their difference.
5050     bind(DIFFERENCE);
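    // The longwords were loaded little-endian, so the first differing
    // character sits in the least significant differing byte.  rev + clz
    // yields its bit index from the bottom, andr rounds that down to a
    // character boundary, and lsrv brings the differing character of each
    // word into the low bits before the widening and subtraction.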
5051     rev(rscratch2, rscratch2);
5052     clz(rscratch2, rscratch2);
5053     andr(rscratch2, rscratch2, isLL ? -8 : -16);
5054     lsrv(tmp1, tmp1, rscratch2);
5055     (this->*ext_chr)(tmp1, tmp1);
5056     lsrv(tmp2, tmp2, rscratch2);
5057     (this->*ext_chr)(tmp2, tmp2);
5058     subw(result, tmp1, tmp2);
5059     b(DONE);
5060   }
5061 
5062   bind(STUB);
5063     RuntimeAddress stub = NULL;
5064     switch(ae) {
5065       case StrIntrinsicNode::LL:
5066         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
5067         break;
5068       case StrIntrinsicNode::UU:
5069         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
5070         break;
5071       case StrIntrinsicNode::LU:
5072         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
5073         break;
5074       case StrIntrinsicNode::UL:
5075         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
5076         break;
5077       default:
5078         ShouldNotReachHere();
5079      }
5080     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
5081     trampoline_call(stub);
5082     b(DONE);
5083 
5084   bind(SHORT_STRING);
5085   // Is the minimum length zero?
5086   cbz(cnt2, DONE);
  // Arrange the code so that most branches happen while loading, and the
  // next characters are loaded while the previous ones are being compared.
5089   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5090   subs(cnt2, cnt2, 1);
5091   br(EQ, SHORT_LAST_INIT);
5092   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5093   b(SHORT_LOOP_START);
5094   bind(SHORT_LOOP);
5095   subs(cnt2, cnt2, 1);
5096   br(EQ, SHORT_LAST);
5097   bind(SHORT_LOOP_START);
5098   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
5099   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
5100   cmp(tmp1, cnt1);
5101   br(NE, SHORT_LOOP_TAIL);
5102   subs(cnt2, cnt2, 1);
5103   br(EQ, SHORT_LAST2);
5104   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5105   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5106   cmp(tmp2, rscratch1);
5107   br(EQ, SHORT_LOOP);
5108   sub(result, tmp2, rscratch1);
5109   b(DONE);
5110   bind(SHORT_LOOP_TAIL);
5111   sub(result, tmp1, cnt1);
5112   b(DONE);
5113   bind(SHORT_LAST2);
5114   cmp(tmp2, rscratch1);
5115   br(EQ, DONE);
5116   sub(result, tmp2, rscratch1);
5117 
5118   b(DONE);
5119   bind(SHORT_LAST_INIT);
5120   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5121   bind(SHORT_LAST);
5122   cmp(tmp1, cnt1);
5123   br(EQ, DONE);
5124   sub(result, tmp1, cnt1);
5125 
5126   bind(DONE);
5127 
5128   BLOCK_COMMENT("} string_compare");
5129 }
5130 #endif // COMPILER2
5131 
// This method checks whether the provided byte array contains a byte with
// the highest bit set.
5133 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
    // The simple and most common case, a small aligned array that is not at
    // the end of a memory page, is handled here.  All other cases are in the
    // stub.
5136     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
5137     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5138     assert_different_registers(ary1, len, result);
5139 
5140     cmpw(len, 0);
5141     br(LE, SET_RESULT);
5142     cmpw(len, 4 * wordSize);
5143     br(GE, STUB_LONG); // size > 32 then go to stub
5144 
5145     int shift = 64 - exact_log2(os::vm_page_size());
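    // Shifting ary1 left by (64 - log2(pagesize)) leaves only its in-page
    // offset, scaled up into the top bits, so the adds below carries out
    // exactly when a 4-word read starting at ary1 would reach or cross the
    // end of the page.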
5146     lsl(rscratch1, ary1, shift);
5147     mov(rscratch2, (size_t)(4 * wordSize) << shift);
5148     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
5149     br(CS, STUB); // at the end of page then go to stub
5150     subs(len, len, wordSize);
5151     br(LT, END);
5152 
5153   BIND(LOOP);
5154     ldr(rscratch1, Address(post(ary1, wordSize)));
5155     tst(rscratch1, UPPER_BIT_MASK);
5156     br(NE, SET_RESULT);
5157     subs(len, len, wordSize);
5158     br(GE, LOOP);
5159     cmpw(len, -wordSize);
5160     br(EQ, SET_RESULT);
5161 
5162   BIND(END);
5163     ldr(result, Address(ary1));
5164     sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5165     lslv(result, result, len);
5166     tst(result, UPPER_BIT_MASK);
5167     b(SET_RESULT);
5168 
5169   BIND(STUB);
5170     RuntimeAddress has_neg =  RuntimeAddress(StubRoutines::aarch64::has_negatives());
5171     assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5172     trampoline_call(has_neg);
5173     b(DONE);
5174 
5175   BIND(STUB_LONG);
5176     RuntimeAddress has_neg_long =  RuntimeAddress(
5177             StubRoutines::aarch64::has_negatives_long());
5178     assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
5179     trampoline_call(has_neg_long);
5180     b(DONE);
5181 
5182   BIND(SET_RESULT);
5183     cset(result, NE); // set true or false
5184 
5185   BIND(DONE);
5186 }
5187 
5188 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5189                                    Register tmp4, Register tmp5, Register result,
5190                                    Register cnt1, int elem_size) {
5191   Label DONE, SAME;
5192   Register tmp1 = rscratch1;
5193   Register tmp2 = rscratch2;
5194   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5195   int elem_per_word = wordSize/elem_size;
5196   int log_elem_size = exact_log2(elem_size);
5197   int length_offset = arrayOopDesc::length_offset_in_bytes();
5198   int base_offset
5199     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5200   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5201 
5202   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5203   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5204 
5205 #ifndef PRODUCT
5206   {
5207     const char kind = (elem_size == 2) ? 'U' : 'L';
5208     char comment[64];
5209     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5210     BLOCK_COMMENT(comment);
5211   }
5212 #endif
5213 
5214   // if (a1 == a2)
5215   //     return true;
5216   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5217   br(EQ, SAME);
5218 
5219   if (UseSimpleArrayEquals) {
5220     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5221     // if (a1 == null || a2 == null)
5222     //     return false;
    // (a1 & a2) == 0 means that either some pointer is null or the pointer
    // values are very rare (probably even impossible in practice), so we
    // can save one branch in most cases.
5226     tst(a1, a2);
5227     mov(result, false);
5228     br(EQ, A_MIGHT_BE_NULL);
5229     // if (a1.length != a2.length)
5230     //      return false;
5231     bind(A_IS_NOT_NULL);
5232     ldrw(cnt1, Address(a1, length_offset));
5233     ldrw(cnt2, Address(a2, length_offset));
5234     eorw(tmp5, cnt1, cnt2);
5235     cbnzw(tmp5, DONE);
5236     lea(a1, Address(a1, base_offset));
5237     lea(a2, Address(a2, base_offset));
5238     // Check for short strings, i.e. smaller than wordSize.
5239     subs(cnt1, cnt1, elem_per_word);
5240     br(Assembler::LT, SHORT);
5241     // Main 8 byte comparison loop.
5242     bind(NEXT_WORD); {
5243       ldr(tmp1, Address(post(a1, wordSize)));
5244       ldr(tmp2, Address(post(a2, wordSize)));
5245       subs(cnt1, cnt1, elem_per_word);
5246       eor(tmp5, tmp1, tmp2);
5247       cbnz(tmp5, DONE);
5248     } br(GT, NEXT_WORD);
5249     // Last longword.  In the case where length == 4 we compare the
5250     // same longword twice, but that's still faster than another
5251     // conditional branch.
5252     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5253     // length == 4.
5254     if (log_elem_size > 0)
5255       lsl(cnt1, cnt1, log_elem_size);
5256     ldr(tmp3, Address(a1, cnt1));
5257     ldr(tmp4, Address(a2, cnt1));
5258     eor(tmp5, tmp3, tmp4);
5259     cbnz(tmp5, DONE);
5260     b(SAME);
5261     bind(A_MIGHT_BE_NULL);
5262     // in case both a1 and a2 are not-null, proceed with loads
5263     cbz(a1, DONE);
5264     cbz(a2, DONE);
5265     b(A_IS_NOT_NULL);
5266     bind(SHORT);
5267 
5268     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5269     {
5270       ldrw(tmp1, Address(post(a1, 4)));
5271       ldrw(tmp2, Address(post(a2, 4)));
5272       eorw(tmp5, tmp1, tmp2);
5273       cbnzw(tmp5, DONE);
5274     }
5275     bind(TAIL03);
5276     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5277     {
5278       ldrh(tmp3, Address(post(a1, 2)));
5279       ldrh(tmp4, Address(post(a2, 2)));
5280       eorw(tmp5, tmp3, tmp4);
5281       cbnzw(tmp5, DONE);
5282     }
5283     bind(TAIL01);
5284     if (elem_size == 1) { // Only needed when comparing byte arrays.
5285       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5286       {
5287         ldrb(tmp1, a1);
5288         ldrb(tmp2, a2);
5289         eorw(tmp5, tmp1, tmp2);
5290         cbnzw(tmp5, DONE);
5291       }
5292     }
5293   } else {
5294     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
5295         CSET_EQ, LAST_CHECK;
5296     mov(result, false);
5297     cbz(a1, DONE);
5298     ldrw(cnt1, Address(a1, length_offset));
5299     cbz(a2, DONE);
5300     ldrw(cnt2, Address(a2, length_offset));
    // On most CPUs a2 is (surprisingly) still "locked" by the ldrw above, so
    // it's faster to perform another branch before comparing a1 and a2.
5303     cmp(cnt1, (u1)elem_per_word);
5304     br(LE, SHORT); // short or same
5305     ldr(tmp3, Address(pre(a1, base_offset)));
5306     subs(zr, cnt1, stubBytesThreshold);
5307     br(GE, STUB);
5308     ldr(tmp4, Address(pre(a2, base_offset)));
5309     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
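    // tmp5 = -(array length in bits).  Variable shifts use only the low six
    // bits of the count, so lslv by tmp5 in the tail shifts out the bytes
    // that were loaded from beyond the end of the data.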
5310     cmp(cnt2, cnt1);
5311     br(NE, DONE);
5312 
5313     // Main 16 byte comparison loop with 2 exits
5314     bind(NEXT_DWORD); {
5315       ldr(tmp1, Address(pre(a1, wordSize)));
5316       ldr(tmp2, Address(pre(a2, wordSize)));
5317       subs(cnt1, cnt1, 2 * elem_per_word);
5318       br(LE, TAIL);
5319       eor(tmp4, tmp3, tmp4);
5320       cbnz(tmp4, DONE);
5321       ldr(tmp3, Address(pre(a1, wordSize)));
5322       ldr(tmp4, Address(pre(a2, wordSize)));
5323       cmp(cnt1, (u1)elem_per_word);
5324       br(LE, TAIL2);
5325       cmp(tmp1, tmp2);
5326     } br(EQ, NEXT_DWORD);
5327     b(DONE);
5328 
5329     bind(TAIL);
5330     eor(tmp4, tmp3, tmp4);
5331     eor(tmp2, tmp1, tmp2);
5332     lslv(tmp2, tmp2, tmp5);
5333     orr(tmp5, tmp4, tmp2);
5334     cmp(tmp5, zr);
5335     b(CSET_EQ);
5336 
5337     bind(TAIL2);
5338     eor(tmp2, tmp1, tmp2);
5339     cbnz(tmp2, DONE);
5340     b(LAST_CHECK);
5341 
5342     bind(STUB);
5343     ldr(tmp4, Address(pre(a2, base_offset)));
5344     cmp(cnt2, cnt1);
5345     br(NE, DONE);
5346     if (elem_size == 2) { // convert to byte counter
5347       lsl(cnt1, cnt1, 1);
5348     }
5349     eor(tmp5, tmp3, tmp4);
5350     cbnz(tmp5, DONE);
5351     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5352     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5353     trampoline_call(stub);
5354     b(DONE);
5355 
5356     bind(EARLY_OUT);
    // Here (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2),
    // so if a2 == null we must return false (0) and otherwise true; returning
    // a2 itself gives exactly that.
5359     mov(result, a2);
5360     b(DONE);
5361     bind(SHORT);
5362     cmp(cnt2, cnt1);
5363     br(NE, DONE);
5364     cbz(cnt1, SAME);
5365     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5366     ldr(tmp3, Address(a1, base_offset));
5367     ldr(tmp4, Address(a2, base_offset));
5368     bind(LAST_CHECK);
5369     eor(tmp4, tmp3, tmp4);
5370     lslv(tmp5, tmp4, tmp5);
5371     cmp(tmp5, zr);
5372     bind(CSET_EQ);
5373     cset(result, EQ);
5374     b(DONE);
5375   }
5376 
5377   bind(SAME);
5378   mov(result, true);
5379   // That's it.
5380   bind(DONE);
5381 
5382   BLOCK_COMMENT("} array_equals");
5383 }
5384 
5385 // Compare Strings
5386 
5387 // For Strings we're passed the address of the first characters in a1
5388 // and a2 and the length in cnt1.
5389 // elem_size is the element size in bytes: either 1 or 2.
5390 // There are two implementations.  For arrays >= 8 bytes, all
5391 // comparisons (including the final one, which may overlap) are
// performed 8 bytes at a time.  For strings < 8 bytes, we compare a
// word, then a halfword, and then a byte.
5394 
5395 void MacroAssembler::string_equals(Register a1, Register a2,
5396                                    Register result, Register cnt1, int elem_size)
5397 {
5398   Label SAME, DONE, SHORT, NEXT_WORD;
5399   Register tmp1 = rscratch1;
5400   Register tmp2 = rscratch2;
5401   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5402 
5403   assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
5404   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5405 
5406 #ifndef PRODUCT
5407   {
5408     const char kind = (elem_size == 2) ? 'U' : 'L';
5409     char comment[64];
5410     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5411     BLOCK_COMMENT(comment);
5412   }
5413 #endif
5414 
5415   mov(result, false);
5416 
5417   // Check for short strings, i.e. smaller than wordSize.
5418   subs(cnt1, cnt1, wordSize);
5419   br(Assembler::LT, SHORT);
5420   // Main 8 byte comparison loop.
5421   bind(NEXT_WORD); {
5422     ldr(tmp1, Address(post(a1, wordSize)));
5423     ldr(tmp2, Address(post(a2, wordSize)));
5424     subs(cnt1, cnt1, wordSize);
5425     eor(tmp1, tmp1, tmp2);
5426     cbnz(tmp1, DONE);
5427   } br(GT, NEXT_WORD);
5428   // Last longword.  In the case where length == 4 we compare the
5429   // same longword twice, but that's still faster than another
5430   // conditional branch.
5431   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5432   // length == 4.
5433   ldr(tmp1, Address(a1, cnt1));
5434   ldr(tmp2, Address(a2, cnt1));
5435   eor(tmp2, tmp1, tmp2);
5436   cbnz(tmp2, DONE);
5437   b(SAME);
5438 
5439   bind(SHORT);
5440   Label TAIL03, TAIL01;
5441 
5442   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5443   {
5444     ldrw(tmp1, Address(post(a1, 4)));
5445     ldrw(tmp2, Address(post(a2, 4)));
5446     eorw(tmp1, tmp1, tmp2);
5447     cbnzw(tmp1, DONE);
5448   }
5449   bind(TAIL03);
5450   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5451   {
5452     ldrh(tmp1, Address(post(a1, 2)));
5453     ldrh(tmp2, Address(post(a2, 2)));
5454     eorw(tmp1, tmp1, tmp2);
5455     cbnzw(tmp1, DONE);
5456   }
5457   bind(TAIL01);
5458   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5459     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5460     {
5461       ldrb(tmp1, a1);
5462       ldrb(tmp2, a2);
5463       eorw(tmp1, tmp1, tmp2);
5464       cbnzw(tmp1, DONE);
5465     }
5466   }
5467   // Arrays are equal.
5468   bind(SAME);
5469   mov(result, true);
5470 
5471   // That's it.
5472   bind(DONE);
5473   BLOCK_COMMENT("} string_equals");
5474 }
5475 
5476 
5477 // The size of the blocks erased by the zero_blocks stub.  We must
5478 // handle anything smaller than this ourselves in zero_words().
5479 const int MacroAssembler::zero_words_block_size = 8;
5480 
5481 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5482 // possible, handling small word counts locally and delegating
5483 // anything larger to the zero_blocks stub.  It is expanded many times
5484 // in compiled code, so it is important to keep it short.
5485 
5486 // ptr:   Address of a buffer to be zeroed.
5487 // cnt:   Count in HeapWords.
5488 //
5489 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5490 void MacroAssembler::zero_words(Register ptr, Register cnt)
5491 {
5492   assert(is_power_of_2(zero_words_block_size), "adjust this");
5493   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5494 
5495   BLOCK_COMMENT("zero_words {");
5496   cmp(cnt, (u1)zero_words_block_size);
5497   Label around;
5498   br(LO, around);
5499   {
5500     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5501     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5502     if (StubRoutines::aarch64::complete()) {
5503       trampoline_call(zero_blocks);
5504     } else {
5505       bl(zero_blocks);
5506     }
5507   }
5508   bind(around);
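  // Zero the remaining 0..7 words bit by bit: if bit k of cnt is set,
  // 2^k words remain at that scale, so each tbz executes or skips the
  // correspondingly sized group of stores.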
5509   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5510     Label l;
5511     tbz(cnt, exact_log2(i), l);
5512     for (int j = 0; j < i; j += 2) {
5513       stp(zr, zr, post(ptr, 16));
5514     }
5515     bind(l);
5516   }
5517   {
5518     Label l;
5519     tbz(cnt, 0, l);
5520     str(zr, Address(ptr));
5521     bind(l);
5522   }
5523   BLOCK_COMMENT("} zero_words");
5524 }
5525 
5526 // base:         Address of a buffer to be zeroed, 8 bytes aligned.
5527 // cnt:          Immediate count in HeapWords.
5528 #define SmallArraySize (18 * BytesPerLong)
5529 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
5530 {
5531   BLOCK_COMMENT("zero_words {");
5532   int i = cnt & 1;  // store any odd word to start
5533   if (i) str(zr, Address(base));
5534 
5535   if (cnt <= SmallArraySize / BytesPerLong) {
5536     for (; i < (int)cnt; i += 2)
5537       stp(zr, zr, Address(base, i * wordSize));
5538   } else {
5539     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
5540     int remainder = cnt % (2 * unroll);
5541     for (; i < remainder; i += 2)
5542       stp(zr, zr, Address(base, i * wordSize));
5543 
5544     Label loop;
5545     Register cnt_reg = rscratch1;
5546     Register loop_base = rscratch2;
5547     cnt = cnt - remainder;
5548     mov(cnt_reg, cnt);
5549     // adjust base and prebias by -2 * wordSize so we can pre-increment
5550     add(loop_base, base, (remainder - 2) * wordSize);
5551     bind(loop);
5552     sub(cnt_reg, cnt_reg, 2 * unroll);
5553     for (i = 1; i < unroll; i++)
5554       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
5555     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
5556     cbnz(cnt_reg, loop);
5557   }
5558   BLOCK_COMMENT("} zero_words");
5559 }
5560 
5561 // Zero blocks of memory by using DC ZVA.
5562 //
// Aligns the base address first sufficiently for DC ZVA, then uses
5564 // DC ZVA repeatedly for every full block.  cnt is the size to be
5565 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5566 // in cnt.
5567 //
5568 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5569 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
5570 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5571   Register tmp = rscratch1;
5572   Register tmp2 = rscratch2;
5573   int zva_length = VM_Version::zva_length();
5574   Label initial_table_end, loop_zva;
5575   Label fini;
5576 
  // Base must be 16-byte aligned.  If not, just return and let the caller handle it.
5578   tst(base, 0x0f);
5579   br(Assembler::NE, fini);
5580   // Align base with ZVA length.
5581   neg(tmp, base);
5582   andr(tmp, tmp, zva_length - 1);
5583 
5584   // tmp: the number of bytes to be filled to align the base with ZVA length.
5585   add(base, base, tmp);
5586   sub(cnt, cnt, tmp, Assembler::ASR, 3);
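  // Computed branch into the stp table below: each stp zeroes 16 bytes
  // with one 4-byte instruction, so backing up from initial_table_end by
  // tmp / 16 instructions, i.e. tmp >> 2 bytes, executes exactly the
  // stores needed to reach ZVA alignment.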
5587   adr(tmp2, initial_table_end);
5588   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5589   br(tmp2);
5590 
5591   for (int i = -zva_length + 16; i < 0; i += 16)
5592     stp(zr, zr, Address(base, i));
5593   bind(initial_table_end);
5594 
5595   sub(cnt, cnt, zva_length >> 3);
5596   bind(loop_zva);
5597   dc(Assembler::ZVA, base);
5598   subs(cnt, cnt, zva_length >> 3);
5599   add(base, base, zva_length);
5600   br(Assembler::GE, loop_zva);
5601   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5602   bind(fini);
5603 }
5604 
5605 // base:   Address of a buffer to be filled, 8 bytes aligned.
// cnt:    Count in 8-byte units.
5607 // value:  Value to be filled with.
5608 // base will point to the end of the buffer after filling.
5609 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5610 {
5611 //  Algorithm:
5612 //
5613 //    scratch1 = cnt & 7;
5614 //    cnt -= scratch1;
5615 //    p += scratch1;
5616 //    switch (scratch1) {
5617 //      do {
5618 //        cnt -= 8;
5619 //          p[-8] = v;
5620 //        case 7:
5621 //          p[-7] = v;
5622 //        case 6:
5623 //          p[-6] = v;
5624 //          // ...
5625 //        case 1:
5626 //          p[-1] = v;
5627 //        case 0:
5628 //          p += 8;
5629 //      } while (cnt);
5630 //    }
5631 
5632   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5633 
5634   Label fini, skip, entry, loop;
5635   const int unroll = 8; // Number of stp instructions we'll unroll
5636 
5637   cbz(cnt, fini);
5638   tbz(base, 3, skip);
5639   str(value, Address(post(base, 8)));
5640   sub(cnt, cnt, 1);
5641   bind(skip);
5642 
5643   andr(rscratch1, cnt, (unroll-1) * 2);
5644   sub(cnt, cnt, rscratch1);
5645   add(base, base, rscratch1, Assembler::LSL, 3);
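  // Computed branch into the unrolled stp block: each stp fills two words
  // with one 4-byte instruction, so the entry point backs up from 'entry'
  // by (rscratch1 / 2) instructions, i.e. rscratch1 << 1 bytes.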
5646   adr(rscratch2, entry);
5647   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5648   br(rscratch2);
5649 
5650   bind(loop);
5651   add(base, base, unroll * 16);
5652   for (int i = -unroll; i < 0; i++)
5653     stp(value, value, Address(base, i * 16));
5654   bind(entry);
5655   subs(cnt, cnt, unroll * 2);
5656   br(Assembler::GE, loop);
5657 
5658   tbz(cnt, 0, fini);
5659   str(value, Address(post(base, 8)));
5660   bind(fini);
5661 }
5662 
5663 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5664 // java/lang/StringUTF16.compress.
5665 void MacroAssembler::encode_iso_array(Register src, Register dst,
5666                       Register len, Register result,
5667                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5668                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5669 {
5670     Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
5671         NEXT_32_START, NEXT_32_PRFM_START;
5672     Register tmp1 = rscratch1, tmp2 = rscratch2;
5673 
5674       mov(result, len); // Save initial len
5675 
5676 #ifndef BUILTIN_SIM
5677       cmp(len, (u1)8); // handle shortest strings first
5678       br(LT, LOOP_1);
5679       cmp(len, (u1)32);
5680       br(LT, NEXT_8);
5681       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
5682       // to convert chars to bytes
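      // uzp1 concatenates the even-indexed bytes of its operands, i.e. the
      // low bytes of each little-endian 16-bit char, giving the encoded
      // result; uzp2 gathers the odd-indexed (high) bytes, which must all
      // be zero for the input to be encodable, hence the orr/fmov/cbnz test.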
5683       if (SoftwarePrefetchHintDistance >= 0) {
5684         ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5685         subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5686         br(LE, NEXT_32_START);
5687         b(NEXT_32_PRFM_START);
5688         BIND(NEXT_32_PRFM);
5689           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5690         BIND(NEXT_32_PRFM_START);
5691           prfm(Address(src, SoftwarePrefetchHintDistance));
5692           orr(v4, T16B, Vtmp1, Vtmp2);
5693           orr(v5, T16B, Vtmp3, Vtmp4);
5694           uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
5695           uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
5696           uzp2(v5, T16B, v4, v5); // high bytes
5697           umov(tmp2, v5, D, 1);
5698           fmovd(tmp1, v5);
5699           orr(tmp1, tmp1, tmp2);
5700           cbnz(tmp1, LOOP_8);
5701           stpq(Vtmp1, Vtmp3, dst);
5702           sub(len, len, 32);
5703           add(dst, dst, 32);
5704           add(src, src, 64);
5705           subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5706           br(GE, NEXT_32_PRFM);
5707           cmp(len, (u1)32);
5708           br(LT, LOOP_8);
5709         BIND(NEXT_32);
5710           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5711         BIND(NEXT_32_START);
5712       } else {
5713         BIND(NEXT_32);
5714           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5715       }
5716       prfm(Address(src, SoftwarePrefetchHintDistance));
5717       uzp1(v4, T16B, Vtmp1, Vtmp2);
5718       uzp1(v5, T16B, Vtmp3, Vtmp4);
5719       orr(Vtmp1, T16B, Vtmp1, Vtmp2);
5720       orr(Vtmp3, T16B, Vtmp3, Vtmp4);
5721       uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
5722       umov(tmp2, Vtmp1, D, 1);
5723       fmovd(tmp1, Vtmp1);
5724       orr(tmp1, tmp1, tmp2);
5725       cbnz(tmp1, LOOP_8);
5726       stpq(v4, v5, dst);
5727       sub(len, len, 32);
5728       add(dst, dst, 32);
5729       add(src, src, 64);
5730       cmp(len, (u1)32);
5731       br(GE, NEXT_32);
5732       cbz(len, DONE);
5733 
5734     BIND(LOOP_8);
5735       cmp(len, (u1)8);
5736       br(LT, LOOP_1);
5737     BIND(NEXT_8);
5738       ld1(Vtmp1, T8H, src);
5739       uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
5740       uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
5741       fmovd(tmp1, Vtmp3);
5742       cbnz(tmp1, NEXT_1);
5743       strd(Vtmp2, dst);
5744 
5745       sub(len, len, 8);
5746       add(dst, dst, 8);
5747       add(src, src, 16);
5748       cmp(len, (u1)8);
5749       br(GE, NEXT_8);
5750 
5751     BIND(LOOP_1);
5752 #endif
5753     cbz(len, DONE);
5754     BIND(NEXT_1);
5755       ldrh(tmp1, Address(post(src, 2)));
5756       tst(tmp1, 0xff00);
5757       br(NE, SET_RESULT);
5758       strb(tmp1, Address(post(dst, 1)));
5759       subs(len, len, 1);
5760       br(GT, NEXT_1);
5761 
5762     BIND(SET_RESULT);
      sub(result, result, len); // Return the index where we stopped;
                                // it equals the initial length if all
                                // characters were processed.
5766     BIND(DONE);
5767 }
5768 
5769 
5770 // Inflate byte[] array to char[].
5771 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5772                                         FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
5773                                         Register tmp4) {
5774   Label big, done, after_init, to_stub;
5775 
5776   assert_different_registers(src, dst, len, tmp4, rscratch1);
5777 
5778   fmovd(vtmp1, zr);
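  // vtmp1 stays zero throughout: zip1 interleaves the loaded Latin-1 bytes
  // with these zero bytes, widening each byte into a little-endian 16-bit
  // char.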
5779   lsrw(tmp4, len, 3);
5780   bind(after_init);
5781   cbnzw(tmp4, big);
5782   // Short string: less than 8 bytes.
5783   {
5784     Label loop, tiny;
5785 
5786     cmpw(len, 4);
5787     br(LT, tiny);
5788     // Use SIMD to do 4 bytes.
5789     ldrs(vtmp2, post(src, 4));
5790     zip1(vtmp3, T8B, vtmp2, vtmp1);
5791     subw(len, len, 4);
5792     strd(vtmp3, post(dst, 8));
5793 
5794     cbzw(len, done);
5795 
5796     // Do the remaining bytes by steam.
5797     bind(loop);
5798     ldrb(tmp4, post(src, 1));
5799     strh(tmp4, post(dst, 2));
5800     subw(len, len, 1);
5801 
5802     bind(tiny);
5803     cbnz(len, loop);
5804 
5805     b(done);
5806   }
5807 
5808   if (SoftwarePrefetchHintDistance >= 0) {
5809     bind(to_stub);
5810       RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5811       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5812       trampoline_call(stub);
5813       b(after_init);
5814   }
5815 
5816   // Unpack the bytes 8 at a time.
5817   bind(big);
5818   {
5819     Label loop, around, loop_last, loop_start;
5820 
5821     if (SoftwarePrefetchHintDistance >= 0) {
5822       const int large_loop_threshold = (64 + 16)/8;
5823       ldrd(vtmp2, post(src, 8));
5824       andw(len, len, 7);
5825       cmp(tmp4, (u1)large_loop_threshold);
5826       br(GE, to_stub);
5827       b(loop_start);
5828 
5829       bind(loop);
5830       ldrd(vtmp2, post(src, 8));
5831       bind(loop_start);
5832       subs(tmp4, tmp4, 1);
5833       br(EQ, loop_last);
5834       zip1(vtmp2, T16B, vtmp2, vtmp1);
5835       ldrd(vtmp3, post(src, 8));
5836       st1(vtmp2, T8H, post(dst, 16));
5837       subs(tmp4, tmp4, 1);
5838       zip1(vtmp3, T16B, vtmp3, vtmp1);
5839       st1(vtmp3, T8H, post(dst, 16));
5840       br(NE, loop);
5841       b(around);
5842       bind(loop_last);
5843       zip1(vtmp2, T16B, vtmp2, vtmp1);
5844       st1(vtmp2, T8H, post(dst, 16));
5845       bind(around);
5846       cbz(len, done);
5847     } else {
5848       andw(len, len, 7);
5849       bind(loop);
5850       ldrd(vtmp2, post(src, 8));
5851       sub(tmp4, tmp4, 1);
5852       zip1(vtmp3, T16B, vtmp2, vtmp1);
5853       st1(vtmp3, T8H, post(dst, 16));
5854       cbnz(tmp4, loop);
5855     }
5856   }
5857 
5858   // Do the tail of up to 8 bytes.
5859   add(src, src, len);
5860   ldrd(vtmp3, Address(src, -8));
5861   add(dst, dst, len, ext::uxtw, 1);
5862   zip1(vtmp3, T16B, vtmp3, vtmp1);
5863   strq(vtmp3, Address(dst, -16));
5864 
5865   bind(done);
5866 }
5867 
5868 // Compress char[] array to byte[].
5869 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5870                                          FloatRegister tmp1Reg, FloatRegister tmp2Reg,
5871                                          FloatRegister tmp3Reg, FloatRegister tmp4Reg,
5872                                          Register result) {
5873   encode_iso_array(src, dst, len, result,
5874                    tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
5875   cmp(len, zr);
5876   csel(result, result, zr, EQ);
5877 }
5878 
5879 // get_thread() can be called anywhere inside generated code so we
// need to save whatever non-callee-saved context might get clobbered
5881 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5882 // the call setup code.
5883 //
5884 // aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5885 //
5886 void MacroAssembler::get_thread(Register dst) {
5887   RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
5888   push(saved_regs, sp);
5889 
5890   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5891   blrt(lr, 1, 0, 1);
5892   if (dst != c_rarg0) {
5893     mov(dst, c_rarg0);
5894   }
5895 
5896   pop(saved_regs, sp);
5897 }
5898 
5899 // DMS TODO ValueType MachVVEPNode support
5900 void MacroAssembler::unpack_value_args(Compile* C) {
5901   // Not implemented
5902   guarantee(false, "Support for MachVVEPNode is not implemented");
5903 }
5904