/*
 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"
#ifdef COMPILER1
#include "c1/c1_LIRAssembler.hpp"
#endif
#ifdef COMPILER2
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target - branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                    21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                   Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}
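
// Worked example (a sketch, not tied to any particular call site): to patch
// an unconditional branch "b <target>" at address branch, the
// "Unconditional branch (immediate)" case above writes the signed word
// offset (target - branch) >> 2 into bits [25:0].  A 26-bit signed word
// offset spans +/- 2^25 words, i.e. +/- 128 MB, which is why larger code
// caches need the trampoline and far-branch sequences further down in this
// file.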

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 or 64 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  // 64 bit addresses are only enabled with Use64BitLiteralAddresses set.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;

    if (Use64BitLiteralAddresses) {
      assert(nativeInstruction_at(insn_addr+12)->is_movk(), "wrong insns in patch");
      Instruction_aarch64::patch(insn_addr+12, 20, 5, (dest >>= 16) & 0xffff);
      instructions = 4;
    }
  }
  return instructions * NativeInstruction::instruction_size;
}
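
// Worked example (a sketch): for a narrow OOP whose encoded value is
// n == 0x12345678, the movz at insn_addr receives the upper half,
// n >> 16 == 0x1234, and the movk at insn_addr+4 receives the lower half,
// n & 0xffff == 0x5678, leaving the full 32-bit narrow OOP in the
// destination register, ready for decoding.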

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
               Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk [, movk].  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch - 2nd movk missing");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch - 3rd movk missing");
    u_int64_t addr = u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32);

    // Allow for getting the target of a possible 64-bit literal address.
    if (Use64BitLiteralAddresses) {
      assert(nativeInstruction_at(insns+3)->is_movk(), "wrong insns in patch - 4th movk missing.");
      addr += u_int64_t(Instruction_aarch64::extract(insns[3], 20, 5)) << 48;
    }
    return (address) addr;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}
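
// Worked example (a sketch): decoding "adrp x0, <page>; add x0, x0, #0x123"
// at insn_addr.  The adrp case above reassembles the signed 21-bit page
// delta, so target_page == ((insn_addr + (delta << 12)) & ~0xfffull), and
// because the second instruction is an add (immediate) naming the same
// register, the function returns target_page + 0x123.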

void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}
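
// A rough analogy in C++ atomics terms (a sketch, not the actual mechanism;
// "poll_word" is a hypothetical stand-in for the per-thread polling word):
//
//   if ((poll_word.load(std::memory_order_acquire) & poll_bit) == 0) {  // ldar
//     // Any later read of SafepointSynchronize::_state is ordered after
//     // the poll, so it cannot spuriously appear synchronized.
//   }
//
// The plain ldr in safepoint_poll() gives no such ordering guarantee.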

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp and sp of the last Java frame have to be
// recorded in the (thread-local) JavaThread object. When leaving C land,
// the last Java fp has to be reset to 0. This is required to allow proper
// stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  assert(last_java_pc != NULL, "must provide a valid PC");

  adr(scratch, last_java_pc);
  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, pc() /* Patched later */, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}
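
// Range arithmetic behind far_branches() (a sketch): a direct b/bl has a
// signed 26-bit word offset, reaching +/- 128 MB, while adrp has a signed
// 21-bit page offset, reaching +/- 4 GB in 4 KB pages.  Hence the asserts
// above that ReservedCodeCacheSize stays under 4 GB: the adrp + add +
// br/blr sequence can always span the whole code cache.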

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
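
  // 64-bit mark word layout assumed below (a sketch; see markOop.hpp for
  // the authoritative definition):
  //
  //   [ JavaThread*:54 | epoch:2 | unused:1 | age:4 | biased_lock:1 | lock:2 ]
  //
  // biased_lock_pattern (0b101) occupies the low three bits, and the
  // alignment guarantee mentioned in the comment below lets JavaThread
  // pointers be or-ed directly into the mark word.
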
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result,   "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)

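// Stub layout (a sketch; the 8-byte data word is aligned by the code below):
//
//   <stub>+0:  ldr  rscratch1, <stub>+8   // pc-relative literal load of the target
//   <stub>+4:  br   rscratch1
//   <stub>+8:  <destination address, 8 bytes>
//
// NativeCallTrampolineStub::data_offset names the offset of the data word,
// which the assert in the body checks.
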
address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                   + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call target
  // - branch to it
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}
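
// C equivalent of c2bool (a sketch):
//
//   x = ((x & 0xff) != 0) ? 1 : 0;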

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


void MacroAssembler::notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler::notify(type);
    // reset_last_Java_frame(true);
  } else {
    Assembler::notify(type);
  }
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}
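
// The register-indexed case above computes, in effect (a sketch):
//
//   method_result = *(recv_klass + vtable_index * wordSize
//                     + vtable_start_offset + method_offset_in_bytes)
//
// i.e. the usual vtable dispatch load, split into a lea and an ldr.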

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer-sized words at [addr] for an occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}
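
// C equivalent of repne_scan (a sketch); on exit the condition flags are
// those of the last cmp, so the caller sees EQ if value was found and NE
// if the scan ran off the end (the caller pre-sets the flags for the
// count == 0 case, see cmp(sp, zr) in check_klass_subtype_slow_path):
//
//   while (count != 0) {
//     if (*addr++ == value) break;
//     count--;
//   }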

// scans count 4 byte words at [addr] for an occurrence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}
1224 
1225 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1226                                                    Register super_klass,
1227                                                    Register temp_reg,
1228                                                    Register temp2_reg,
1229                                                    Label* L_success,
1230                                                    Label* L_failure,
1231                                                    bool set_cond_codes) {
1232   assert_different_registers(sub_klass, super_klass, temp_reg);
1233   if (temp2_reg != noreg)
1234     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1235 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1236 
1237   Label L_fallthrough;
1238   int label_nulls = 0;
1239   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1240   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1241   assert(label_nulls <= 1, "at most one NULL in the batch");
1242 
1243   // a couple of useful fields in sub_klass:
1244   int ss_offset = in_bytes(Klass::secondary_supers_offset());
1245   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1246   Address secondary_supers_addr(sub_klass, ss_offset);
1247   Address super_cache_addr(     sub_klass, sc_offset);
1248 
1249   BLOCK_COMMENT("check_klass_subtype_slow_path");
1250 
1251   // Do a linear scan of the secondary super-klass chain.
1252   // This code is rarely used, so simplicity is a virtue here.
1253   // The repne_scan helper above clobbers fixed registers, which we must spill.
1254   // Don't worry too much about pre-existing connections with the input regs.
1255 
1256   assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
1257   assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)
1258 
1259   RegSet pushed_registers;
1260   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1261   if (!IS_A_TEMP(r5))    pushed_registers += r5;
1262 
1263   if (super_klass != r0 || UseCompressedOops) {
1264     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1265   }
1266 
1267   push(pushed_registers, sp);
1268 
1269   // Get super_klass value into r0 (even if it was in r5 or r2).
1270   if (super_klass != r0) {
1271     mov(r0, super_klass);
1272   }
1273 
1274 #ifndef PRODUCT
1275   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1276   Address pst_counter_addr(rscratch2);
1277   ldr(rscratch1, pst_counter_addr);
1278   add(rscratch1, rscratch1, 1);
1279   str(rscratch1, pst_counter_addr);
1280 #endif //PRODUCT
1281 
1282   // We will consult the secondary-super array.
1283   ldr(r5, secondary_supers_addr);
1284   // Load the array length.
1285   ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1286   // Skip to start of data.
1287   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1288 
1289   cmp(sp, zr); // Clear Z flag; SP is never zero
1290   // Scan R2 words at [R5] for an occurrence of R0.
1291   // Set NZ/Z based on last compare.
1292   repne_scan(r5, r0, r2, rscratch1);
1293 
1294   // Unspill the temp. registers:
1295   pop(pushed_registers, sp);
1296 
1297   br(Assembler::NE, *L_failure);
1298 
1299   // Success.  Cache the super we found and proceed in triumph.
1300   str(super_klass, super_cache_addr);
1301 
1302   if (L_success != &L_fallthrough) {
1303     b(*L_success);
1304   }
1305 
1306 #undef IS_A_TEMP
1307 
1308   bind(L_fallthrough);
1309 }
1310 
1311 
1312 void MacroAssembler::verify_oop(Register reg, const char* s) {
1313   if (!VerifyOops) return;
1314 
1315   // Build a message string naming the register, to pass to verify_oop_subroutine
1316   const char* b = NULL;
1317   {
1318     ResourceMark rm;
1319     stringStream ss;
1320     ss.print("verify_oop: %s: %s", reg->name(), s);
1321     b = code_string(ss.as_string());
1322   }
1323   BLOCK_COMMENT("verify_oop {");
1324 
1325   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1326   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1327 
1328   mov(r0, reg);
1329   mov(rscratch1, (address)b);
1330 
1331   // call indirectly to solve generation ordering problem
1332   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1333   ldr(rscratch2, Address(rscratch2));
1334   blr(rscratch2);
1335 
1336   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1337   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1338 
1339   BLOCK_COMMENT("} verify_oop");
1340 }
1341 
1342 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1343   if (!VerifyOops) return;
1344 
1345   const char* b = NULL;
1346   {
1347     ResourceMark rm;
1348     stringStream ss;
1349     ss.print("verify_oop_addr: %s", s);
1350     b = code_string(ss.as_string());
1351   }
1352   BLOCK_COMMENT("verify_oop_addr {");
1353 
1354   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1355   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1356 
1357   // addr may contain sp so we will have to adjust it based on the
1358   // pushes that we just did.
1359   if (addr.uses(sp)) {
1360     lea(r0, addr);
1361     ldr(r0, Address(r0, 4 * wordSize));
1362   } else {
1363     ldr(r0, addr);
1364   }
1365   mov(rscratch1, (address)b);
1366 
1367   // call indirectly to solve generation ordering problem
1368   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1369   ldr(rscratch2, Address(rscratch2));
1370   blr(rscratch2);
1371 
1372   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1373   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1374 
1375   BLOCK_COMMENT("} verify_oop_addr");
1376 }
1377 
1378 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1379                                          int extra_slot_offset) {
1380   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1381   int stackElementSize = Interpreter::stackElementSize;
1382   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1383 #ifdef ASSERT
1384   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1385   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1386 #endif
1387   if (arg_slot.is_constant()) {
1388     return Address(esp, arg_slot.as_constant() * stackElementSize
1389                    + offset);
1390   } else {
1391     add(rscratch1, esp, arg_slot.as_register(),
1392         ext::uxtx, exact_log2(stackElementSize));
1393     return Address(rscratch1, offset);
1394   }
1395 }
1396 
1397 void MacroAssembler::call_VM_leaf_base(address entry_point,
1398                                        int number_of_arguments,
1399                                        Label *retaddr) {
1400   call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
1401 }
1402 
1403 void MacroAssembler::call_VM_leaf_base1(address entry_point,
1404                                         int number_of_gp_arguments,
1405                                         int number_of_fp_arguments,
1406                                         ret_type type,
1407                                         Label *retaddr) {
1408   Label E, L;
1409 
1410   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1411 
1412   // We add 1 to number_of_gp_arguments because the thread passed in
1413   // arg0 is not counted
1414   mov(rscratch1, entry_point);
1415   blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1416   if (retaddr)
1417     bind(*retaddr);
1418 
1419   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1420   maybe_isb();
1421 }
1422 
1423 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1424   call_VM_leaf_base(entry_point, number_of_arguments);
1425 }
1426 
1427 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1428   pass_arg0(this, arg_0);
1429   call_VM_leaf_base(entry_point, 1);
1430 }
1431 
1432 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1433   pass_arg0(this, arg_0);
1434   pass_arg1(this, arg_1);
1435   call_VM_leaf_base(entry_point, 2);
1436 }
1437 
1438 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1439                                   Register arg_1, Register arg_2) {
1440   pass_arg0(this, arg_0);
1441   pass_arg1(this, arg_1);
1442   pass_arg2(this, arg_2);
1443   call_VM_leaf_base(entry_point, 3);
1444 }
1445 
1446 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1447   pass_arg0(this, arg_0);
1448   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1449 }
1450 
1451 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1452 
1453   assert(arg_0 != c_rarg1, "smashed arg");
1454   pass_arg1(this, arg_1);
1455   pass_arg0(this, arg_0);
1456   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1457 }
1458 
1459 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1460   assert(arg_0 != c_rarg2, "smashed arg");
1461   assert(arg_1 != c_rarg2, "smashed arg");
1462   pass_arg2(this, arg_2);
1463   assert(arg_0 != c_rarg1, "smashed arg");
1464   pass_arg1(this, arg_1);
1465   pass_arg0(this, arg_0);
1466   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1467 }
1468 
1469 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1470   assert(arg_0 != c_rarg3, "smashed arg");
1471   assert(arg_1 != c_rarg3, "smashed arg");
1472   assert(arg_2 != c_rarg3, "smashed arg");
1473   pass_arg3(this, arg_3);
1474   assert(arg_0 != c_rarg2, "smashed arg");
1475   assert(arg_1 != c_rarg2, "smashed arg");
1476   pass_arg2(this, arg_2);
1477   assert(arg_0 != c_rarg1, "smashed arg");
1478   pass_arg1(this, arg_1);
1479   pass_arg0(this, arg_0);
1480   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1481 }
1482 
1483 void MacroAssembler::null_check(Register reg, int offset) {
1484   if (needs_explicit_null_check(offset)) {
1485     // provoke OS NULL exception if reg = NULL by
1486     // accessing M[reg] w/o changing any registers
1487     // NOTE: this is plenty to provoke a segv
1488     ldr(zr, Address(reg));
1489   } else {
1490     // nothing to do, (later) access of M[reg + offset]
1491     // will provoke OS NULL exception if reg = NULL
1492   }
1493 }
1494 
1495 // MacroAssembler protected routines needed to implement
1496 // public methods
1497 
1498 void MacroAssembler::mov(Register r, Address dest) {
1499   code_section()->relocate(pc(), dest.rspec());
1500   u_int64_t imm64 = (u_int64_t)dest.target();
1501   movptr(r, imm64);
1502 }
1503 
1504 // Move a constant pointer into r.  On AArch64 the virtual address space
1505 // is 48 or 52 bits in size, so we need three or four instructions to
1506 // create a patchable instruction sequence that can reach anywhere.
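     //
     // For example (illustrative value), movptr(r0, 0x123456789abc) emits
     //
     //   movz r0, #0x9abc
     //   movk r0, #0x5678, lsl #16
     //   movk r0, #0x1234, lsl #32
     //
     // and adds a fourth "movk ..., lsl #48" only when
     // Use64BitLiteralAddresses requires the full 52-bit reach.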
1507 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1508 #ifndef PRODUCT
1509   {
1510     char buffer[64];
1511     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1512     block_comment(buffer);
1513   }
1514 #endif
1515   if (!Use64BitLiteralAddresses) {
1516     assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1517   }
1518   movz(r, imm64 & 0xffff);
1519   imm64 >>= 16;
1520   movk(r, imm64 & 0xffff, 16);
1521   imm64 >>= 16;
1522   movk(r, imm64 & 0xffff, 32);
1523 
1524   if (Use64BitLiteralAddresses) {
1525     imm64 >>= 16;
1526     movk(r, imm64 & 0xffff, 48);
1527   }
1528 }
1529 
1530 // Macro to mov replicated immediate to vector register.
1531 //  Vd will get the following values for different arrangements in T
1532 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1533 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1534 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1535 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1536 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1537 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1538 //   T1D/T2D: invalid
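     //  For example (illustrative values): with T4S and imm32 == hex 00120034
     //  two bytes are non-zero, so the code below emits
     //    movi Vd, T4S, 0x34
     //    orri Vd, T4S, 0x12, lsl 16
     //  and switches to mvni/bici when the bitwise complement of imm32 has
     //  fewer non-zero bytes.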
1539 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1540   assert(T != T1D && T != T2D, "invalid arrangement");
1541   if (T == T8B || T == T16B) {
1542     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1543     movi(Vd, T, imm32 & 0xff, 0);
1544     return;
1545   }
1546   u_int32_t nimm32 = ~imm32;
1547   if (T == T4H || T == T8H) {
1548     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1549     imm32 &= 0xffff;
1550     nimm32 &= 0xffff;
1551   }
1552   u_int32_t x = imm32;
1553   int movi_cnt = 0;
1554   int movn_cnt = 0;
1555   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1556   x = nimm32;
1557   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1558   if (movn_cnt < movi_cnt) imm32 = nimm32;
1559   unsigned lsl = 0;
1560   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1561   if (movn_cnt < movi_cnt)
1562     mvni(Vd, T, imm32 & 0xff, lsl);
1563   else
1564     movi(Vd, T, imm32 & 0xff, lsl);
1565   imm32 >>= 8; lsl += 8;
1566   while (imm32) {
1567     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1568     if (movn_cnt < movi_cnt)
1569       bici(Vd, T, imm32 & 0xff, lsl);
1570     else
1571       orri(Vd, T, imm32 & 0xff, lsl);
1572     lsl += 8; imm32 >>= 8;
1573   }
1574 }
1575 
1576 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1577 {
1578 #ifndef PRODUCT
1579   {
1580     char buffer[64];
1581     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1582     block_comment(buffer);
1583   }
1584 #endif
1585   if (operand_valid_for_logical_immediate(false, imm64)) {
1586     orr(dst, zr, imm64);
1587   } else {
1588     // we can use a combination of MOVZ or MOVN with
1589     // MOVK to build up the constant
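         // Illustrative example (assumed value): imm64 == 0x0000dead0000beef
         // yields imm_h == { 0xbeef, 0x0000, 0xdead, 0x0000 } and
         // zero_count == 2, so the zero_count == 2 arm below emits
         //   movz dst, #0xbeef
         //   movk dst, #0xdead, lsl #32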
1590     u_int64_t imm_h[4];
1591     int zero_count = 0;
1592     int neg_count = 0;
1593     int i;
1594     for (i = 0; i < 4; i++) {
1595       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1596       if (imm_h[i] == 0) {
1597         zero_count++;
1598       } else if (imm_h[i] == 0xffffL) {
1599         neg_count++;
1600       }
1601     }
1602     if (zero_count == 4) {
1603       // one MOVZ will do
1604       movz(dst, 0);
1605     } else if (neg_count == 4) {
1606       // one MOVN will do
1607       movn(dst, 0);
1608     } else if (zero_count == 3) {
1609       for (i = 0; i < 4; i++) {
1610         if (imm_h[i] != 0L) {
1611           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1612           break;
1613         }
1614       }
1615     } else if (neg_count == 3) {
1616       // one MOVN will do
1617       for (int i = 0; i < 4; i++) {
1618         if (imm_h[i] != 0xffffL) {
1619           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1620           break;
1621         }
1622       }
1623     } else if (zero_count == 2) {
1624       // one MOVZ and one MOVK will do
1625       for (i = 0; i < 3; i++) {
1626         if (imm_h[i] != 0L) {
1627           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1628           i++;
1629           break;
1630         }
1631       }
1632       for (;i < 4; i++) {
1633         if (imm_h[i] != 0L) {
1634           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1635         }
1636       }
1637     } else if (neg_count == 2) {
1638       // one MOVN and one MOVK will do
1639       for (i = 0; i < 4; i++) {
1640         if (imm_h[i] != 0xffffL) {
1641           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1642           i++;
1643           break;
1644         }
1645       }
1646       for (;i < 4; i++) {
1647         if (imm_h[i] != 0xffffL) {
1648           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1649         }
1650       }
1651     } else if (zero_count == 1) {
1652       // one MOVZ and two MOVKs will do
1653       for (i = 0; i < 4; i++) {
1654         if (imm_h[i] != 0L) {
1655           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1656           i++;
1657           break;
1658         }
1659       }
1660       for (;i < 4; i++) {
1661         if (imm_h[i] != 0x0L) {
1662           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1663         }
1664       }
1665     } else if (neg_count == 1) {
1666       // one MOVN and two MOVKs will do
1667       for (i = 0; i < 4; i++) {
1668         if (imm_h[i] != 0xffffL) {
1669           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1670           i++;
1671           break;
1672         }
1673       }
1674       for (;i < 4; i++) {
1675         if (imm_h[i] != 0xffffL) {
1676           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1677         }
1678       }
1679     } else {
1680       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1681       movz(dst, (u_int32_t)imm_h[0], 0);
1682       for (i = 1; i < 4; i++) {
1683         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1684       }
1685     }
1686   }
1687 }
1688 
1689 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1690 {
1691 #ifndef PRODUCT
1692     {
1693       char buffer[64];
1694       snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1695       block_comment(buffer);
1696     }
1697 #endif
1698   if (operand_valid_for_logical_immediate(true, imm32)) {
1699     orrw(dst, zr, imm32);
1700   } else {
1701     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1702     // constant
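         // For instance (illustrative value): imm32 == 0xffff1234 has
         // imm_h[1] == 0xffff, so the single instruction
         //   movnw dst, #(0x1234 ^ 0xffff)   // i.e. movnw dst, #0xedcb
         // reconstructs it.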
1703     u_int32_t imm_h[2];
1704     imm_h[0] = imm32 & 0xffff;
1705     imm_h[1] = ((imm32 >> 16) & 0xffff);
1706     if (imm_h[0] == 0) {
1707       movzw(dst, imm_h[1], 16);
1708     } else if (imm_h[0] == 0xffff) {
1709       movnw(dst, imm_h[1] ^ 0xffff, 16);
1710     } else if (imm_h[1] == 0) {
1711       movzw(dst, imm_h[0], 0);
1712     } else if (imm_h[1] == 0xffff) {
1713       movnw(dst, imm_h[0] ^ 0xffff, 0);
1714     } else {
1715       // use a MOVZ and MOVK (makes it easier to debug)
1716       movzw(dst, imm_h[0], 0);
1717       movkw(dst, imm_h[1], 16);
1718     }
1719   }
1720 }
1721 
1722 // Form an address from base + offset in Rd.  Rd may or may
1723 // not actually be used: you must use the Address that is returned.
1724 // It is up to you to ensure that the shift provided matches the size
1725 // of your data.
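     //
     // Illustrative example (assumed values): with shift == 3, a byte_offset
     // of 0x123008 does not fit a scaled 12-bit immediate, so the code below
     // splits it as
     //   add Rd, base, #0x120000
     // and returns Address(Rd, 0x3008), which the immediate form can reach.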
1726 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1727   if (Address::offset_ok_for_immed(byte_offset, shift))
1728     // It fits; no need for any heroics
1729     return Address(base, byte_offset);
1730 
1731   // Don't do anything clever with negative or misaligned offsets
1732   unsigned mask = (1 << shift) - 1;
1733   if (byte_offset < 0 || byte_offset & mask) {
1734     mov(Rd, byte_offset);
1735     add(Rd, base, Rd);
1736     return Address(Rd);
1737   }
1738 
1739   // See if we can do this with two 12-bit offsets
1740   {
1741     unsigned long word_offset = byte_offset >> shift;
1742     unsigned long masked_offset = word_offset & 0xfff000;
1743     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1744         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1745       add(Rd, base, masked_offset << shift);
1746       word_offset -= masked_offset;
1747       return Address(Rd, word_offset << shift);
1748     }
1749   }
1750 
1751   // Do it the hard way
1752   mov(Rd, byte_offset);
1753   add(Rd, base, Rd);
1754   return Address(Rd);
1755 }
1756 
1757 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1758   if (UseLSE) {
1759     mov(tmp, 1);
1760     ldadd(Assembler::word, tmp, zr, counter_addr);
1761     return;
1762   }
1763   Label retry_load;
1764   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1765     prfm(Address(counter_addr), PSTL1STRM);
1766   bind(retry_load);
1767   // flush and load exclusive from the memory location
1768   ldxrw(tmp, counter_addr);
1769   addw(tmp, tmp, 1);
1770   // if we store+flush with no intervening write tmp2 will be zero
1771   stxrw(tmp2, tmp, counter_addr);
1772   cbnzw(tmp2, retry_load);
1773 }
1774 
1775 
1776 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1777                                     bool want_remainder, Register scratch)
1778 {
1779   // Full implementation of Java idiv and irem.  The function
1780   // returns the (pc) offset of the div instruction - may be needed
1781   // for implicit exceptions.
1782   //
1783   // constraint : ra and rb must differ from scratch
1784   //         normal case:
1785   //
1786   // input : ra: dividend
1787   //         rb: divisor
1788   //
1789   // result: either
1790   //         quotient  (= ra idiv rb)
1791   //         remainder (= ra irem rb)
1792 
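       // For the remainder we use the identity
       //   ra irem rb == ra - (ra sdiv rb) * rb
       // which the sdivw + msubw pair below computes directly.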
1793   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1794 
1795   int idivl_offset = offset();
1796   if (! want_remainder) {
1797     sdivw(result, ra, rb);
1798   } else {
1799     sdivw(scratch, ra, rb);
1800     Assembler::msubw(result, scratch, rb, ra);
1801   }
1802 
1803   return idivl_offset;
1804 }
1805 
1806 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1807                                     bool want_remainder, Register scratch)
1808 {
1809   // Full implementation of Java ldiv and lrem.  The function
1810   // returns the (pc) offset of the div instruction - may be needed
1811   // for implicit exceptions.
1812   //
1813   // constraint : ra and rb must differ from scratch
1814   //         normal case:
1815   //
1816   // input : ra: dividend
1817   //         rb: divisor
1818   //
1819   // result: either
1820   //         quotient  (= ra idiv rb)
1821   //         remainder (= ra irem rb)
1822 
1823   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1824 
1825   int idivq_offset = offset();
1826   if (! want_remainder) {
1827     sdiv(result, ra, rb);
1828   } else {
1829     sdiv(scratch, ra, rb);
1830     Assembler::msub(result, scratch, rb, ra);
1831   }
1832 
1833   return idivq_offset;
1834 }
1835 
1836 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1837   address prev = pc() - NativeMembar::instruction_size;
1838   address last = code()->last_insn();
1839   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1840     NativeMembar *bar = NativeMembar_at(prev);
1841     // We are merging two memory barrier instructions.  On AArch64 we
1842     // can do this simply by ORing them together.
1843     bar->set_kind(bar->get_kind() | order_constraint);
1844     BLOCK_COMMENT("merged membar");
1845   } else {
1846     code()->set_last_insn(pc());
1847     dmb(Assembler::barrier(order_constraint));
1848   }
1849 }
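
     // For example (illustrative), two back-to-back calls such as
     //
     //   membar(LoadLoad);
     //   membar(LoadStore);
     //
     // are merged above into a single DMB whose kind is the OR of the two
     // order constraints, rather than two separate barriers.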
1850 
1851 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1852   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1853     merge_ldst(rt, adr, size_in_bytes, is_store);
1854     code()->clear_last_insn();
1855     return true;
1856   } else {
1857     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
1858     const unsigned mask = size_in_bytes - 1;
1859     if (adr.getMode() == Address::base_plus_offset &&
1860         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1861       code()->set_last_insn(pc());
1862     }
1863     return false;
1864   }
1865 }
1866 
1867 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1868   // We always try to merge two adjacent loads into one ldp.
1869   if (!try_merge_ldst(Rx, adr, 8, false)) {
1870     Assembler::ldr(Rx, adr);
1871   }
1872 }
1873 
1874 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1875   // We always try to merge two adjacent loads into one ldp.
1876   if (!try_merge_ldst(Rw, adr, 4, false)) {
1877     Assembler::ldrw(Rw, adr);
1878   }
1879 }
1880 
1881 void MacroAssembler::str(Register Rx, const Address &adr) {
1882   // We always try to merge two adjacent stores into one stp.
1883   if (!try_merge_ldst(Rx, adr, 8, true)) {
1884     Assembler::str(Rx, adr);
1885   }
1886 }
1887 
1888 void MacroAssembler::strw(Register Rw, const Address &adr) {
1889   // We always try to merge two adjacent stores into one stp.
1890   if (!try_merge_ldst(Rw, adr, 4, true)) {
1891     Assembler::strw(Rw, adr);
1892   }
1893 }
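
     // For example (illustrative), the adjacent loads
     //
     //   ldr(r0, Address(sp, 16));
     //   ldr(r1, Address(sp, 24));
     //
     // become a single "ldp r0, r1, [sp, #16]" via merge_ldst(), provided
     // the base, offset and alignment checks in ldst_can_merge() pass.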
1894 
1895 // MacroAssembler routines found actually to be needed
1896 
1897 void MacroAssembler::push(Register src)
1898 {
1899   str(src, Address(pre(esp, -1 * wordSize)));
1900 }
1901 
1902 void MacroAssembler::pop(Register dst)
1903 {
1904   ldr(dst, Address(post(esp, 1 * wordSize)));
1905 }
1906 
1907 // Note: load_unsigned_short used to be called load_unsigned_word.
1908 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1909   int off = offset();
1910   ldrh(dst, src);
1911   return off;
1912 }
1913 
1914 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1915   int off = offset();
1916   ldrb(dst, src);
1917   return off;
1918 }
1919 
1920 int MacroAssembler::load_signed_short(Register dst, Address src) {
1921   int off = offset();
1922   ldrsh(dst, src);
1923   return off;
1924 }
1925 
1926 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1927   int off = offset();
1928   ldrsb(dst, src);
1929   return off;
1930 }
1931 
1932 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1933   int off = offset();
1934   ldrshw(dst, src);
1935   return off;
1936 }
1937 
1938 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1939   int off = offset();
1940   ldrsbw(dst, src);
1941   return off;
1942 }
1943 
1944 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1945   switch (size_in_bytes) {
1946   case  8:  ldr(dst, src); break;
1947   case  4:  ldrw(dst, src); break;
1948   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1949   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1950   default:  ShouldNotReachHere();
1951   }
1952 }
1953 
1954 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1955   switch (size_in_bytes) {
1956   case  8:  str(src, dst); break;
1957   case  4:  strw(src, dst); break;
1958   case  2:  strh(src, dst); break;
1959   case  1:  strb(src, dst); break;
1960   default:  ShouldNotReachHere();
1961   }
1962 }
1963 
1964 void MacroAssembler::decrementw(Register reg, int value)
1965 {
1966   if (value < 0)  { incrementw(reg, -value);      return; }
1967   if (value == 0) {                               return; }
1968   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1969   /* else */ {
1970     guarantee(reg != rscratch2, "invalid dst for register decrement");
1971     movw(rscratch2, (unsigned)value);
1972     subw(reg, reg, rscratch2);
1973   }
1974 }
1975 
1976 void MacroAssembler::decrement(Register reg, int value)
1977 {
1978   if (value < 0)  { increment(reg, -value);      return; }
1979   if (value == 0) {                              return; }
1980   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1981   /* else */ {
1982     assert(reg != rscratch2, "invalid dst for register decrement");
1983     mov(rscratch2, (unsigned long)value);
1984     sub(reg, reg, rscratch2);
1985   }
1986 }
1987 
1988 void MacroAssembler::decrementw(Address dst, int value)
1989 {
1990   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1991   if (dst.getMode() == Address::literal) {
1992     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1993     lea(rscratch2, dst);
1994     dst = Address(rscratch2);
1995   }
1996   ldrw(rscratch1, dst);
1997   decrementw(rscratch1, value);
1998   strw(rscratch1, dst);
1999 }
2000 
2001 void MacroAssembler::decrement(Address dst, int value)
2002 {
2003   assert(!dst.uses(rscratch1), "invalid address for decrement");
2004   if (dst.getMode() == Address::literal) {
2005     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2006     lea(rscratch2, dst);
2007     dst = Address(rscratch2);
2008   }
2009   ldr(rscratch1, dst);
2010   decrement(rscratch1, value);
2011   str(rscratch1, dst);
2012 }
2013 
2014 void MacroAssembler::incrementw(Register reg, int value)
2015 {
2016   if (value < 0)  { decrementw(reg, -value);      return; }
2017   if (value == 0) {                               return; }
2018   if (value < (1 << 12)) { addw(reg, reg, value); return; }
2019   /* else */ {
2020     assert(reg != rscratch2, "invalid dst for register increment");
2021     movw(rscratch2, (unsigned)value);
2022     addw(reg, reg, rscratch2);
2023   }
2024 }
2025 
2026 void MacroAssembler::increment(Register reg, int value)
2027 {
2028   if (value < 0)  { decrement(reg, -value);      return; }
2029   if (value == 0) {                              return; }
2030   if (value < (1 << 12)) { add(reg, reg, value); return; }
2031   /* else */ {
2032     assert(reg != rscratch2, "invalid dst for register increment");
2033     movw(rscratch2, (unsigned)value);
2034     add(reg, reg, rscratch2);
2035   }
2036 }
2037 
2038 void MacroAssembler::incrementw(Address dst, int value)
2039 {
2040   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2041   if (dst.getMode() == Address::literal) {
2042     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2043     lea(rscratch2, dst);
2044     dst = Address(rscratch2);
2045   }
2046   ldrw(rscratch1, dst);
2047   incrementw(rscratch1, value);
2048   strw(rscratch1, dst);
2049 }
2050 
2051 void MacroAssembler::increment(Address dst, int value)
2052 {
2053   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2054   if (dst.getMode() == Address::literal) {
2055     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2056     lea(rscratch2, dst);
2057     dst = Address(rscratch2);
2058   }
2059   ldr(rscratch1, dst);
2060   increment(rscratch1, value);
2061   str(rscratch1, dst);
2062 }
2063 
2064 
2065 void MacroAssembler::pusha() {
2066   push(0x7fffffff, sp);
2067 }
2068 
2069 void MacroAssembler::popa() {
2070   pop(0x7fffffff, sp);
2071 }
2072 
2073 // Push lots of registers in the bit set supplied.  Don't push sp.
2074 // Return the number of words pushed
2075 int MacroAssembler::push(unsigned int bitset, Register stack) {
2076   int words_pushed = 0;
2077 
2078   // Scan bitset to accumulate register pairs
2079   unsigned char regs[32];
2080   int count = 0;
2081   for (int reg = 0; reg <= 30; reg++) {
2082     if (1 & bitset)
2083       regs[count++] = reg;
2084     bitset >>= 1;
2085   }
2086   regs[count++] = zr->encoding_nocheck();
2087   count &= ~1;  // Only push an even number of regs
2088 
2089   if (count) {
2090     stp(as_Register(regs[0]), as_Register(regs[1]),
2091        Address(pre(stack, -count * wordSize)));
2092     words_pushed += 2;
2093   }
2094   for (int i = 2; i < count; i += 2) {
2095     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2096        Address(stack, i * wordSize));
2097     words_pushed += 2;
2098   }
2099 
2100   assert(words_pushed == count, "oops, pushed != count");
2101 
2102   return count;
2103 }
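
     // For example (illustrative), push(0b100100, sp) pushes r2 and r5 with
     //   stp r2, r5, [sp, #-16]!
     // while an odd-sized set is padded with zr so the stack pointer stays
     // 16-byte aligned.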
2104 
2105 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2106   int words_pushed = 0;
2107 
2108   // Scan bitset to accumulate register pairs
2109   unsigned char regs[32];
2110   int count = 0;
2111   for (int reg = 0; reg <= 30; reg++) {
2112     if (1 & bitset)
2113       regs[count++] = reg;
2114     bitset >>= 1;
2115   }
2116   regs[count++] = zr->encoding_nocheck();
2117   count &= ~1;
2118 
2119   for (int i = 2; i < count; i += 2) {
2120     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2121        Address(stack, i * wordSize));
2122     words_pushed += 2;
2123   }
2124   if (count) {
2125     ldp(as_Register(regs[0]), as_Register(regs[1]),
2126        Address(post(stack, count * wordSize)));
2127     words_pushed += 2;
2128   }
2129 
2130   assert(words_pushed == count, "oops, pushed != count");
2131 
2132   return count;
2133 }
2134 #ifdef ASSERT
2135 void MacroAssembler::verify_heapbase(const char* msg) {
2136 #if 0
2137   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2138   assert (Universe::heap() != NULL, "java heap should be initialized");
2139   if (CheckCompressedOops) {
2140     Label ok;
2141     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2142     cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2143     br(Assembler::EQ, ok);
2144     stop(msg);
2145     bind(ok);
2146     pop(1 << rscratch1->encoding(), sp);
2147   }
2148 #endif
2149 }
2150 #endif
2151 
2152 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2153   Label done, not_weak;
2154   cbz(value, done);           // Use NULL as-is.
2155 
2156   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
2157   tbz(value, 0, not_weak); // Test for jweak tag.
2158 
2159   // Resolve jweak.
2160   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2161                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2162   verify_oop(value);
2163   b(done);
2164 
2165   bind(not_weak);
2166   // Resolve (untagged) jobject.
2167   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2168   verify_oop(value);
2169   bind(done);
2170 }
2171 
2172 void MacroAssembler::stop(const char* msg) {
2173   address ip = pc();
2174   pusha();
2175   mov(c_rarg0, (address)msg);
2176   mov(c_rarg1, (address)ip);
2177   mov(c_rarg2, sp);
2178   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2179   // call(c_rarg3);
2180   blrt(c_rarg3, 3, 0, 1);
2181   hlt(0);
2182 }
2183 
2184 void MacroAssembler::warn(const char* msg) {
2185   pusha();
2186   mov(c_rarg0, (address)msg);
2187   mov(lr, CAST_FROM_FN_PTR(address, warning));
2188   blrt(lr, 1, 0, MacroAssembler::ret_type_void);
2189   popa();
2190 }
2191 
2192 void MacroAssembler::unimplemented(const char* what) {
2193   const char* buf = NULL;
2194   {
2195     ResourceMark rm;
2196     stringStream ss;
2197     ss.print("unimplemented: %s", what);
2198     buf = code_string(ss.as_string());
2199   }
2200   stop(buf);
2201 }
2202 
2203 // If a constant does not fit in an immediate field, generate some
2204 // number of MOV instructions and then perform the operation.
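     //
     // For example (illustrative), add(Rd, Rn, 0x123456) cannot be encoded
     // as one add immediate, so it is split below into two 12-bit chunks:
     //   add Rd, Rn, #0x123000
     //   add Rd, Rd, #0x456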
2205 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2206                                            add_sub_imm_insn insn1,
2207                                            add_sub_reg_insn insn2) {
2208   assert(Rd != zr, "Rd = zr and not setting flags?");
2209   if (operand_valid_for_add_sub_immediate((int)imm)) {
2210     (this->*insn1)(Rd, Rn, imm);
2211   } else {
2212     if (uabs(imm) < (1 << 24)) {
2213        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2214        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2215     } else {
2216        assert_different_registers(Rd, Rn);
2217        mov(Rd, (uint64_t)imm);
2218        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2219     }
2220   }
2221 }
2222 
2223 // Separate version which sets the flags. Optimisations are more restricted
2224 // because we must set the flags correctly.
2225 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2226                                            add_sub_imm_insn insn1,
2227                                            add_sub_reg_insn insn2) {
2228   if (operand_valid_for_add_sub_immediate((int)imm)) {
2229     (this->*insn1)(Rd, Rn, imm);
2230   } else {
2231     assert_different_registers(Rd, Rn);
2232     assert(Rd != zr, "overflow in immediate operand");
2233     mov(Rd, (uint64_t)imm);
2234     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2235   }
2236 }
2237 
2238 
2239 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2240   if (increment.is_register()) {
2241     add(Rd, Rn, increment.as_register());
2242   } else {
2243     add(Rd, Rn, increment.as_constant());
2244   }
2245 }
2246 
2247 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2248   if (increment.is_register()) {
2249     addw(Rd, Rn, increment.as_register());
2250   } else {
2251     addw(Rd, Rn, increment.as_constant());
2252   }
2253 }
2254 
2255 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2256   if (decrement.is_register()) {
2257     sub(Rd, Rn, decrement.as_register());
2258   } else {
2259     sub(Rd, Rn, decrement.as_constant());
2260   }
2261 }
2262 
2263 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2264   if (decrement.is_register()) {
2265     subw(Rd, Rn, decrement.as_register());
2266   } else {
2267     subw(Rd, Rn, decrement.as_constant());
2268   }
2269 }
2270 
2271 void MacroAssembler::reinit_heapbase()
2272 {
2273   if (UseCompressedOops) {
2274     if (Universe::is_fully_initialized()) {
2275       mov(rheapbase, Universe::narrow_ptrs_base());
2276     } else {
2277       lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2278       ldr(rheapbase, Address(rheapbase));
2279     }
2280   }
2281 }
2282 
2283 // This simulates the behaviour of the x86 cmpxchg instruction using a
2284 // load linked/store conditional pair.  We use the acquire/release
2285 // versions of these instructions so that we flush pending writes as
2286 // per Java semantics.
2287 
2288 // N.B. the x86 version assumes the old value to be compared against is
2289 // in rax and updates rax with the value located in memory if the
2290 // cmpxchg fails.  We supply a register for the old value explicitly.
2291 
2292 // The AArch64 load linked/store conditional instructions do not
2293 // accept an offset, so, unlike x86, we must provide a plain register
2294 // to identify the memory word to be compared/exchanged rather than a
2295 // register+offset Address.
2296 
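     // A typical use (illustrative sketch): with oldv holding the expected
     // value and newv its replacement,
     //   cmpxchgptr(oldv, newv, addr, tmp, succeed, &fail);
     // branches to succeed if the exchange happened and to fail otherwise,
     // leaving the value actually observed in memory in oldv on failure.
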
2297 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2298                                 Label &succeed, Label *fail) {
2299   // oldv holds comparison value
2300   // newv holds value to write in exchange
2301   // addr identifies memory word to compare against/update
2302   if (UseLSE) {
2303     mov(tmp, oldv);
2304     casal(Assembler::xword, oldv, newv, addr);
2305     cmp(tmp, oldv);
2306     br(Assembler::EQ, succeed);
2307     membar(AnyAny);
2308   } else {
2309     Label retry_load, nope;
2310     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2311       prfm(Address(addr), PSTL1STRM);
2312     bind(retry_load);
2313     // flush and load exclusive from the memory location
2314     // and fail if it is not what we expect
2315     ldaxr(tmp, addr);
2316     cmp(tmp, oldv);
2317     br(Assembler::NE, nope);
2318     // if we store+flush with no intervening write tmp will be zero
2319     stlxr(tmp, newv, addr);
2320     cbzw(tmp, succeed);
2321     // retry so we only ever return after a load fails to compare
2322     // ensures we don't return a stale value after a failed write.
2323     b(retry_load);
2324     // if the memory word differs we return it in oldv and signal a fail
2325     bind(nope);
2326     membar(AnyAny);
2327     mov(oldv, tmp);
2328   }
2329   if (fail)
2330     b(*fail);
2331 }
2332 
2333 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2334                                         Label &succeed, Label *fail) {
2335   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2336   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2337 }
2338 
2339 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2340                                 Label &succeed, Label *fail) {
2341   // oldv holds comparison value
2342   // newv holds value to write in exchange
2343   // addr identifies memory word to compare against/update
2344   // tmp returns 0/1 for success/failure
2345   if (UseLSE) {
2346     mov(tmp, oldv);
2347     casal(Assembler::word, oldv, newv, addr);
2348     cmp(tmp, oldv);
2349     br(Assembler::EQ, succeed);
2350     membar(AnyAny);
2351   } else {
2352     Label retry_load, nope;
2353     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2354       prfm(Address(addr), PSTL1STRM);
2355     bind(retry_load);
2356     // flush and load exclusive from the memory location
2357     // and fail if it is not what we expect
2358     ldaxrw(tmp, addr);
2359     cmp(tmp, oldv);
2360     br(Assembler::NE, nope);
2361     // if we store+flush with no intervening write tmp will be zero
2362     stlxrw(tmp, newv, addr);
2363     cbzw(tmp, succeed);
2364     // retry so we only ever return after a load fails to compare
2365     // ensures we don't return a stale value after a failed write.
2366     b(retry_load);
2367     // if the memory word differs we return it in oldv and signal a fail
2368     bind(nope);
2369     membar(AnyAny);
2370     mov(oldv, tmp);
2371   }
2372   if (fail)
2373     b(*fail);
2374 }
2375 
2376 // A generic CAS; success or failure is in the EQ flag.  A weak CAS
2377 // doesn't retry and may fail spuriously.  If the oldval is wanted,
2378 // pass a register for the result; otherwise pass noreg.
2379 
2380 // Clobbers rscratch1
2381 void MacroAssembler::cmpxchg(Register addr, Register expected,
2382                              Register new_val,
2383                              enum operand_size size,
2384                              bool acquire, bool release,
2385                              bool weak,
2386                              Register result) {
2387   if (result == noreg)  result = rscratch1;
2388   BLOCK_COMMENT("cmpxchg {");
2389   if (UseLSE) {
2390     mov(result, expected);
2391     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2392     compare_eq(result, expected, size);
2393   } else {
2394     Label retry_load, done;
2395     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2396       prfm(Address(addr), PSTL1STRM);
2397     bind(retry_load);
2398     load_exclusive(result, addr, size, acquire);
2399     compare_eq(result, expected, size);
2400     br(Assembler::NE, done);
2401     store_exclusive(rscratch1, new_val, addr, size, release);
2402     if (weak) {
2403       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2404     } else {
2405       cbnzw(rscratch1, retry_load);
2406     }
2407     bind(done);
2408   }
2409   BLOCK_COMMENT("} cmpxchg");
2410 }
2411 
2412 // A generic comparison; compares only for equality, clobbering rscratch1 for sub-word sizes.
2413 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2414   if (size == xword) {
2415     cmp(rm, rn);
2416   } else if (size == word) {
2417     cmpw(rm, rn);
2418   } else if (size == halfword) {
2419     eorw(rscratch1, rm, rn);
2420     ands(zr, rscratch1, 0xffff);
2421   } else if (size == byte) {
2422     eorw(rscratch1, rm, rn);
2423     ands(zr, rscratch1, 0xff);
2424   } else {
2425     ShouldNotReachHere();
2426   }
2427 }
2428 
2429 
2430 static bool different(Register a, RegisterOrConstant b, Register c) {
2431   if (b.is_constant())
2432     return a != c;
2433   else
2434     return a != b.as_register() && a != c && b.as_register() != c;
2435 }
2436 
2437 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2438 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2439   if (UseLSE) {                                                         \
2440     prev = prev->is_valid() ? prev : zr;                                \
2441     if (incr.is_register()) {                                           \
2442       AOP(sz, incr.as_register(), prev, addr);                          \
2443     } else {                                                            \
2444       mov(rscratch2, incr.as_constant());                               \
2445       AOP(sz, rscratch2, prev, addr);                                   \
2446     }                                                                   \
2447     return;                                                             \
2448   }                                                                     \
2449   Register result = rscratch2;                                          \
2450   if (prev->is_valid())                                                 \
2451     result = different(prev, incr, addr) ? prev : rscratch2;            \
2452                                                                         \
2453   Label retry_load;                                                     \
2454   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2455     prfm(Address(addr), PSTL1STRM);                                     \
2456   bind(retry_load);                                                     \
2457   LDXR(result, addr);                                                   \
2458   OP(rscratch1, result, incr);                                          \
2459   STXR(rscratch2, rscratch1, addr);                                     \
2460   cbnzw(rscratch2, retry_load);                                         \
2461   if (prev->is_valid() && prev != result) {                             \
2462     IOP(prev, rscratch1, incr);                                         \
2463   }                                                                     \
2464 }
2465 
2466 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2467 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2468 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2469 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
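
     // For example (illustrative), the ATOMIC_OP(add, ...) instantiation
     // above defines MacroAssembler::atomic_add(prev, incr, addr), which
     // issues a single LSE ldadd when UseLSE is set and otherwise falls back
     // to the ldxr/add/stxr retry loop.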
2470 
2471 #undef ATOMIC_OP
2472 
2473 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2474 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2475   if (UseLSE) {                                                         \
2476     prev = prev->is_valid() ? prev : zr;                                \
2477     AOP(sz, newv, prev, addr);                                          \
2478     return;                                                             \
2479   }                                                                     \
2480   Register result = rscratch2;                                          \
2481   if (prev->is_valid())                                                 \
2482     result = different(prev, newv, addr) ? prev : rscratch2;            \
2483                                                                         \
2484   Label retry_load;                                                     \
2485   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2486     prfm(Address(addr), PSTL1STRM);                                     \
2487   bind(retry_load);                                                     \
2488   LDXR(result, addr);                                                   \
2489   STXR(rscratch1, newv, addr);                                          \
2490   cbnzw(rscratch1, retry_load);                                         \
2491   if (prev->is_valid() && prev != result)                               \
2492     mov(prev, result);                                                  \
2493 }
2494 
2495 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2496 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2497 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2498 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2499 
2500 #undef ATOMIC_XCHG
2501 
2502 #ifndef PRODUCT
2503 extern "C" void findpc(intptr_t x);
2504 #endif
2505 
2506 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2507 {
2508   // In order to get locks to work, we need to fake an in_VM state
2509   if (ShowMessageBoxOnError) {
2510     JavaThread* thread = JavaThread::current();
2511     JavaThreadState saved_state = thread->thread_state();
2512     thread->set_thread_state(_thread_in_vm);
2513 #ifndef PRODUCT
2514     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2515       ttyLocker ttyl;
2516       BytecodeCounter::print();
2517     }
2518 #endif
2519     if (os::message_box(msg, "Execution stopped, print registers?")) {
2520       ttyLocker ttyl;
2521       tty->print_cr(" pc = 0x%016lx", pc);
2522 #ifndef PRODUCT
2523       tty->cr();
2524       findpc(pc);
2525       tty->cr();
2526 #endif
2527       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2528       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2529       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2530       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2531       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2532       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2533       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2534       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2535       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2536       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2537       tty->print_cr("r10 = 0x%016lx", regs[10]);
2538       tty->print_cr("r11 = 0x%016lx", regs[11]);
2539       tty->print_cr("r12 = 0x%016lx", regs[12]);
2540       tty->print_cr("r13 = 0x%016lx", regs[13]);
2541       tty->print_cr("r14 = 0x%016lx", regs[14]);
2542       tty->print_cr("r15 = 0x%016lx", regs[15]);
2543       tty->print_cr("r16 = 0x%016lx", regs[16]);
2544       tty->print_cr("r17 = 0x%016lx", regs[17]);
2545       tty->print_cr("r18 = 0x%016lx", regs[18]);
2546       tty->print_cr("r19 = 0x%016lx", regs[19]);
2547       tty->print_cr("r20 = 0x%016lx", regs[20]);
2548       tty->print_cr("r21 = 0x%016lx", regs[21]);
2549       tty->print_cr("r22 = 0x%016lx", regs[22]);
2550       tty->print_cr("r23 = 0x%016lx", regs[23]);
2551       tty->print_cr("r24 = 0x%016lx", regs[24]);
2552       tty->print_cr("r25 = 0x%016lx", regs[25]);
2553       tty->print_cr("r26 = 0x%016lx", regs[26]);
2554       tty->print_cr("r27 = 0x%016lx", regs[27]);
2555       tty->print_cr("r28 = 0x%016lx", regs[28]);
2556       tty->print_cr("r30 = 0x%016lx", regs[30]);
2557       tty->print_cr("r31 = 0x%016lx", regs[31]);
2558       BREAKPOINT;
2559     }
2560     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2561   } else {
2562     ttyLocker ttyl;
2563     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2564                     msg);
2565     assert(false, "DEBUG MESSAGE: %s", msg);
2566   }
2567 }
2568 
2569 #ifdef BUILTIN_SIM
2570 // routine to generate an x86 prolog for a stub function which
2571 // bootstraps into the generated ARM code which directly follows the
2572 // stub
2573 //
2574 // the argument encodes the number of general and fp registers
2575 // passed by the caller and the calling convention (currently just
2576 // the number of general registers and assumes C argument passing)
2577 
2578 extern "C" {
2579 int aarch64_stub_prolog_size();
2580 void aarch64_stub_prolog();
2581 void aarch64_prolog();
2582 }
2583 
2584 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
2585                                    address *prolog_ptr)
2586 {
2587   int calltype = (((ret_type & 0x3) << 8) |
2588                   ((fp_arg_count & 0xf) << 4) |
2589                   (gp_arg_count & 0xf));
2590 
2591   // the addresses for the x86 to ARM entry code we need to use
2592   address start = pc();
2593   // printf("start = %lx\n", start);
2594   int byteCount =  aarch64_stub_prolog_size();
2595   // printf("byteCount = %x\n", byteCount);
2596   int instructionCount = (byteCount + 3)/ 4;
2597   // printf("instructionCount = %x\n", instructionCount);
2598   for (int i = 0; i < instructionCount; i++) {
2599     nop();
2600   }
2601 
2602   memcpy(start, (void*)aarch64_stub_prolog, byteCount);
2603 
2604   // write the address of the setup routine and the call format at the
2605   // end of the copied code
2606   u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
2607   if (prolog_ptr)
2608     patch_end[-2] = (u_int64_t)prolog_ptr;
2609   patch_end[-1] = calltype;
2610 }
2611 #endif
2612 
2613 void MacroAssembler::push_call_clobbered_registers() {
2614   int step = 4 * wordSize;
2615   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2616   sub(sp, sp, step);
2617   mov(rscratch1, -step);
2618   // Push v0-v7, v16-v31.
2619   for (int i = 31; i>= 4; i -= 4) {
2620     if (i <= v7->encoding() || i >= v16->encoding())
2621       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2622           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2623   }
2624   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2625       as_FloatRegister(3), T1D, Address(sp));
2626 }
2627 
2628 void MacroAssembler::pop_call_clobbered_registers() {
2629   for (int i = 0; i < 32; i += 4) {
2630     if (i <= v7->encoding() || i >= v16->encoding())
2631       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2632           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2633   }
2634 
2635   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2636 }
2637 
2638 void MacroAssembler::push_CPU_state(bool save_vectors) {
2639   int step = (save_vectors ? 8 : 4) * wordSize;
2640   push(0x3fffffff, sp);         // integer registers except lr & sp
2641   mov(rscratch1, -step);
2642   sub(sp, sp, step);
2643   for (int i = 28; i >= 4; i -= 4) {
2644     st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2645         as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2646   }
2647   st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2648 }
2649 
2650 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2651   int step = (restore_vectors ? 8 : 4) * wordSize;
2652   for (int i = 0; i <= 28; i += 4)
2653     ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2654         as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2655   pop(0x3fffffff, sp);         // integer registers except lr & sp
2656 }
2657 
2658 /**
2659  * Helpers for multiply_to_len().
2660  */
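     // A sketch of the arithmetic (treating dest_hi:dest_lo as a 128-bit
     // value):
     //   (final_dest_hi:dest_lo) = (dest_hi:dest_lo) + src1 + src2
     // where each 64-bit carry is propagated with adc against zr.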
2661 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2662                                      Register src1, Register src2) {
2663   adds(dest_lo, dest_lo, src1);
2664   adc(dest_hi, dest_hi, zr);
2665   adds(dest_lo, dest_lo, src2);
2666   adc(final_dest_hi, dest_hi, zr);
2667 }
2668 
2669 // Generate an address from (r + r1 extend offset).  "size" is the
2670 // size of the operand.  The result may be in rscratch2.
2671 Address MacroAssembler::offsetted_address(Register r, Register r1,
2672                                           Address::extend ext, int offset, int size) {
2673   if (offset || (ext.shift() % size != 0)) {
2674     lea(rscratch2, Address(r, r1, ext));
2675     return Address(rscratch2, offset);
2676   } else {
2677     return Address(r, r1, ext);
2678   }
2679 }
2680 
2681 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2682 {
2683   assert(offset >= 0, "spill to negative address?");
2684   // Offset reachable ?
2685   //   Not aligned - 9 bits signed offset
2686   //   Aligned - 12 bits unsigned offset shifted
2687   Register base = sp;
2688   if ((offset & (size-1)) && offset >= (1<<8)) {
2689     add(tmp, base, offset & ((1<<12)-1));
2690     base = tmp;
2691     offset &= -1<<12;
2692   }
2693 
2694   if (offset >= (1<<12) * size) {
2695     add(tmp, base, offset & (((1<<12)-1)<<12));
2696     base = tmp;
2697     offset &= ~(((1<<12)-1)<<12);
2698   }
2699 
2700   return Address(base, offset);
2701 }
2702 
2703 // Checks whether offset is aligned.
2704 // Returns true if it is, else false.
2705 bool MacroAssembler::merge_alignment_check(Register base,
2706                                            size_t size,
2707                                            long cur_offset,
2708                                            long prev_offset) const {
2709   if (AvoidUnalignedAccesses) {
2710     if (base == sp) {
2711       // Checks whether the lower offset is aligned for a register pair.
2712       long pair_mask = size * 2 - 1;
2713       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2714       return (offset & pair_mask) == 0;
2715     } else { // If base is not sp, we can't guarantee the access is aligned.
2716       return false;
2717     }
2718   } else {
2719     long mask = size - 1;
2720     // Load/store pair instruction only supports element size aligned offset.
2721     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2722   }
2723 }
2724 
2725 // Checks whether current and previous loads/stores can be merged.
2726 // Returns true if it can be merged, else false.
2727 bool MacroAssembler::ldst_can_merge(Register rt,
2728                                     const Address &adr,
2729                                     size_t cur_size_in_bytes,
2730                                     bool is_store) const {
2731   address prev = pc() - NativeInstruction::instruction_size;
2732   address last = code()->last_insn();
2733 
2734   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2735     return false;
2736   }
2737 
2738   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2739     return false;
2740   }
2741 
2742   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2743   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2744 
2745   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2746   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2747 
2748   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2749     return false;
2750   }
2751 
2752   long max_offset = 63 * prev_size_in_bytes;
2753   long min_offset = -64 * prev_size_in_bytes;
2754 
2755   assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index is not supported to be merged.");
2756 
2757   // Only same base can be merged.
2758   if (adr.base() != prev_ldst->base()) {
2759     return false;
2760   }
2761 
2762   long cur_offset = adr.offset();
2763   long prev_offset = prev_ldst->offset();
2764   size_t diff = abs(cur_offset - prev_offset);
2765   if (diff != prev_size_in_bytes) {
2766     return false;
2767   }
2768 
  // The following cases cannot be merged:
2770   // ldr x2, [x2, #8]
2771   // ldr x3, [x2, #16]
2772   // or:
2773   // ldr x2, [x3, #8]
2774   // ldr x2, [x3, #16]
  // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2776   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2777     return false;
2778   }
2779 
2780   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2781   // Offset range must be in ldp/stp instruction's range.
2782   if (low_offset > max_offset || low_offset < min_offset) {
2783     return false;
2784   }
2785 
2786   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2787     return true;
2788   }
2789 
2790   return false;
2791 }
2792 
2793 // Merge current load/store with previous load/store into ldp/stp.
2794 void MacroAssembler::merge_ldst(Register rt,
2795                                 const Address &adr,
2796                                 size_t cur_size_in_bytes,
2797                                 bool is_store) {
2798 
  assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "cur and prev must be able to be merged.");
2800 
2801   Register rt_low, rt_high;
2802   address prev = pc() - NativeInstruction::instruction_size;
2803   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2804 
2805   long offset;
2806 
2807   if (adr.offset() < prev_ldst->offset()) {
2808     offset = adr.offset();
2809     rt_low = rt;
2810     rt_high = prev_ldst->target();
2811   } else {
2812     offset = prev_ldst->offset();
2813     rt_low = prev_ldst->target();
2814     rt_high = rt;
2815   }
2816 
2817   Address adr_p = Address(prev_ldst->base(), offset);
  // Rewind and overwrite the previously generated instruction.
2819   code_section()->set_end(prev);
2820 
2821   const int sz = prev_ldst->size_in_bytes();
2822   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
2823   if (!is_store) {
2824     BLOCK_COMMENT("merged ldr pair");
2825     if (sz == 8) {
2826       ldp(rt_low, rt_high, adr_p);
2827     } else {
2828       ldpw(rt_low, rt_high, adr_p);
2829     }
2830   } else {
2831     BLOCK_COMMENT("merged str pair");
2832     if (sz == 8) {
2833       stp(rt_low, rt_high, adr_p);
2834     } else {
2835       stpw(rt_low, rt_high, adr_p);
2836     }
2837   }
2838 }
2839 
2840 /**
2841  * Multiply 64 bit by 64 bit first loop.
2842  */
2843 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2844                                            Register y, Register y_idx, Register z,
2845                                            Register carry, Register product,
2846                                            Register idx, Register kdx) {
2847   //
2848   //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2850   //    huge_128 product = y[idx] * x[xstart] + carry;
2851   //    z[kdx] = (jlong)product;
2852   //    carry  = (jlong)(product >>> 64);
2853   //  }
2854   //  z[xstart] = carry;
2855   //
2856 
2857   Label L_first_loop, L_first_loop_exit;
2858   Label L_one_x, L_one_y, L_multiply;
2859 
2860   subsw(xstart, xstart, 1);
2861   br(Assembler::MI, L_one_x);
2862 
2863   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2864   ldr(x_xstart, Address(rscratch1));
2865   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2866 
2867   bind(L_first_loop);
2868   subsw(idx, idx, 1);
2869   br(Assembler::MI, L_first_loop_exit);
2870   subsw(idx, idx, 1);
2871   br(Assembler::MI, L_one_y);
2872   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2873   ldr(y_idx, Address(rscratch1));
2874   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2875   bind(L_multiply);
2876 
2877   // AArch64 has a multiply-accumulate instruction that we can't use
2878   // here because it has no way to process carries, so we have to use
2879   // separate add and adc instructions.  Bah.
2880   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2881   mul(product, x_xstart, y_idx);
2882   adds(product, product, carry);
2883   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2884 
2885   subw(kdx, kdx, 2);
2886   ror(product, product, 32); // back to big-endian
2887   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2888 
2889   b(L_first_loop);
2890 
2891   bind(L_one_y);
2892   ldrw(y_idx, Address(y,  0));
2893   b(L_multiply);
2894 
2895   bind(L_one_x);
2896   ldrw(x_xstart, Address(x,  0));
2897   b(L_first_loop);
2898 
2899   bind(L_first_loop_exit);
2900 }
2901 
2902 /**
 * Multiply 128-bit by 128-bit. Unrolled inner loop.
2905  */
2906 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2907                                              Register carry, Register carry2,
2908                                              Register idx, Register jdx,
2909                                              Register yz_idx1, Register yz_idx2,
2910                                              Register tmp, Register tmp3, Register tmp4,
2911                                              Register tmp6, Register product_hi) {
2912 
2913   //   jlong carry, x[], y[], z[];
2914   //   int kdx = ystart+1;
2915   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2916   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2917   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2918   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2919   //     carry  = (jlong)(tmp4 >>> 64);
2920   //     z[kdx+idx+1] = (jlong)tmp3;
2921   //     z[kdx+idx] = (jlong)tmp4;
2922   //   }
2923   //   idx += 2;
2924   //   if (idx > 0) {
2925   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2926   //     z[kdx+idx] = (jlong)yz_idx1;
2927   //     carry  = (jlong)(yz_idx1 >>> 64);
2928   //   }
2929   //
2930 
2931   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2932 
2933   lsrw(jdx, idx, 2);
2934 
2935   bind(L_third_loop);
2936 
2937   subsw(jdx, jdx, 1);
2938   br(Assembler::MI, L_third_loop_exit);
2939   subw(idx, idx, 4);
2940 
2941   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2942 
2943   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2944 
2945   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2946 
2947   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2948   ror(yz_idx2, yz_idx2, 32);
2949 
2950   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2951 
2952   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2953   umulh(tmp4, product_hi, yz_idx1);
2954 
2955   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2956   ror(rscratch2, rscratch2, 32);
2957 
2958   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2959   umulh(carry2, product_hi, yz_idx2);
2960 
2961   // propagate sum of both multiplications into carry:tmp4:tmp3
2962   adds(tmp3, tmp3, carry);
2963   adc(tmp4, tmp4, zr);
2964   adds(tmp3, tmp3, rscratch1);
2965   adcs(tmp4, tmp4, tmp);
2966   adc(carry, carry2, zr);
2967   adds(tmp4, tmp4, rscratch2);
2968   adc(carry, carry, zr);
2969 
2970   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2971   ror(tmp4, tmp4, 32);
2972   stp(tmp4, tmp3, Address(tmp6, 0));
2973 
2974   b(L_third_loop);
2975   bind (L_third_loop_exit);
2976 
2977   andw (idx, idx, 0x3);
2978   cbz(idx, L_post_third_loop_done);
2979 
2980   Label L_check_1;
2981   subsw(idx, idx, 2);
2982   br(Assembler::MI, L_check_1);
2983 
2984   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2985   ldr(yz_idx1, Address(rscratch1, 0));
2986   ror(yz_idx1, yz_idx1, 32);
2987   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2988   umulh(tmp4, product_hi, yz_idx1);
2989   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2990   ldr(yz_idx2, Address(rscratch1, 0));
2991   ror(yz_idx2, yz_idx2, 32);
2992 
2993   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2994 
2995   ror(tmp3, tmp3, 32);
2996   str(tmp3, Address(rscratch1, 0));
2997 
2998   bind (L_check_1);
2999 
3000   andw (idx, idx, 0x1);
3001   subsw(idx, idx, 1);
3002   br(Assembler::MI, L_post_third_loop_done);
3003   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
3004   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
3005   umulh(carry2, tmp4, product_hi);
3006   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3007 
3008   add2_with_carry(carry2, tmp3, tmp4, carry);
3009 
3010   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
3011   extr(carry, carry2, tmp3, 32);
3012 
3013   bind(L_post_third_loop_done);
3014 }
3015 
3016 /**
 * Code for BigInteger::multiplyToLen() intrinsic.
3018  *
3019  * r0: x
3020  * r1: xlen
3021  * r2: y
3022  * r3: ylen
 * r4: z
3024  * r5: zlen
3025  * r10: tmp1
3026  * r11: tmp2
3027  * r12: tmp3
3028  * r13: tmp4
3029  * r14: tmp5
3030  * r15: tmp6
3031  * r16: tmp7
3032  *
3033  */
3034 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3035                                      Register z, Register zlen,
3036                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3037                                      Register tmp5, Register tmp6, Register product_hi) {
3038 
3039   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3040 
3041   const Register idx = tmp1;
3042   const Register kdx = tmp2;
3043   const Register xstart = tmp3;
3044 
3045   const Register y_idx = tmp4;
3046   const Register carry = tmp5;
3047   const Register product  = xlen;
3048   const Register x_xstart = zlen;  // reuse register
3049 
3050   // First Loop.
3051   //
3052   //  final static long LONG_MASK = 0xffffffffL;
3053   //  int xstart = xlen - 1;
3054   //  int ystart = ylen - 1;
3055   //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3057   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3058   //    z[kdx] = (int)product;
3059   //    carry = product >>> 32;
3060   //  }
3061   //  z[xstart] = (int)carry;
3062   //
3063 
3064   movw(idx, ylen);      // idx = ylen;
3065   movw(kdx, zlen);      // kdx = xlen+ylen;
3066   mov(carry, zr);       // carry = 0;
3067 
3068   Label L_done;
3069 
3070   movw(xstart, xlen);
3071   subsw(xstart, xstart, 1);
3072   br(Assembler::MI, L_done);
3073 
3074   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3075 
3076   Label L_second_loop;
3077   cbzw(kdx, L_second_loop);
3078 
3079   Label L_carry;
3080   subw(kdx, kdx, 1);
3081   cbzw(kdx, L_carry);
3082 
3083   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3084   lsr(carry, carry, 32);
3085   subw(kdx, kdx, 1);
3086 
3087   bind(L_carry);
3088   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3089 
3090   // Second and third (nested) loops.
3091   //
3092   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3093   //   carry = 0;
3094   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3095   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3096   //                    (z[k] & LONG_MASK) + carry;
3097   //     z[k] = (int)product;
3098   //     carry = product >>> 32;
3099   //   }
3100   //   z[i] = (int)carry;
3101   // }
3102   //
3103   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3104 
3105   const Register jdx = tmp1;
3106 
3107   bind(L_second_loop);
3108   mov(carry, zr);                // carry = 0;
3109   movw(jdx, ylen);               // j = ystart+1
3110 
3111   subsw(xstart, xstart, 1);      // i = xstart-1;
3112   br(Assembler::MI, L_done);
3113 
3114   str(z, Address(pre(sp, -4 * wordSize)));
3115 
3116   Label L_last_x;
3117   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3118   subsw(xstart, xstart, 1);       // i = xstart-1;
3119   br(Assembler::MI, L_last_x);
3120 
3121   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3122   ldr(product_hi, Address(rscratch1));
3123   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3124 
3125   Label L_third_loop_prologue;
3126   bind(L_third_loop_prologue);
3127 
3128   str(ylen, Address(sp, wordSize));
3129   stp(x, xstart, Address(sp, 2 * wordSize));
3130   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3131                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3132   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3133   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3134 
3135   addw(tmp3, xlen, 1);
3136   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3137   subsw(tmp3, tmp3, 1);
3138   br(Assembler::MI, L_done);
3139 
3140   lsr(carry, carry, 32);
3141   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3142   b(L_second_loop);
3143 
  // The following infrequently executed code is moved out of the loops.
3145   bind(L_last_x);
3146   ldrw(product_hi, Address(x,  0));
3147   b(L_third_loop_prologue);
3148 
3149   bind(L_done);
3150 }
3151 
// Code for BigInteger::mulAdd intrinsic
3153 // out     = r0
3154 // in      = r1
3155 // offset  = r2  (already out.length-offset)
3156 // len     = r3
3157 // k       = r4
3158 //
// Pseudocode from the Java implementation:
3160 // carry = 0;
3161 // offset = out.length-offset - 1;
3162 // for (int j=len-1; j >= 0; j--) {
3163 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3164 //     out[offset--] = (int)product;
3165 //     carry = product >>> 32;
3166 // }
3167 // return (int)carry;
3168 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3169       Register len, Register k) {
3170     Label LOOP, END;
3171     // pre-loop
    cmp(len, zr); // cmp, not cbz/cbnz: we use the condition twice, so fewer branches
3173     csel(out, zr, out, Assembler::EQ);
3174     br(Assembler::EQ, END);
3175     add(in, in, len, LSL, 2); // in[j+1] address
3176     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3177     mov(out, zr); // used to keep carry now
3178     BIND(LOOP);
3179     ldrw(rscratch1, Address(pre(in, -4)));
3180     madd(rscratch1, rscratch1, k, out);
3181     ldrw(rscratch2, Address(pre(offset, -4)));
3182     add(rscratch1, rscratch1, rscratch2);
3183     strw(rscratch1, Address(offset));
3184     lsr(out, rscratch1, 32);
3185     subs(len, len, 1);
3186     br(Assembler::NE, LOOP);
3187     BIND(END);
3188 }
3189 
3190 /**
3191  * Emits code to update CRC-32 with a byte value according to constants in table
3192  *
3193  * @param [in,out]crc   Register containing the crc.
3194  * @param [in]val       Register containing the byte to fold into the CRC.
3195  * @param [in]table     Register containing the table of crc constants.
3196  *
3197  * uint32_t crc;
3198  * val = crc_table[(val ^ crc) & 0xFF];
3199  * crc = val ^ (crc >> 8);
3200  *
3201  */
3202 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3203   eor(val, val, crc);
3204   andr(val, val, 0xff);
3205   ldrw(val, Address(table, val, Address::lsl(2)));
3206   eor(crc, val, crc, Assembler::LSR, 8);
3207 }
3208 
3209 /**
3210  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3211  *
3212  * @param [in,out]crc   Register containing the crc.
 * @param [in]v         Register containing the 32-bit value to fold into the CRC.
3214  * @param [in]table0    Register containing table 0 of crc constants.
3215  * @param [in]table1    Register containing table 1 of crc constants.
3216  * @param [in]table2    Register containing table 2 of crc constants.
3217  * @param [in]table3    Register containing table 3 of crc constants.
3218  *
3219  * uint32_t crc;
3220  *   v = crc ^ v
3221  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3222  *
3223  */
3224 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3225         Register table0, Register table1, Register table2, Register table3,
3226         bool upper) {
3227   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3228   uxtb(tmp, v);
3229   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3230   ubfx(tmp, v, 8, 8);
3231   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3232   eor(crc, crc, tmp);
3233   ubfx(tmp, v, 16, 8);
3234   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3235   eor(crc, crc, tmp);
3236   ubfx(tmp, v, 24, 8);
3237   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3238   eor(crc, crc, tmp);
3239 }
3240 
3241 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3242         Register len, Register tmp0, Register tmp1, Register tmp2,
3243         Register tmp3) {
3244     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3245     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3246 
3247     mvnw(crc, crc);
3248 
3249     subs(len, len, 128);
3250     br(Assembler::GE, CRC_by64_pre);
3251   BIND(CRC_less64);
3252     adds(len, len, 128-32);
3253     br(Assembler::GE, CRC_by32_loop);
3254   BIND(CRC_less32);
3255     adds(len, len, 32-4);
3256     br(Assembler::GE, CRC_by4_loop);
3257     adds(len, len, 4);
3258     br(Assembler::GT, CRC_by1_loop);
3259     b(L_exit);
3260 
3261   BIND(CRC_by32_loop);
3262     ldp(tmp0, tmp1, Address(post(buf, 16)));
3263     subs(len, len, 32);
3264     crc32x(crc, crc, tmp0);
3265     ldr(tmp2, Address(post(buf, 8)));
3266     crc32x(crc, crc, tmp1);
3267     ldr(tmp3, Address(post(buf, 8)));
3268     crc32x(crc, crc, tmp2);
3269     crc32x(crc, crc, tmp3);
3270     br(Assembler::GE, CRC_by32_loop);
3271     cmn(len, 32);
3272     br(Assembler::NE, CRC_less32);
3273     b(L_exit);
3274 
3275   BIND(CRC_by4_loop);
3276     ldrw(tmp0, Address(post(buf, 4)));
3277     subs(len, len, 4);
3278     crc32w(crc, crc, tmp0);
3279     br(Assembler::GE, CRC_by4_loop);
3280     adds(len, len, 4);
3281     br(Assembler::LE, L_exit);
3282   BIND(CRC_by1_loop);
3283     ldrb(tmp0, Address(post(buf, 1)));
3284     subs(len, len, 1);
3285     crc32b(crc, crc, tmp0);
3286     br(Assembler::GT, CRC_by1_loop);
3287     b(L_exit);
3288 
3289   BIND(CRC_by64_pre);
3290     sub(buf, buf, 8);
3291     ldp(tmp0, tmp1, Address(buf, 8));
3292     crc32x(crc, crc, tmp0);
3293     ldr(tmp2, Address(buf, 24));
3294     crc32x(crc, crc, tmp1);
3295     ldr(tmp3, Address(buf, 32));
3296     crc32x(crc, crc, tmp2);
3297     ldr(tmp0, Address(buf, 40));
3298     crc32x(crc, crc, tmp3);
3299     ldr(tmp1, Address(buf, 48));
3300     crc32x(crc, crc, tmp0);
3301     ldr(tmp2, Address(buf, 56));
3302     crc32x(crc, crc, tmp1);
3303     ldr(tmp3, Address(pre(buf, 64)));
3304 
3305     b(CRC_by64_loop);
3306 
3307     align(CodeEntryAlignment);
3308   BIND(CRC_by64_loop);
3309     subs(len, len, 64);
3310     crc32x(crc, crc, tmp2);
3311     ldr(tmp0, Address(buf, 8));
3312     crc32x(crc, crc, tmp3);
3313     ldr(tmp1, Address(buf, 16));
3314     crc32x(crc, crc, tmp0);
3315     ldr(tmp2, Address(buf, 24));
3316     crc32x(crc, crc, tmp1);
3317     ldr(tmp3, Address(buf, 32));
3318     crc32x(crc, crc, tmp2);
3319     ldr(tmp0, Address(buf, 40));
3320     crc32x(crc, crc, tmp3);
3321     ldr(tmp1, Address(buf, 48));
3322     crc32x(crc, crc, tmp0);
3323     ldr(tmp2, Address(buf, 56));
3324     crc32x(crc, crc, tmp1);
3325     ldr(tmp3, Address(pre(buf, 64)));
3326     br(Assembler::GE, CRC_by64_loop);
3327 
3328     // post-loop
3329     crc32x(crc, crc, tmp2);
3330     crc32x(crc, crc, tmp3);
3331 
3332     sub(len, len, 64);
3333     add(buf, buf, 8);
3334     cmn(len, 128);
3335     br(Assembler::NE, CRC_less64);
3336   BIND(L_exit);
3337     mvnw(crc, crc);
3338 }
3339 
3340 /**
3341  * @param crc   register containing existing CRC (32-bit)
3342  * @param buf   register pointing to input byte buffer (byte*)
3343  * @param len   register containing number of bytes
 * @param table0..table3 registers that will contain the addresses of the CRC tables
3345  * @param tmp   scratch register
3346  */
3347 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3348         Register table0, Register table1, Register table2, Register table3,
3349         Register tmp, Register tmp2, Register tmp3) {
3350   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3351   unsigned long offset;
3352 
3353   if (UseCRC32) {
3354       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3355       return;
3356   }
3357 
3358     mvnw(crc, crc);
3359 
3360     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3361     if (offset) add(table0, table0, offset);
3362     add(table1, table0, 1*256*sizeof(juint));
3363     add(table2, table0, 2*256*sizeof(juint));
3364     add(table3, table0, 3*256*sizeof(juint));
3365 
3366   if (UseNeon) {
3367       cmp(len, (u1)64);
3368       br(Assembler::LT, L_by16);
3369       eor(v16, T16B, v16, v16);
3370 
3371     Label L_fold;
3372 
3373       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3374 
3375       ld1(v0, v1, T2D, post(buf, 32));
3376       ld1r(v4, T2D, post(tmp, 8));
3377       ld1r(v5, T2D, post(tmp, 8));
3378       ld1r(v6, T2D, post(tmp, 8));
3379       ld1r(v7, T2D, post(tmp, 8));
3380       mov(v16, T4S, 0, crc);
3381 
3382       eor(v0, T16B, v0, v16);
3383       sub(len, len, 64);
3384 
3385     BIND(L_fold);
3386       pmull(v22, T8H, v0, v5, T8B);
3387       pmull(v20, T8H, v0, v7, T8B);
3388       pmull(v23, T8H, v0, v4, T8B);
3389       pmull(v21, T8H, v0, v6, T8B);
3390 
3391       pmull2(v18, T8H, v0, v5, T16B);
3392       pmull2(v16, T8H, v0, v7, T16B);
3393       pmull2(v19, T8H, v0, v4, T16B);
3394       pmull2(v17, T8H, v0, v6, T16B);
3395 
3396       uzp1(v24, T8H, v20, v22);
3397       uzp2(v25, T8H, v20, v22);
3398       eor(v20, T16B, v24, v25);
3399 
3400       uzp1(v26, T8H, v16, v18);
3401       uzp2(v27, T8H, v16, v18);
3402       eor(v16, T16B, v26, v27);
3403 
3404       ushll2(v22, T4S, v20, T8H, 8);
3405       ushll(v20, T4S, v20, T4H, 8);
3406 
3407       ushll2(v18, T4S, v16, T8H, 8);
3408       ushll(v16, T4S, v16, T4H, 8);
3409 
3410       eor(v22, T16B, v23, v22);
3411       eor(v18, T16B, v19, v18);
3412       eor(v20, T16B, v21, v20);
3413       eor(v16, T16B, v17, v16);
3414 
3415       uzp1(v17, T2D, v16, v20);
3416       uzp2(v21, T2D, v16, v20);
3417       eor(v17, T16B, v17, v21);
3418 
3419       ushll2(v20, T2D, v17, T4S, 16);
3420       ushll(v16, T2D, v17, T2S, 16);
3421 
3422       eor(v20, T16B, v20, v22);
3423       eor(v16, T16B, v16, v18);
3424 
3425       uzp1(v17, T2D, v20, v16);
3426       uzp2(v21, T2D, v20, v16);
3427       eor(v28, T16B, v17, v21);
3428 
3429       pmull(v22, T8H, v1, v5, T8B);
3430       pmull(v20, T8H, v1, v7, T8B);
3431       pmull(v23, T8H, v1, v4, T8B);
3432       pmull(v21, T8H, v1, v6, T8B);
3433 
3434       pmull2(v18, T8H, v1, v5, T16B);
3435       pmull2(v16, T8H, v1, v7, T16B);
3436       pmull2(v19, T8H, v1, v4, T16B);
3437       pmull2(v17, T8H, v1, v6, T16B);
3438 
3439       ld1(v0, v1, T2D, post(buf, 32));
3440 
3441       uzp1(v24, T8H, v20, v22);
3442       uzp2(v25, T8H, v20, v22);
3443       eor(v20, T16B, v24, v25);
3444 
3445       uzp1(v26, T8H, v16, v18);
3446       uzp2(v27, T8H, v16, v18);
3447       eor(v16, T16B, v26, v27);
3448 
3449       ushll2(v22, T4S, v20, T8H, 8);
3450       ushll(v20, T4S, v20, T4H, 8);
3451 
3452       ushll2(v18, T4S, v16, T8H, 8);
3453       ushll(v16, T4S, v16, T4H, 8);
3454 
3455       eor(v22, T16B, v23, v22);
3456       eor(v18, T16B, v19, v18);
3457       eor(v20, T16B, v21, v20);
3458       eor(v16, T16B, v17, v16);
3459 
3460       uzp1(v17, T2D, v16, v20);
3461       uzp2(v21, T2D, v16, v20);
3462       eor(v16, T16B, v17, v21);
3463 
3464       ushll2(v20, T2D, v16, T4S, 16);
3465       ushll(v16, T2D, v16, T2S, 16);
3466 
3467       eor(v20, T16B, v22, v20);
3468       eor(v16, T16B, v16, v18);
3469 
3470       uzp1(v17, T2D, v20, v16);
3471       uzp2(v21, T2D, v20, v16);
3472       eor(v20, T16B, v17, v21);
3473 
3474       shl(v16, T2D, v28, 1);
3475       shl(v17, T2D, v20, 1);
3476 
3477       eor(v0, T16B, v0, v16);
3478       eor(v1, T16B, v1, v17);
3479 
3480       subs(len, len, 32);
3481       br(Assembler::GE, L_fold);
3482 
3483       mov(crc, 0);
3484       mov(tmp, v0, T1D, 0);
3485       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3486       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3487       mov(tmp, v0, T1D, 1);
3488       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3489       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3490       mov(tmp, v1, T1D, 0);
3491       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3492       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3493       mov(tmp, v1, T1D, 1);
3494       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3495       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3496 
3497       add(len, len, 32);
3498   }
3499 
3500   BIND(L_by16);
3501     subs(len, len, 16);
3502     br(Assembler::GE, L_by16_loop);
3503     adds(len, len, 16-4);
3504     br(Assembler::GE, L_by4_loop);
3505     adds(len, len, 4);
3506     br(Assembler::GT, L_by1_loop);
3507     b(L_exit);
3508 
3509   BIND(L_by4_loop);
3510     ldrw(tmp, Address(post(buf, 4)));
3511     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3512     subs(len, len, 4);
3513     br(Assembler::GE, L_by4_loop);
3514     adds(len, len, 4);
3515     br(Assembler::LE, L_exit);
3516   BIND(L_by1_loop);
3517     subs(len, len, 1);
3518     ldrb(tmp, Address(post(buf, 1)));
3519     update_byte_crc32(crc, tmp, table0);
3520     br(Assembler::GT, L_by1_loop);
3521     b(L_exit);
3522 
3523     align(CodeEntryAlignment);
3524   BIND(L_by16_loop);
3525     subs(len, len, 16);
3526     ldp(tmp, tmp3, Address(post(buf, 16)));
3527     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3528     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3529     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3530     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3531     br(Assembler::GE, L_by16_loop);
3532     adds(len, len, 16-4);
3533     br(Assembler::GE, L_by4_loop);
3534     adds(len, len, 4);
3535     br(Assembler::GT, L_by1_loop);
3536   BIND(L_exit);
3537     mvnw(crc, crc);
3538 }
3539 
3540 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3541         Register len, Register tmp0, Register tmp1, Register tmp2,
3542         Register tmp3) {
3543     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3544     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3545 
3546     subs(len, len, 128);
3547     br(Assembler::GE, CRC_by64_pre);
3548   BIND(CRC_less64);
3549     adds(len, len, 128-32);
3550     br(Assembler::GE, CRC_by32_loop);
3551   BIND(CRC_less32);
3552     adds(len, len, 32-4);
3553     br(Assembler::GE, CRC_by4_loop);
3554     adds(len, len, 4);
3555     br(Assembler::GT, CRC_by1_loop);
3556     b(L_exit);
3557 
3558   BIND(CRC_by32_loop);
3559     ldp(tmp0, tmp1, Address(post(buf, 16)));
3560     subs(len, len, 32);
3561     crc32cx(crc, crc, tmp0);
3562     ldr(tmp2, Address(post(buf, 8)));
3563     crc32cx(crc, crc, tmp1);
3564     ldr(tmp3, Address(post(buf, 8)));
3565     crc32cx(crc, crc, tmp2);
3566     crc32cx(crc, crc, tmp3);
3567     br(Assembler::GE, CRC_by32_loop);
3568     cmn(len, 32);
3569     br(Assembler::NE, CRC_less32);
3570     b(L_exit);
3571 
3572   BIND(CRC_by4_loop);
3573     ldrw(tmp0, Address(post(buf, 4)));
3574     subs(len, len, 4);
3575     crc32cw(crc, crc, tmp0);
3576     br(Assembler::GE, CRC_by4_loop);
3577     adds(len, len, 4);
3578     br(Assembler::LE, L_exit);
3579   BIND(CRC_by1_loop);
3580     ldrb(tmp0, Address(post(buf, 1)));
3581     subs(len, len, 1);
3582     crc32cb(crc, crc, tmp0);
3583     br(Assembler::GT, CRC_by1_loop);
3584     b(L_exit);
3585 
3586   BIND(CRC_by64_pre);
3587     sub(buf, buf, 8);
3588     ldp(tmp0, tmp1, Address(buf, 8));
3589     crc32cx(crc, crc, tmp0);
3590     ldr(tmp2, Address(buf, 24));
3591     crc32cx(crc, crc, tmp1);
3592     ldr(tmp3, Address(buf, 32));
3593     crc32cx(crc, crc, tmp2);
3594     ldr(tmp0, Address(buf, 40));
3595     crc32cx(crc, crc, tmp3);
3596     ldr(tmp1, Address(buf, 48));
3597     crc32cx(crc, crc, tmp0);
3598     ldr(tmp2, Address(buf, 56));
3599     crc32cx(crc, crc, tmp1);
3600     ldr(tmp3, Address(pre(buf, 64)));
3601 
3602     b(CRC_by64_loop);
3603 
3604     align(CodeEntryAlignment);
3605   BIND(CRC_by64_loop);
3606     subs(len, len, 64);
3607     crc32cx(crc, crc, tmp2);
3608     ldr(tmp0, Address(buf, 8));
3609     crc32cx(crc, crc, tmp3);
3610     ldr(tmp1, Address(buf, 16));
3611     crc32cx(crc, crc, tmp0);
3612     ldr(tmp2, Address(buf, 24));
3613     crc32cx(crc, crc, tmp1);
3614     ldr(tmp3, Address(buf, 32));
3615     crc32cx(crc, crc, tmp2);
3616     ldr(tmp0, Address(buf, 40));
3617     crc32cx(crc, crc, tmp3);
3618     ldr(tmp1, Address(buf, 48));
3619     crc32cx(crc, crc, tmp0);
3620     ldr(tmp2, Address(buf, 56));
3621     crc32cx(crc, crc, tmp1);
3622     ldr(tmp3, Address(pre(buf, 64)));
3623     br(Assembler::GE, CRC_by64_loop);
3624 
3625     // post-loop
3626     crc32cx(crc, crc, tmp2);
3627     crc32cx(crc, crc, tmp3);
3628 
3629     sub(len, len, 64);
3630     add(buf, buf, 8);
3631     cmn(len, 128);
3632     br(Assembler::NE, CRC_less64);
3633   BIND(L_exit);
3634 }
3635 
3636 /**
3637  * @param crc   register containing existing CRC (32-bit)
3638  * @param buf   register pointing to input byte buffer (byte*)
3639  * @param len   register containing number of bytes
 * @param table0..table3 registers that will contain the addresses of the CRC tables
3641  * @param tmp   scratch register
3642  */
3643 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3644         Register table0, Register table1, Register table2, Register table3,
3645         Register tmp, Register tmp2, Register tmp3) {
3646   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3647 }
3648 
3649 
3650 SkipIfEqual::SkipIfEqual(
3651     MacroAssembler* masm, const bool* flag_addr, bool value) {
3652   _masm = masm;
3653   unsigned long offset;
3654   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3655   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3656   _masm->cbzw(rscratch1, _label);
3657 }
3658 
3659 SkipIfEqual::~SkipIfEqual() {
3660   _masm->bind(_label);
3661 }
3662 
3663 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3664   Address adr;
3665   switch(dst.getMode()) {
3666   case Address::base_plus_offset:
3667     // This is the expected mode, although we allow all the other
3668     // forms below.
3669     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3670     break;
3671   default:
3672     lea(rscratch2, dst);
3673     adr = Address(rscratch2);
3674     break;
3675   }
3676   ldr(rscratch1, adr);
3677   add(rscratch1, rscratch1, src);
3678   str(rscratch1, adr);
3679 }
3680 
3681 void MacroAssembler::cmpptr(Register src1, Address src2) {
3682   unsigned long offset;
3683   adrp(rscratch1, src2, offset);
3684   ldr(rscratch1, Address(rscratch1, offset));
3685   cmp(src1, rscratch1);
3686 }
3687 
3688 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3689   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3690   bs->obj_equals(this, obj1, obj2);
3691 }
3692 
3693 void MacroAssembler::load_klass(Register dst, Register src) {
3694   if (UseCompressedClassPointers) {
3695     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3696     decode_klass_not_null(dst);
3697   } else {
3698     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3699   }
3700 }
3701 
3702 // ((OopHandle)result).resolve();
3703 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3704   // OopHandle::resolve is an indirection.
3705   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3706 }
3707 
3708 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3709   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3710   ldr(dst, Address(rmethod, Method::const_offset()));
3711   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3712   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3713   ldr(dst, Address(dst, mirror_offset));
3714   resolve_oop_handle(dst, tmp);
3715 }
3716 
3717 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3718   if (UseCompressedClassPointers) {
3719     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3720     if (Universe::narrow_klass_base() == NULL) {
3721       cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
3722       return;
3723     } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3724                && Universe::narrow_klass_shift() == 0) {
3725       // Only the bottom 32 bits matter
3726       cmpw(trial_klass, tmp);
3727       return;
3728     }
3729     decode_klass_not_null(tmp);
3730   } else {
3731     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3732   }
3733   cmp(trial_klass, tmp);
3734 }
3735 
3736 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3737   load_klass(dst, src);
3738   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3739 }
3740 
3741 void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store release?  Concurrent GCs assume the
  // klass length is valid if the klass field is not null.
3744   if (UseCompressedClassPointers) {
3745     encode_klass_not_null(src);
3746     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3747   } else {
3748     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3749   }
3750 }
3751 
3752 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3753   if (UseCompressedClassPointers) {
3754     // Store to klass gap in destination
3755     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3756   }
3757 }
3758 
3759 // Algorithm must match CompressedOops::encode.
3760 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3761 #ifdef ASSERT
3762   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3763 #endif
3764   verify_oop(s, "broken oop in encode_heap_oop");
3765   if (Universe::narrow_oop_base() == NULL) {
3766     if (Universe::narrow_oop_shift() != 0) {
3767       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3768       lsr(d, s, LogMinObjAlignmentInBytes);
3769     } else {
3770       mov(d, s);
3771     }
3772   } else {
3773     subs(d, s, rheapbase);
3774     csel(d, d, zr, Assembler::HS);
3775     lsr(d, d, LogMinObjAlignmentInBytes);
3776 
3777     /*  Old algorithm: is this any worse?
3778     Label nonnull;
3779     cbnz(r, nonnull);
3780     sub(r, r, rheapbase);
3781     bind(nonnull);
3782     lsr(r, r, LogMinObjAlignmentInBytes);
3783     */
3784   }
3785 }
3786 
3787 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3788 #ifdef ASSERT
3789   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3790   if (CheckCompressedOops) {
3791     Label ok;
3792     cbnz(r, ok);
3793     stop("null oop passed to encode_heap_oop_not_null");
3794     bind(ok);
3795   }
3796 #endif
3797   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3798   if (Universe::narrow_oop_base() != NULL) {
3799     sub(r, r, rheapbase);
3800   }
3801   if (Universe::narrow_oop_shift() != 0) {
3802     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3803     lsr(r, r, LogMinObjAlignmentInBytes);
3804   }
3805 }
3806 
3807 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3808 #ifdef ASSERT
3809   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3810   if (CheckCompressedOops) {
3811     Label ok;
3812     cbnz(src, ok);
3813     stop("null oop passed to encode_heap_oop_not_null2");
3814     bind(ok);
3815   }
3816 #endif
3817   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3818 
3819   Register data = src;
3820   if (Universe::narrow_oop_base() != NULL) {
3821     sub(dst, src, rheapbase);
3822     data = dst;
3823   }
3824   if (Universe::narrow_oop_shift() != 0) {
3825     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3826     lsr(dst, data, LogMinObjAlignmentInBytes);
3827     data = dst;
3828   }
3829   if (data == src)
3830     mov(dst, src);
3831 }
3832 
3833 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3834 #ifdef ASSERT
3835   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3836 #endif
3837   if (Universe::narrow_oop_base() == NULL) {
3838     if (Universe::narrow_oop_shift() != 0 || d != s) {
3839       lsl(d, s, Universe::narrow_oop_shift());
3840     }
3841   } else {
3842     Label done;
3843     if (d != s)
3844       mov(d, s);
3845     cbz(s, done);
3846     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3847     bind(done);
3848   }
3849   verify_oop(d, "broken oop in decode_heap_oop");
3850 }
3851 
3852 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3853   assert (UseCompressedOops, "should only be used for compressed headers");
3854   assert (Universe::heap() != NULL, "java heap should be initialized");
3855   // Cannot assert, unverified entry point counts instructions (see .ad file)
3856   // vtableStubs also counts instructions in pd_code_size_limit.
3857   // Also do not verify_oop as this is called by verify_oop.
3858   if (Universe::narrow_oop_shift() != 0) {
3859     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3860     if (Universe::narrow_oop_base() != NULL) {
3861       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3862     } else {
3863       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3864     }
3865   } else {
3866     assert (Universe::narrow_oop_base() == NULL, "sanity");
3867   }
3868 }
3869 
3870 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3871   assert (UseCompressedOops, "should only be used for compressed headers");
3872   assert (Universe::heap() != NULL, "java heap should be initialized");
3873   // Cannot assert, unverified entry point counts instructions (see .ad file)
3874   // vtableStubs also counts instructions in pd_code_size_limit.
3875   // Also do not verify_oop as this is called by verify_oop.
3876   if (Universe::narrow_oop_shift() != 0) {
3877     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3878     if (Universe::narrow_oop_base() != NULL) {
3879       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3880     } else {
3881       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3882     }
3883   } else {
3884     assert (Universe::narrow_oop_base() == NULL, "sanity");
3885     if (dst != src) {
3886       mov(dst, src);
3887     }
3888   }
3889 }
3890 
3891 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3892   if (Universe::narrow_klass_base() == NULL) {
3893     if (Universe::narrow_klass_shift() != 0) {
3894       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3895       lsr(dst, src, LogKlassAlignmentInBytes);
3896     } else {
3897       if (dst != src) mov(dst, src);
3898     }
3899     return;
3900   }
3901 
3902   if (use_XOR_for_compressed_class_base) {
3903     if (Universe::narrow_klass_shift() != 0) {
3904       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3905       lsr(dst, dst, LogKlassAlignmentInBytes);
3906     } else {
3907       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3908     }
3909     return;
3910   }
3911 
3912   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3913       && Universe::narrow_klass_shift() == 0) {
3914     movw(dst, src);
3915     return;
3916   }
3917 
3918 #ifdef ASSERT
3919   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3920 #endif
3921 
3922   Register rbase = dst;
3923   if (dst == src) rbase = rheapbase;
3924   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3925   sub(dst, src, rbase);
3926   if (Universe::narrow_klass_shift() != 0) {
3927     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3928     lsr(dst, dst, LogKlassAlignmentInBytes);
3929   }
3930   if (dst == src) reinit_heapbase();
3931 }
3932 
3933 void MacroAssembler::encode_klass_not_null(Register r) {
3934   encode_klass_not_null(r, r);
3935 }
3936 
3937 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3938   Register rbase = dst;
3939   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3940 
3941   if (Universe::narrow_klass_base() == NULL) {
3942     if (Universe::narrow_klass_shift() != 0) {
3943       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3944       lsl(dst, src, LogKlassAlignmentInBytes);
3945     } else {
3946       if (dst != src) mov(dst, src);
3947     }
3948     return;
3949   }
3950 
3951   if (use_XOR_for_compressed_class_base) {
3952     if (Universe::narrow_klass_shift() != 0) {
3953       lsl(dst, src, LogKlassAlignmentInBytes);
3954       eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
3955     } else {
3956       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3957     }
3958     return;
3959   }
3960 
3961   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3962       && Universe::narrow_klass_shift() == 0) {
3963     if (dst != src)
3964       movw(dst, src);
3965     movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
3966     return;
3967   }
3968 
3969   // Cannot assert, unverified entry point counts instructions (see .ad file)
3970   // vtableStubs also counts instructions in pd_code_size_limit.
3971   // Also do not verify_oop as this is called by verify_oop.
3972   if (dst == src) rbase = rheapbase;
3973   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3974   if (Universe::narrow_klass_shift() != 0) {
3975     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3976     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
3977   } else {
3978     add(dst, rbase, src);
3979   }
3980   if (dst == src) reinit_heapbase();
3981 }
3982 
3983 void  MacroAssembler::decode_klass_not_null(Register r) {
3984   decode_klass_not_null(r, r);
3985 }
3986 
3987 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3988 #ifdef ASSERT
3989   {
3990     ThreadInVMfromUnknown tiv;
3991     assert (UseCompressedOops, "should only be used for compressed oops");
3992     assert (Universe::heap() != NULL, "java heap should be initialized");
3993     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3994     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3995   }
3996 #endif
3997   int oop_index = oop_recorder()->find_index(obj);
3998   InstructionMark im(this);
3999   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4000   code_section()->relocate(inst_mark(), rspec);
4001   movz(dst, 0xDEAD, 16);
4002   movk(dst, 0xBEEF);
4003 }
4004 
4005 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
4006   assert (UseCompressedClassPointers, "should only be used for compressed headers");
4007   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
4008   int index = oop_recorder()->find_index(k);
4009   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
4010 
4011   InstructionMark im(this);
4012   RelocationHolder rspec = metadata_Relocation::spec(index);
4013   code_section()->relocate(inst_mark(), rspec);
4014   narrowKlass nk = Klass::encode_klass(k);
4015   movz(dst, (nk >> 16), 16);
4016   movk(dst, nk & 0xffff);
4017 }
4018 
4019 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4020                                     Register dst, Address src,
4021                                     Register tmp1, Register thread_tmp) {
4022   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4023   decorators = AccessInternal::decorator_fixup(decorators);
4024   bool as_raw = (decorators & AS_RAW) != 0;
4025   if (as_raw) {
4026     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4027   } else {
4028     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4029   }
4030 }
4031 
4032 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4033                                      Address dst, Register src,
4034                                      Register tmp1, Register thread_tmp) {
4035   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4036   decorators = AccessInternal::decorator_fixup(decorators);
4037   bool as_raw = (decorators & AS_RAW) != 0;
4038   if (as_raw) {
4039     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4040   } else {
4041     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4042   }
4043 }
4044 
4045 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
4046   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
4047   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
4048     decorators |= ACCESS_READ | ACCESS_WRITE;
4049   }
4050   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4051   return bs->resolve(this, decorators, obj);
4052 }
4053 
4054 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4055                                    Register thread_tmp, DecoratorSet decorators) {
4056   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4057 }
4058 
4059 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4060                                             Register thread_tmp, DecoratorSet decorators) {
4061   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4062 }
4063 
4064 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4065                                     Register thread_tmp, DecoratorSet decorators) {
4066   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4067 }
4068 
4069 // Used for storing NULLs.
4070 void MacroAssembler::store_heap_oop_null(Address dst) {
4071   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4072 }
4073 
4074 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4075   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4076   int index = oop_recorder()->allocate_metadata_index(obj);
4077   RelocationHolder rspec = metadata_Relocation::spec(index);
4078   return Address((address)obj, rspec);
4079 }
4080 
// Move an oop into a register.  immediate is true if we want immediate
// instructions, i.e. we are not going to patch this instruction while
// the code is being executed by another thread.  In that case we can
// use move immediates rather than the constant pool.
4085 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4086   int oop_index;
4087   if (obj == NULL) {
4088     oop_index = oop_recorder()->allocate_oop_index(obj);
4089   } else {
4090 #ifdef ASSERT
4091     {
4092       ThreadInVMfromUnknown tiv;
4093       assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
4094     }
4095 #endif
4096     oop_index = oop_recorder()->find_index(obj);
4097   }
4098   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4099   if (! immediate) {
4100     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4101     ldr_constant(dst, Address(dummy, rspec));
4102   } else
4103     mov(dst, Address((address)obj, rspec));
4104 }
4105 
4106 // Move a metadata address into a register.
4107 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4108   int oop_index;
4109   if (obj == NULL) {
4110     oop_index = oop_recorder()->allocate_metadata_index(obj);
4111   } else {
4112     oop_index = oop_recorder()->find_index(obj);
4113   }
4114   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4115   mov(dst, Address((address)obj, rspec));
4116 }
4117 
4118 Address MacroAssembler::constant_oop_address(jobject obj) {
4119 #ifdef ASSERT
4120   {
4121     ThreadInVMfromUnknown tiv;
4122     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4123     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
4124   }
4125 #endif
4126   int oop_index = oop_recorder()->find_index(obj);
4127   return Address((address)obj, oop_Relocation::spec(oop_index));
4128 }
4129 
4130 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4131 void MacroAssembler::tlab_allocate(Register obj,
4132                                    Register var_size_in_bytes,
4133                                    int con_size_in_bytes,
4134                                    Register t1,
4135                                    Register t2,
4136                                    Label& slow_case) {
4137   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4138   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4139 }
4140 
4141 // Defines obj, preserves var_size_in_bytes
4142 void MacroAssembler::eden_allocate(Register obj,
4143                                    Register var_size_in_bytes,
4144                                    int con_size_in_bytes,
4145                                    Register t1,
4146                                    Label& slow_case) {
4147   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4148   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4149 }
4150 
4151 // Zero words; len is in bytes
4152 // Destroys all registers except addr
4153 // len must be a nonzero multiple of wordSize
4154 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4155   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4156 
4157 #ifdef ASSERT
4158   { Label L;
4159     tst(len, BytesPerWord - 1);
4160     br(Assembler::EQ, L);
4161     stop("len is not a multiple of BytesPerWord");
4162     bind(L);
4163   }
4164 #endif
4165 
4166 #ifndef PRODUCT
4167   block_comment("zero memory");
4168 #endif
4169 
4170   Label loop;
4171   Label entry;
4172 
4173 //  Algorithm:
4174 //
4175 //    scratch1 = cnt & 7;
4176 //    cnt -= scratch1;
4177 //    p += scratch1;
4178 //    switch (scratch1) {
4179 //      do {
4180 //        cnt -= 8;
4181 //          p[-8] = 0;
4182 //        case 7:
4183 //          p[-7] = 0;
4184 //        case 6:
4185 //          p[-6] = 0;
4186 //          // ...
4187 //        case 1:
4188 //          p[-1] = 0;
4189 //        case 0:
4190 //          p += 8;
4191 //      } while (cnt);
4192 //    }
4193 
4194   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4195 
4196   lsr(len, len, LogBytesPerWord);
4197   andr(rscratch1, len, unroll - 1);  // tmp1 = cnt % unroll
4198   sub(len, len, rscratch1);      // cnt -= unroll
4199   // t1 always points to the end of the region we're about to zero
4200   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
4201   adr(rscratch2, entry);
4202   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4203   br(rscratch2);
4204   bind(loop);
4205   sub(len, len, unroll);
4206   for (int i = -unroll; i < 0; i++)
4207     Assembler::str(zr, Address(t1, i * wordSize));
4208   bind(entry);
4209   add(t1, t1, unroll * wordSize);
4210   cbnz(len, loop);
4211 }
4212 
4213 void MacroAssembler::verify_tlab() {
4214 #ifdef ASSERT
4215   if (UseTLAB && VerifyOops) {
4216     Label next, ok;
4217 
4218     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4219 
4220     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4221     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4222     cmp(rscratch2, rscratch1);
4223     br(Assembler::HS, next);
4224     STOP("assert(top >= start)");
4225     should_not_reach_here();
4226 
4227     bind(next);
4228     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4229     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4230     cmp(rscratch2, rscratch1);
4231     br(Assembler::HS, ok);
4232     STOP("assert(top <= end)");
4233     should_not_reach_here();
4234 
4235     bind(ok);
4236     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4237   }
4238 #endif
4239 }
4240 
// Writes to successive stack pages until the given size is reached, to
// check for stack overflow plus shadow pages.  This clobbers tmp.
4243 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4244   assert_different_registers(tmp, size, rscratch1);
4245   mov(tmp, sp);
4246   // Bang stack for total size given plus shadow page size.
4247   // Bang one page at a time because large size can bang beyond yellow and
4248   // red zones.
4249   Label loop;
4250   mov(rscratch1, os::vm_page_size());
4251   bind(loop);
4252   lea(tmp, Address(tmp, -os::vm_page_size()));
4253   subsw(size, size, rscratch1);
4254   str(size, Address(tmp));
4255   br(Assembler::GT, loop);
4256 
4257   // Bang down shadow pages too.
4258   // At this point, (tmp-0) is the last address touched, so don't
4259   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4260   // was post-decremented.)  Skip this address by starting at i=1, and
4261   // touch a few more pages below.  N.B.  It is important to touch all
4262   // the way down to and including i=StackShadowPages.
4263   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
    // This could be a move of any size, but since it can serve as a
    // debugging crumb, the bigger the better.
4266     lea(tmp, Address(tmp, -os::vm_page_size()));
4267     str(size, Address(tmp));
4268   }
4269 }
4270 
4271 
4272 // Move the address of the polling page into dest.
4273 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4274   if (SafepointMechanism::uses_thread_local_poll()) {
4275     ldr(dest, Address(rthread, Thread::polling_page_offset()));
4276   } else {
4277     unsigned long off;
4278     adrp(dest, Address(page, rtype), off);
4279     assert(off == 0, "polling page must be page aligned");
4280   }
4281 }
4282 
4283 // Move the address of the polling page into r, then read the polling
4284 // page.
4285 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4286   get_polling_page(r, page, rtype);
4287   return read_polling_page(r, rtype);
4288 }
4289 
4290 // Read the polling page.  The address of the polling page must
4291 // already be in r.
4292 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4293   InstructionMark im(this);
4294   code_section()->relocate(inst_mark(), rtype);
4295   ldrw(zr, Address(r, 0));
4296   return inst_mark();
4297 }
4298 
4299 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4300   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4301   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4302   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4303   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4304   long offset_low = dest_page - low_page;
4305   long offset_high = dest_page - high_page;
4306 
4307   assert(is_valid_AArch64_address(dest.target()), "bad address");
4308   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4309 
4310   InstructionMark im(this);
4311   code_section()->relocate(inst_mark(), dest.rspec());
4312   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4313   // the code cache so that if it is relocated we know it will still reach
4314   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4315     _adrp(reg1, dest.target());
4316   } else {
4317     unsigned long target = (unsigned long)dest.target();
4318     unsigned long adrp_target
4319       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4320 
4321     _adrp(reg1, (address)adrp_target);
4322     movk(reg1, target >> 32, 32);
4323   }
4324   byte_offset = (unsigned long)dest.target() & 0xfff;
4325 }
4326 
4327 void MacroAssembler::load_byte_map_base(Register reg) {
4328   CardTable::CardValue* byte_map_base =
4329     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4330 
4331   if (is_valid_AArch64_address((address)byte_map_base)) {
4332     // Strictly speaking the byte_map_base isn't an address at all,
4333     // and it might even be negative.
4334     unsigned long offset;
4335     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4336     // We expect offset to be zero with most collectors.
4337     if (offset != 0) {
4338       add(reg, reg, offset);
4339     }
4340   } else {
4341     mov(reg, (uint64_t)byte_map_base);
4342   }
4343 }
4344 
4345 void MacroAssembler::build_frame(int framesize) {
4346   assert(framesize > 0, "framesize must be > 0");
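       // Small frames: allocate with a single sub and store rfp/lr with
       // stp's scaled signed offset (the (1 << 9) bound keeps
       // framesize - 2 * wordSize in range).  Larger frames push rfp/lr
       // first and then drop sp, going via rscratch1 once the amount no
       // longer fits sub's 12-bit immediate.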
4347   if (framesize < ((1 << 9) + 2 * wordSize)) {
4348     sub(sp, sp, framesize);
4349     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4350     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4351   } else {
4352     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4353     if (PreserveFramePointer) mov(rfp, sp);
4354     if (framesize < ((1 << 12) + 2 * wordSize))
4355       sub(sp, sp, framesize - 2 * wordSize);
4356     else {
4357       mov(rscratch1, framesize - 2 * wordSize);
4358       sub(sp, sp, rscratch1);
4359     }
4360   }
4361 }
4362 
4363 void MacroAssembler::remove_frame(int framesize) {
4364   assert(framesize > 0, "framesize must be > 0");
4365   if (framesize < ((1 << 9) + 2 * wordSize)) {
4366     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4367     add(sp, sp, framesize);
4368   } else {
4369     if (framesize < ((1 << 12) + 2 * wordSize))
4370       add(sp, sp, framesize - 2 * wordSize);
4371     else {
4372       mov(rscratch1, framesize - 2 * wordSize);
4373       add(sp, sp, rscratch1);
4374     }
4375     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4376   }
4377 }
4378 
4379 #ifdef COMPILER2
4380 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4381 
4382 // Search for str1 in str2 and return index or -1
4383 void MacroAssembler::string_indexof(Register str2, Register str1,
4384                                     Register cnt2, Register cnt1,
4385                                     Register tmp1, Register tmp2,
4386                                     Register tmp3, Register tmp4,
4387                                     Register tmp5, Register tmp6,
4388                                     int icnt1, Register result, int ae) {
4389   // NOTE: tmp5, tmp6 can be zr depending on specific method version
4390   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4391 
4392   Register ch1 = rscratch1;
4393   Register ch2 = rscratch2;
4394   Register cnt1tmp = tmp1;
4395   Register cnt2tmp = tmp2;
4396   Register cnt1_neg = cnt1;
4397   Register cnt2_neg = cnt2;
4398   Register result_tmp = tmp4;
4399 
4400   bool isL = ae == StrIntrinsicNode::LL;
4401 
4402   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4403   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4404   int str1_chr_shift = str1_isL ? 0:1;
4405   int str2_chr_shift = str2_isL ? 0:1;
4406   int str1_chr_size = str1_isL ? 1:2;
4407   int str2_chr_size = str2_isL ? 1:2;
4408   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4409                                       (chr_insn)&MacroAssembler::ldrh;
4410   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4411                                       (chr_insn)&MacroAssembler::ldrh;
4412   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4413   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4414 
4415   // Note, inline_string_indexOf() generates checks:
4416   // if (substr.count > string.count) return -1;
4417   // if (substr.count == 0) return 0;
4418 
4419   // We have two strings, a source string in str2, cnt2 and a pattern string
4420   // in str1, cnt1. Find the first occurrence of the pattern in the source or return -1.
4421 
4422   // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
4423   // With a small pattern and source we use linear scan.
4424 
4425   if (icnt1 == -1) {
4426     sub(result_tmp, cnt2, cnt1);
4427     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4428     br(LT, LINEARSEARCH);
4429     dup(v0, T16B, cnt1); // done in a separate FPU pipeline; almost no penalty
4430     subs(zr, cnt1, 256);
4431     lsr(tmp1, cnt2, 2);
4432     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be at least 4x the pattern for BM
4433     br(GE, LINEARSTUB);
4434   }
4435 
4436 // The Boyer-Moore algorithm is based on the description here:-
4437 //
4438 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4439 //
4440 // This describes an algorithm with two shift rules: the 'Bad Character'
4441 // rule and the 'Good Suffix' rule.
4442 //
4443 // These rules are essentially heuristics for how far we can shift the
4444 // pattern along the search string.
4445 //
4446 // The implementation here uses the 'Bad Character' rule only because of the
4447 // complexity of initialisation for the 'Good Suffix' rule.
4448 //
4449 // This is also known as the Boyer-Moore-Horspool algorithm:-
4450 //
4451 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4452 //
4453 // This particular implementation has a few Java-specific optimizations.
4454 //
4455 // #define ASIZE 256
4456 //
4457 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4458 //       int i, j;
4459 //       unsigned c;
4460 //       unsigned char bc[ASIZE];
4461 //
4462 //       /* Preprocessing */
4463 //       for (i = 0; i < ASIZE; ++i)
4464 //          bc[i] = m;
4465 //       for (i = 0; i < m - 1; ) {
4466 //          c = x[i];
4467 //          ++i;
4468 //          // c < 256 for a Latin1 string, so no need for a branch
4469 //          #ifdef PATTERN_STRING_IS_LATIN1
4470 //          bc[c] = m - i;
4471 //          #else
4472 //          if (c < ASIZE) bc[c] = m - i;
4473 //          #endif
4474 //       }
4475 //
4476 //       /* Searching */
4477 //       j = 0;
4478 //       while (j <= n - m) {
4479 //          c = y[j+m-1];
4480 //          if (x[m-1] == c) {
4481 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
4482 //            if (i < 0) return j; }
4483 //          // c < 256 for a Latin1 string, so no need for a branch
4484 //          #ifdef SOURCE_STRING_IS_LATIN1
4485 //          // LL case: (c < 256) is always true, so the branch is removed
4486 //          j += bc[y[j+m-1]];
4487 //          #endif
4488 //          #ifndef PATTERN_STRING_IS_UTF
4489 //          // UU case: need the if (c < ASIZE) check. Skip 1 character if not.
4490 //          if (c < ASIZE)
4491 //            j += bc[y[j+m-1]];
4492 //          else
4493 //            j += 1;
4494 //          #endif
4495 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
4496 //          // UL case: need the if (c < ASIZE) check. Skip <pattern length> if not.
4497 //          if (c < ASIZE)
4498 //            j += bc[y[j+m-1]];
4499 //          else
4500 //            j += m;
4501 //          #endif
4502 //       }
4503 //    }
4504 
4505   if (icnt1 == -1) {
4506     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4507         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4508     Register cnt1end = tmp2;
4509     Register str2end = cnt2;
4510     Register skipch = tmp2;
4511 
4512     // str1 length is >= 8, so we can read at least one register when
4513     // conversion between UTF and Latin1 is not needed (8 chars for LL or
4514     // 4 for UU) and half a register in the UL case. We'll re-read the last
4515     // character in the inner pre-loop code so a single outer pre-loop load suffices.
4516     const int firstStep = isL ? 7 : 3;
4517 
4518     const int ASIZE = 256;
4519     const int STORED_BYTES = 32; // number of bytes stored per instruction
4520     sub(sp, sp, ASIZE);
4521     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4522     mov(ch1, sp);
4523     BIND(BM_INIT_LOOP);
4524       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4525       subs(tmp5, tmp5, 1);
4526       br(GT, BM_INIT_LOOP);
4527 
4528       sub(cnt1tmp, cnt1, 1);
4529       mov(tmp5, str2);
4530       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4531       sub(ch2, cnt1, 1);
4532       mov(tmp3, str1);
4533     BIND(BCLOOP);
4534       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4535       if (!str1_isL) {
4536         subs(zr, ch1, ASIZE);
4537         br(HS, BCSKIP);
4538       }
4539       strb(ch2, Address(sp, ch1));
4540     BIND(BCSKIP);
4541       subs(ch2, ch2, 1);
4542       br(GT, BCLOOP);
4543 
4544       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4545       if (str1_isL == str2_isL) {
4546         // load last 8 bytes (8 LL / 4 UU symbols)
4547         ldr(tmp6, Address(tmp6, -wordSize));
4548       } else {
4549         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes (4 symbols)
4550         // convert Latin1 to UTF. We have to wait until the load completes,
4551         // but it's still faster than per-character loads and checks
4552         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4553         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4554         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4555         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4556         orr(ch2, ch1, ch2, LSL, 16);
4557         orr(tmp6, tmp6, tmp3, LSL, 48);
4558         orr(tmp6, tmp6, ch2, LSL, 16);
4559       }
4560     BIND(BMLOOPSTR2);
4561       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4562       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4563       if (str1_isL == str2_isL) {
4564         // re-init tmp3. It's free because it executes in parallel with the
4565         // load above. The alternative is to initialize it before the loop, but
4566         // that hurts performance on in-order systems with 2 or more ld/st pipelines
4567         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4568       }
4569       if (!isL) { // UU/UL case
4570         lsl(ch2, cnt1tmp, 1); // offset in bytes
4571       }
4572       cmp(tmp3, skipch);
4573       br(NE, BMSKIP);
4574       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4575       mov(ch1, tmp6);
4576       if (isL) {
4577         b(BMLOOPSTR1_AFTER_LOAD);
4578       } else {
4579         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4580         b(BMLOOPSTR1_CMP);
4581       }
4582     BIND(BMLOOPSTR1);
4583       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4584       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4585     BIND(BMLOOPSTR1_AFTER_LOAD);
4586       subs(cnt1tmp, cnt1tmp, 1);
4587       br(LT, BMLOOPSTR1_LASTCMP);
4588     BIND(BMLOOPSTR1_CMP);
4589       cmp(ch1, ch2);
4590       br(EQ, BMLOOPSTR1);
4591     BIND(BMSKIP);
4592       if (!isL) {
4593         // if we've met a UTF symbol while searching with a Latin1 pattern, we
4594         // can skip cnt1 symbols
4595         if (str1_isL != str2_isL) {
4596           mov(result_tmp, cnt1);
4597         } else {
4598           mov(result_tmp, 1);
4599         }
4600         subs(zr, skipch, ASIZE);
4601         br(HS, BMADV);
4602       }
4603       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4604     BIND(BMADV);
4605       sub(cnt1tmp, cnt1, 1);
4606       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4607       cmp(str2, str2end);
4608       br(LE, BMLOOPSTR2);
4609       add(sp, sp, ASIZE);
4610       b(NOMATCH);
4611     BIND(BMLOOPSTR1_LASTCMP);
4612       cmp(ch1, ch2);
4613       br(NE, BMSKIP);
4614     BIND(BMMATCH);
4615       sub(result, str2, tmp5);
4616       if (!str2_isL) lsr(result, result, 1);
4617       add(sp, sp, ASIZE);
4618       b(DONE);
4619 
4620     BIND(LINEARSTUB);
4621     cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
4622     br(LT, LINEAR_MEDIUM);
4623     mov(result, zr);
4624     RuntimeAddress stub = NULL;
4625     if (isL) {
4626       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4627       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4628     } else if (str1_isL) {
4629       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
4630       assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4631     } else {
4632       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4633       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4634     }
4635     trampoline_call(stub);
4636     b(DONE);
4637   }
4638 
4639   BIND(LINEARSEARCH);
4640   {
4641     Label DO1, DO2, DO3;
4642 
4643     Register str2tmp = tmp2;
4644     Register first = tmp3;
4645 
4646     if (icnt1 == -1)
4647     {
4648         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4649 
4650         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
4651         br(LT, DOSHORT);
4652       BIND(LINEAR_MEDIUM);
4653         (this->*str1_load_1chr)(first, Address(str1));
4654         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4655         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4656         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4657         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4658 
4659       BIND(FIRST_LOOP);
4660         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4661         cmp(first, ch2);
4662         br(EQ, STR1_LOOP);
4663       BIND(STR2_NEXT);
4664         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4665         br(LE, FIRST_LOOP);
4666         b(NOMATCH);
4667 
4668       BIND(STR1_LOOP);
4669         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4670         add(cnt2tmp, cnt2_neg, str2_chr_size);
4671         br(GE, MATCH);
4672 
4673       BIND(STR1_NEXT);
4674         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4675         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4676         cmp(ch1, ch2);
4677         br(NE, STR2_NEXT);
4678         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4679         add(cnt2tmp, cnt2tmp, str2_chr_size);
4680         br(LT, STR1_NEXT);
4681         b(MATCH);
4682 
4683       BIND(DOSHORT);
4684       if (str1_isL == str2_isL) {
4685         cmp(cnt1, (u1)2);
4686         br(LT, DO1);
4687         br(GT, DO3);
4688       }
4689     }
4690 
4691     if (icnt1 == 4) {
4692       Label CH1_LOOP;
4693 
4694         (this->*load_4chr)(ch1, str1);
4695         sub(result_tmp, cnt2, 4);
4696         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4697         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4698 
4699       BIND(CH1_LOOP);
4700         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4701         cmp(ch1, ch2);
4702         br(EQ, MATCH);
4703         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4704         br(LE, CH1_LOOP);
4705         b(NOMATCH);
4706       }
4707 
4708     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4709       Label CH1_LOOP;
4710 
4711       BIND(DO2);
4712         (this->*load_2chr)(ch1, str1);
4713         if (icnt1 == 2) {
4714           sub(result_tmp, cnt2, 2);
4715         }
4716         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4717         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4718       BIND(CH1_LOOP);
4719         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4720         cmp(ch1, ch2);
4721         br(EQ, MATCH);
4722         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4723         br(LE, CH1_LOOP);
4724         b(NOMATCH);
4725     }
4726 
4727     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4728       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4729 
4730       BIND(DO3);
4731         (this->*load_2chr)(first, str1);
4732         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4733         if (icnt1 == 3) {
4734           sub(result_tmp, cnt2, 3);
4735         }
4736         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4737         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4738       BIND(FIRST_LOOP);
4739         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4740         cmpw(first, ch2);
4741         br(EQ, STR1_LOOP);
4742       BIND(STR2_NEXT);
4743         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4744         br(LE, FIRST_LOOP);
4745         b(NOMATCH);
4746 
4747       BIND(STR1_LOOP);
4748         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4749         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4750         cmp(ch1, ch2);
4751         br(NE, STR2_NEXT);
4752         b(MATCH);
4753     }
4754 
4755     if (icnt1 == -1 || icnt1 == 1) {
4756       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4757 
4758       BIND(DO1);
4759         (this->*str1_load_1chr)(ch1, str1);
4760         cmp(cnt2, (u1)8);
4761         br(LT, DO1_SHORT);
4762 
4763         sub(result_tmp, cnt2, 8/str2_chr_size);
4764         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4765         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4766         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4767 
4768         if (str2_isL) {
4769           orr(ch1, ch1, ch1, LSL, 8);
4770         }
4771         orr(ch1, ch1, ch1, LSL, 16);
4772         orr(ch1, ch1, ch1, LSL, 32);
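             // ch1 now holds the sought character replicated across the
             // register.  After the eor below, a match position becomes an
             // all-zero element, and the classic SWAR zero test
             // (x - 0x01..01) & ~x & 0x80..80 (the sub/orr/bics sequence,
             // with the 0x7f..7f mask) is nonzero iff some element is zero.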
4773       BIND(CH1_LOOP);
4774         ldr(ch2, Address(str2, cnt2_neg));
4775         eor(ch2, ch1, ch2);
4776         sub(tmp1, ch2, tmp3);
4777         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4778         bics(tmp1, tmp1, tmp2);
4779         br(NE, HAS_ZERO);
4780         adds(cnt2_neg, cnt2_neg, 8);
4781         br(LT, CH1_LOOP);
4782 
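             // The loop exits with 0 <= cnt2_neg <= 8.  Unless the last
             // window was read exactly (cnt2_neg == 8), reset cnt2_neg to 0
             // and go round once more to check the final word, which may
             // overlap bytes already examined.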
4783         cmp(cnt2_neg, (u1)8);
4784         mov(cnt2_neg, 0);
4785         br(LT, CH1_LOOP);
4786         b(NOMATCH);
4787 
4788       BIND(HAS_ZERO);
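             // tmp1 has the 0x80 marker set in the matching element(s); rev
             // makes the first (lowest-addressed) one the most significant
             // set bit, clz then yields 8 * byte_index, and LSR 3 converts
             // bits to bytes.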
4789         rev(tmp1, tmp1);
4790         clz(tmp1, tmp1);
4791         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4792         b(MATCH);
4793 
4794       BIND(DO1_SHORT);
4795         mov(result_tmp, cnt2);
4796         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4797         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4798       BIND(DO1_LOOP);
4799         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4800         cmpw(ch1, ch2);
4801         br(EQ, MATCH);
4802         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4803         br(LT, DO1_LOOP);
4804     }
4805   }
4806   BIND(NOMATCH);
4807     mov(result, -1);
4808     b(DONE);
4809   BIND(MATCH);
4810     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4811   BIND(DONE);
4812 }
4813 
4814 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4815 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4816 
4817 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4818                                          Register ch, Register result,
4819                                          Register tmp1, Register tmp2, Register tmp3)
4820 {
4821   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4822   Register cnt1_neg = cnt1;
4823   Register ch1 = rscratch1;
4824   Register result_tmp = rscratch2;
4825 
4826   cmp(cnt1, (u1)4);
4827   br(LT, DO1_SHORT);
4828 
4829   orr(ch, ch, ch, LSL, 16);
4830   orr(ch, ch, ch, LSL, 32);
4831 
4832   sub(cnt1, cnt1, 4);
4833   mov(result_tmp, cnt1);
4834   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4835   sub(cnt1_neg, zr, cnt1, LSL, 1);
4836 
4837   mov(tmp3, 0x0001000100010001);
4838 
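       // SWAR loop: ch holds the sought char in every halfword.  After the
       // eor, a match becomes a zero halfword, and the zero test
       // (x - 0x0001..) & ~x (bics with the 0x7fff.. mask) flags it.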
4839   BIND(CH1_LOOP);
4840     ldr(ch1, Address(str1, cnt1_neg));
4841     eor(ch1, ch, ch1);
4842     sub(tmp1, ch1, tmp3);
4843     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4844     bics(tmp1, tmp1, tmp2);
4845     br(NE, HAS_ZERO);
4846     adds(cnt1_neg, cnt1_neg, 8);
4847     br(LT, CH1_LOOP);
4848 
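       // The loop exits with 0 <= cnt1_neg <= 8; unless the final word was
       // read exactly, reset cnt1_neg to 0 and check the last (possibly
       // overlapping) word.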
4849     cmp(cnt1_neg, (u1)8);
4850     mov(cnt1_neg, 0);
4851     br(LT, CH1_LOOP);
4852     b(NOMATCH);
4853 
4854   BIND(HAS_ZERO);
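       // Recover the halfword index of the match with the same rev/clz
       // trick: clz of the byte-reversed marker word gives 8 * byte_index.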
4855     rev(tmp1, tmp1);
4856     clz(tmp1, tmp1);
4857     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4858     b(MATCH);
4859 
4860   BIND(DO1_SHORT);
4861     mov(result_tmp, cnt1);
4862     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4863     sub(cnt1_neg, zr, cnt1, LSL, 1);
4864   BIND(DO1_LOOP);
4865     ldrh(ch1, Address(str1, cnt1_neg));
4866     cmpw(ch, ch1);
4867     br(EQ, MATCH);
4868     adds(cnt1_neg, cnt1_neg, 2);
4869     br(LT, DO1_LOOP);
4870   BIND(NOMATCH);
4871     mov(result, -1);
4872     b(DONE);
4873   BIND(MATCH);
4874     add(result, result_tmp, cnt1_neg, ASR, 1);
4875   BIND(DONE);
4876 }
4877 
4878 // Compare strings.
4879 void MacroAssembler::string_compare(Register str1, Register str2,
4880     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4881     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4882   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4883       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4884       SHORT_LOOP_START, TAIL_CHECK;
4885 
4886   const u1 STUB_THRESHOLD = 64 + 8;
4887   bool isLL = ae == StrIntrinsicNode::LL;
4888   bool isLU = ae == StrIntrinsicNode::LU;
4889   bool isUL = ae == StrIntrinsicNode::UL;
4890 
4891   bool str1_isL = isLL || isLU;
4892   bool str2_isL = isLL || isUL;
4893 
4894   int str1_chr_shift = str1_isL ? 0 : 1;
4895   int str2_chr_shift = str2_isL ? 0 : 1;
4896   int str1_chr_size = str1_isL ? 1 : 2;
4897   int str2_chr_size = str2_isL ? 1 : 2;
4898   int minCharsInWord = isLL ? wordSize : wordSize/2;
4899 
4900   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4901   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4902                                       (chr_insn)&MacroAssembler::ldrh;
4903   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4904                                       (chr_insn)&MacroAssembler::ldrh;
4905   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4906                             (uxt_insn)&MacroAssembler::uxthw;
4907 
4908   BLOCK_COMMENT("string_compare {");
4909 
4910   // Bizarrely, the counts are passed in bytes, regardless of whether they
4911   // are L or U strings; however, the result is always in characters.
4912   if (!str1_isL) asrw(cnt1, cnt1, 1);
4913   if (!str2_isL) asrw(cnt2, cnt2, 1);
4914 
4915   // Compute the minimum of the string lengths and save the difference.
4916   subsw(result, cnt1, cnt2);
4917   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4918 
4919   // A very short string
4920   cmpw(cnt2, minCharsInWord);
4921   br(Assembler::LE, SHORT_STRING);
4922 
4923   // Compare longwords
4924   // load first parts of strings and finish initialization while loading
4925   {
4926     if (str1_isL == str2_isL) { // LL or UU
4927       ldr(tmp1, Address(str1));
4928       cmp(str1, str2);
4929       br(Assembler::EQ, DONE);
4930       ldr(tmp2, Address(str2));
4931       cmp(cnt2, STUB_THRESHOLD);
4932       br(GE, STUB);
4933       subsw(cnt2, cnt2, minCharsInWord);
4934       br(EQ, TAIL_CHECK);
4935       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4936       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4937       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4938     } else if (isLU) {
4939       ldrs(vtmp, Address(str1));
4940       cmp(str1, str2);
4941       br(Assembler::EQ, DONE);
4942       ldr(tmp2, Address(str2));
4943       cmp(cnt2, STUB_THRESHOLD);
4944       br(GE, STUB);
4945       subw(cnt2, cnt2, 4);
4946       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
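           // vtmpZ is now zero; the zip1 below interleaves zero bytes to
           // widen the four loaded Latin1 bytes into four UTF-16 chars.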
4947       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4948       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4949       zip1(vtmp, T8B, vtmp, vtmpZ);
4950       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4951       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4952       add(cnt1, cnt1, 4);
4953       fmovd(tmp1, vtmp);
4954     } else { // UL case
4955       ldr(tmp1, Address(str1));
4956       cmp(str1, str2);
4957       br(Assembler::EQ, DONE);
4958       ldrs(vtmp, Address(str2));
4959       cmp(cnt2, STUB_THRESHOLD);
4960       br(GE, STUB);
4961       subw(cnt2, cnt2, 4);
4962       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4963       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4964       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4965       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4966       zip1(vtmp, T8B, vtmp, vtmpZ);
4967       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4968       add(cnt1, cnt1, 8);
4969       fmovd(tmp2, vtmp);
4970     }
4971     adds(cnt2, cnt2, isUL ? 4 : 8);
4972     br(GE, TAIL);
4973     eor(rscratch2, tmp1, tmp2);
4974     cbnz(rscratch2, DIFFERENCE);
4975     // main loop
4976     bind(NEXT_WORD);
4977     if (str1_isL == str2_isL) {
4978       ldr(tmp1, Address(str1, cnt2));
4979       ldr(tmp2, Address(str2, cnt2));
4980       adds(cnt2, cnt2, 8);
4981     } else if (isLU) {
4982       ldrs(vtmp, Address(str1, cnt1));
4983       ldr(tmp2, Address(str2, cnt2));
4984       add(cnt1, cnt1, 4);
4985       zip1(vtmp, T8B, vtmp, vtmpZ);
4986       fmovd(tmp1, vtmp);
4987       adds(cnt2, cnt2, 8);
4988     } else { // UL
4989       ldrs(vtmp, Address(str2, cnt2));
4990       ldr(tmp1, Address(str1, cnt1));
4991       zip1(vtmp, T8B, vtmp, vtmpZ);
4992       add(cnt1, cnt1, 8);
4993       fmovd(tmp2, vtmp);
4994       adds(cnt2, cnt2, 4);
4995     }
4996     br(GE, TAIL);
4997 
4998     eor(rscratch2, tmp1, tmp2);
4999     cbz(rscratch2, NEXT_WORD);
5000     b(DIFFERENCE);
5001     bind(TAIL);
5002     eor(rscratch2, tmp1, tmp2);
5003     cbnz(rscratch2, DIFFERENCE);
5004     // Last longword.  In the case where length == 4 we compare the
5005     // same longword twice, but that's still faster than another
5006     // conditional branch.
5007     if (str1_isL == str2_isL) {
5008       ldr(tmp1, Address(str1));
5009       ldr(tmp2, Address(str2));
5010     } else if (isLU) {
5011       ldrs(vtmp, Address(str1));
5012       ldr(tmp2, Address(str2));
5013       zip1(vtmp, T8B, vtmp, vtmpZ);
5014       fmovd(tmp1, vtmp);
5015     } else { // UL
5016       ldrs(vtmp, Address(str2));
5017       ldr(tmp1, Address(str1));
5018       zip1(vtmp, T8B, vtmp, vtmpZ);
5019       fmovd(tmp2, vtmp);
5020     }
5021     bind(TAIL_CHECK);
5022     eor(rscratch2, tmp1, tmp2);
5023     cbz(rscratch2, DONE);
5024 
5025     // Find the first different characters in the longwords and
5026     // compute their difference.
5027     bind(DIFFERENCE);
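         // rscratch2 = tmp1 ^ tmp2.  rev + clz give the bit offset of the
         // first differing byte; andr rounds it down to a character
         // boundary so lsrv can bring both characters to the low bits for
         // widening and subtraction.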
5028     rev(rscratch2, rscratch2);
5029     clz(rscratch2, rscratch2);
5030     andr(rscratch2, rscratch2, isLL ? -8 : -16);
5031     lsrv(tmp1, tmp1, rscratch2);
5032     (this->*ext_chr)(tmp1, tmp1);
5033     lsrv(tmp2, tmp2, rscratch2);
5034     (this->*ext_chr)(tmp2, tmp2);
5035     subw(result, tmp1, tmp2);
5036     b(DONE);
5037   }
5038 
5039   bind(STUB);
5040     RuntimeAddress stub = NULL;
5041     switch(ae) {
5042       case StrIntrinsicNode::LL:
5043         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
5044         break;
5045       case StrIntrinsicNode::UU:
5046         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
5047         break;
5048       case StrIntrinsicNode::LU:
5049         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
5050         break;
5051       case StrIntrinsicNode::UL:
5052         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
5053         break;
5054       default:
5055         ShouldNotReachHere();
5056      }
5057     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
5058     trampoline_call(stub);
5059     b(DONE);
5060 
5061   bind(SHORT_STRING);
5062   // Is the minimum length zero?
5063   cbz(cnt2, DONE);
5064   // arrange the code so most branches issue while loads are in flight:
5065   // load the next characters while comparing the previous ones
5066   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5067   subs(cnt2, cnt2, 1);
5068   br(EQ, SHORT_LAST_INIT);
5069   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5070   b(SHORT_LOOP_START);
5071   bind(SHORT_LOOP);
5072   subs(cnt2, cnt2, 1);
5073   br(EQ, SHORT_LAST);
5074   bind(SHORT_LOOP_START);
5075   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
5076   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
5077   cmp(tmp1, cnt1);
5078   br(NE, SHORT_LOOP_TAIL);
5079   subs(cnt2, cnt2, 1);
5080   br(EQ, SHORT_LAST2);
5081   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5082   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5083   cmp(tmp2, rscratch1);
5084   br(EQ, SHORT_LOOP);
5085   sub(result, tmp2, rscratch1);
5086   b(DONE);
5087   bind(SHORT_LOOP_TAIL);
5088   sub(result, tmp1, cnt1);
5089   b(DONE);
5090   bind(SHORT_LAST2);
5091   cmp(tmp2, rscratch1);
5092   br(EQ, DONE);
5093   sub(result, tmp2, rscratch1);
5094 
5095   b(DONE);
5096   bind(SHORT_LAST_INIT);
5097   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5098   bind(SHORT_LAST);
5099   cmp(tmp1, cnt1);
5100   br(EQ, DONE);
5101   sub(result, tmp1, cnt1);
5102 
5103   bind(DONE);
5104 
5105   BLOCK_COMMENT("} string_compare");
5106 }
5107 #endif // COMPILER2
5108 
5109 // This method checks whether the provided byte array contains a byte with the highest bit set.
5110 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
5111     // The simple and most common case, a small aligned array that is not
5112     // at the end of a memory page, is handled here. All other cases go to the stubs.
5113     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
5114     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5115     assert_different_registers(ary1, len, result);
5116 
5117     cmpw(len, 0);
5118     br(LE, SET_RESULT);
5119     cmpw(len, 4 * wordSize);
5120     br(GE, STUB_LONG); // if size >= 32, go to the long stub
5121 
5122     int shift = 64 - exact_log2(os::vm_page_size());
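         // Shift the address left so its offset within the page occupies
         // the top bits; adding the equally shifted 32-byte read size then
         // sets the carry iff the read would reach or cross the page end.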
5123     lsl(rscratch1, ary1, shift);
5124     mov(rscratch2, (size_t)(4 * wordSize) << shift);
5125     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
5126     br(CS, STUB); // if the read would reach the end of the page, go to the stub
5127     subs(len, len, wordSize);
5128     br(LT, END);
5129 
5130   BIND(LOOP);
5131     ldr(rscratch1, Address(post(ary1, wordSize)));
5132     tst(rscratch1, UPPER_BIT_MASK);
5133     br(NE, SET_RESULT);
5134     subs(len, len, wordSize);
5135     br(GE, LOOP);
5136     cmpw(len, -wordSize);
5137     br(EQ, SET_RESULT);
5138 
5139   BIND(END);
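         // 1..7 bytes remain.  Reading a full word here is safe because of
         // the page check above; -len * 8 is the number of unwanted high
         // bits, so shift them out before testing the sign bits.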
5140     ldr(result, Address(ary1));
5141     sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5142     lslv(result, result, len);
5143     tst(result, UPPER_BIT_MASK);
5144     b(SET_RESULT);
5145 
5146   BIND(STUB);
5147     RuntimeAddress has_neg =  RuntimeAddress(StubRoutines::aarch64::has_negatives());
5148     assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5149     trampoline_call(has_neg);
5150     b(DONE);
5151 
5152   BIND(STUB_LONG);
5153     RuntimeAddress has_neg_long =  RuntimeAddress(
5154             StubRoutines::aarch64::has_negatives_long());
5155     assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
5156     trampoline_call(has_neg_long);
5157     b(DONE);
5158 
5159   BIND(SET_RESULT);
5160     cset(result, NE); // set true or false
5161 
5162   BIND(DONE);
5163 }
5164 
5165 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5166                                    Register tmp4, Register tmp5, Register result,
5167                                    Register cnt1, int elem_size) {
5168   Label DONE, SAME;
5169   Register tmp1 = rscratch1;
5170   Register tmp2 = rscratch2;
5171   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5172   int elem_per_word = wordSize/elem_size;
5173   int log_elem_size = exact_log2(elem_size);
5174   int length_offset = arrayOopDesc::length_offset_in_bytes();
5175   int base_offset
5176     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5177   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5178 
5179   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5180   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5181 
5182 #ifndef PRODUCT
5183   {
5184     const char kind = (elem_size == 2) ? 'U' : 'L';
5185     char comment[64];
5186     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5187     BLOCK_COMMENT(comment);
5188   }
5189 #endif
5190 
5191   // if (a1 == a2)
5192   //     return true;
5193   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5194   br(EQ, SAME);
5195 
5196   if (UseSimpleArrayEquals) {
5197     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5198     // if (a1 == null || a2 == null)
5199     //     return false;
5200     // (a1 & a2) == 0 means that at least one pointer is null (or, very
5201     // rarely, that two non-null pointers share no set bits),
5202     // so we can save one branch in most cases
5203     tst(a1, a2);
5204     mov(result, false);
5205     br(EQ, A_MIGHT_BE_NULL);
5206     // if (a1.length != a2.length)
5207     //      return false;
5208     bind(A_IS_NOT_NULL);
5209     ldrw(cnt1, Address(a1, length_offset));
5210     ldrw(cnt2, Address(a2, length_offset));
5211     eorw(tmp5, cnt1, cnt2);
5212     cbnzw(tmp5, DONE);
5213     lea(a1, Address(a1, base_offset));
5214     lea(a2, Address(a2, base_offset));
5215     // Check for short strings, i.e. smaller than wordSize.
5216     subs(cnt1, cnt1, elem_per_word);
5217     br(Assembler::LT, SHORT);
5218     // Main 8 byte comparison loop.
5219     bind(NEXT_WORD); {
5220       ldr(tmp1, Address(post(a1, wordSize)));
5221       ldr(tmp2, Address(post(a2, wordSize)));
5222       subs(cnt1, cnt1, elem_per_word);
5223       eor(tmp5, tmp1, tmp2);
5224       cbnz(tmp5, DONE);
5225     } br(GT, NEXT_WORD);
5226     // Last longword.  In the case where length == 4 we compare the
5227     // same longword twice, but that's still faster than another
5228     // conditional branch.
5229     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5230     // length == 4.
5231     if (log_elem_size > 0)
5232       lsl(cnt1, cnt1, log_elem_size);
5233     ldr(tmp3, Address(a1, cnt1));
5234     ldr(tmp4, Address(a2, cnt1));
5235     eor(tmp5, tmp3, tmp4);
5236     cbnz(tmp5, DONE);
5237     b(SAME);
5238     bind(A_MIGHT_BE_NULL);
5239     // in case both a1 and a2 are not-null, proceed with loads
5240     cbz(a1, DONE);
5241     cbz(a2, DONE);
5242     b(A_IS_NOT_NULL);
5243     bind(SHORT);
5244 
5245     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5246     {
5247       ldrw(tmp1, Address(post(a1, 4)));
5248       ldrw(tmp2, Address(post(a2, 4)));
5249       eorw(tmp5, tmp1, tmp2);
5250       cbnzw(tmp5, DONE);
5251     }
5252     bind(TAIL03);
5253     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5254     {
5255       ldrh(tmp3, Address(post(a1, 2)));
5256       ldrh(tmp4, Address(post(a2, 2)));
5257       eorw(tmp5, tmp3, tmp4);
5258       cbnzw(tmp5, DONE);
5259     }
5260     bind(TAIL01);
5261     if (elem_size == 1) { // Only needed when comparing byte arrays.
5262       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5263       {
5264         ldrb(tmp1, a1);
5265         ldrb(tmp2, a2);
5266         eorw(tmp5, tmp1, tmp2);
5267         cbnzw(tmp5, DONE);
5268       }
5269     }
5270   } else {
5271     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
5272         CSET_EQ, LAST_CHECK;
5273     mov(result, false);
5274     cbz(a1, DONE);
5275     ldrw(cnt1, Address(a1, length_offset));
5276     cbz(a2, DONE);
5277     ldrw(cnt2, Address(a2, length_offset));
5278     // on most CPUs a2 is (surprisingly) still "locked" by the ldrw above,
5279     // so it's faster to take another branch before comparing a1 and a2
5280     cmp(cnt1, (u1)elem_per_word);
5281     br(LE, SHORT); // short or same
5282     ldr(tmp3, Address(pre(a1, base_offset)));
5283     subs(zr, cnt1, stubBytesThreshold);
5284     br(GE, STUB);
5285     ldr(tmp4, Address(pre(a2, base_offset)));
5286     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5287     cmp(cnt2, cnt1);
5288     br(NE, DONE);
5289 
5290     // Main 16 byte comparison loop with 2 exits
5291     bind(NEXT_DWORD); {
5292       ldr(tmp1, Address(pre(a1, wordSize)));
5293       ldr(tmp2, Address(pre(a2, wordSize)));
5294       subs(cnt1, cnt1, 2 * elem_per_word);
5295       br(LE, TAIL);
5296       eor(tmp4, tmp3, tmp4);
5297       cbnz(tmp4, DONE);
5298       ldr(tmp3, Address(pre(a1, wordSize)));
5299       ldr(tmp4, Address(pre(a2, wordSize)));
5300       cmp(cnt1, (u1)elem_per_word);
5301       br(LE, TAIL2);
5302       cmp(tmp1, tmp2);
5303     } br(EQ, NEXT_DWORD);
5304     b(DONE);
5305 
5306     bind(TAIL);
5307     eor(tmp4, tmp3, tmp4);
5308     eor(tmp2, tmp1, tmp2);
5309     lslv(tmp2, tmp2, tmp5);
5310     orr(tmp5, tmp4, tmp2);
5311     cmp(tmp5, zr);
5312     b(CSET_EQ);
5313 
5314     bind(TAIL2);
5315     eor(tmp2, tmp1, tmp2);
5316     cbnz(tmp2, DONE);
5317     b(LAST_CHECK);
5318 
5319     bind(STUB);
5320     ldr(tmp4, Address(pre(a2, base_offset)));
5321     cmp(cnt2, cnt1);
5322     br(NE, DONE);
5323     if (elem_size == 2) { // convert to byte counter
5324       lsl(cnt1, cnt1, 1);
5325     }
5326     eor(tmp5, tmp3, tmp4);
5327     cbnz(tmp5, DONE);
5328     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5329     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5330     trampoline_call(stub);
5331     b(DONE);
5332 
5333     bind(EARLY_OUT);
5334     // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2),
5335     // so if a2 == null we return false (0), else true; returning a2 covers both
5336     mov(result, a2);
5337     b(DONE);
5338     bind(SHORT);
5339     cmp(cnt2, cnt1);
5340     br(NE, DONE);
5341     cbz(cnt1, SAME);
5342     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
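         // tmp5 = -(cnt1 * bits per element): used below as a left-shift
         // amount (mod 64) to discard the xor bits that lie beyond the
         // array length before the final equality test.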
5343     ldr(tmp3, Address(a1, base_offset));
5344     ldr(tmp4, Address(a2, base_offset));
5345     bind(LAST_CHECK);
5346     eor(tmp4, tmp3, tmp4);
5347     lslv(tmp5, tmp4, tmp5);
5348     cmp(tmp5, zr);
5349     bind(CSET_EQ);
5350     cset(result, EQ);
5351     b(DONE);
5352   }
5353 
5354   bind(SAME);
5355   mov(result, true);
5356   // That's it.
5357   bind(DONE);
5358 
5359   BLOCK_COMMENT("} array_equals");
5360 }
5361 
5362 // Compare Strings
5363 
5364 // For Strings we're passed the address of the first characters in a1
5365 // and a2 and the length in cnt1.
5366 // elem_size is the element size in bytes: either 1 or 2.
5367 // There are two implementations.  For arrays >= 8 bytes, all
5368 // comparisons (including the final one, which may overlap) are
5369 // performed 8 bytes at a time.  For strings < 8 bytes, we compare a
5370 // word, then a halfword, and then a byte.
5371 
5372 void MacroAssembler::string_equals(Register a1, Register a2,
5373                                    Register result, Register cnt1, int elem_size)
5374 {
5375   Label SAME, DONE, SHORT, NEXT_WORD;
5376   Register tmp1 = rscratch1;
5377   Register tmp2 = rscratch2;
5378   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5379 
5380   assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
5381   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5382 
5383 #ifndef PRODUCT
5384   {
5385     const char kind = (elem_size == 2) ? 'U' : 'L';
5386     char comment[64];
5387     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5388     BLOCK_COMMENT(comment);
5389   }
5390 #endif
5391 
5392   mov(result, false);
5393 
5394   // Check for short strings, i.e. smaller than wordSize.
5395   subs(cnt1, cnt1, wordSize);
5396   br(Assembler::LT, SHORT);
5397   // Main 8 byte comparison loop.
5398   bind(NEXT_WORD); {
5399     ldr(tmp1, Address(post(a1, wordSize)));
5400     ldr(tmp2, Address(post(a2, wordSize)));
5401     subs(cnt1, cnt1, wordSize);
5402     eor(tmp1, tmp1, tmp2);
5403     cbnz(tmp1, DONE);
5404   } br(GT, NEXT_WORD);
5405   // Last longword.  In the case where length == 4 we compare the
5406   // same longword twice, but that's still faster than another
5407   // conditional branch.
5408   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5409   // length == 4.
5410   ldr(tmp1, Address(a1, cnt1));
5411   ldr(tmp2, Address(a2, cnt1));
5412   eor(tmp2, tmp1, tmp2);
5413   cbnz(tmp2, DONE);
5414   b(SAME);
5415 
5416   bind(SHORT);
5417   Label TAIL03, TAIL01;
5418 
5419   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5420   {
5421     ldrw(tmp1, Address(post(a1, 4)));
5422     ldrw(tmp2, Address(post(a2, 4)));
5423     eorw(tmp1, tmp1, tmp2);
5424     cbnzw(tmp1, DONE);
5425   }
5426   bind(TAIL03);
5427   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5428   {
5429     ldrh(tmp1, Address(post(a1, 2)));
5430     ldrh(tmp2, Address(post(a2, 2)));
5431     eorw(tmp1, tmp1, tmp2);
5432     cbnzw(tmp1, DONE);
5433   }
5434   bind(TAIL01);
5435   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5436     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5437     {
5438       ldrb(tmp1, a1);
5439       ldrb(tmp2, a2);
5440       eorw(tmp1, tmp1, tmp2);
5441       cbnzw(tmp1, DONE);
5442     }
5443   }
5444   // Arrays are equal.
5445   bind(SAME);
5446   mov(result, true);
5447 
5448   // That's it.
5449   bind(DONE);
5450   BLOCK_COMMENT("} string_equals");
5451 }
5452 
5453 
5454 // The size of the blocks erased by the zero_blocks stub.  We must
5455 // handle anything smaller than this ourselves in zero_words().
5456 const int MacroAssembler::zero_words_block_size = 8;
5457 
5458 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5459 // possible, handling small word counts locally and delegating
5460 // anything larger to the zero_blocks stub.  It is expanded many times
5461 // in compiled code, so it is important to keep it short.
5462 
5463 // ptr:   Address of a buffer to be zeroed.
5464 // cnt:   Count in HeapWords.
5465 //
5466 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5467 void MacroAssembler::zero_words(Register ptr, Register cnt)
5468 {
5469   assert(is_power_of_2(zero_words_block_size), "adjust this");
5470   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5471 
5472   BLOCK_COMMENT("zero_words {");
5473   cmp(cnt, (u1)zero_words_block_size);
5474   Label around;
5475   br(LO, around);
5476   {
5477     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5478     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5479     if (StubRoutines::aarch64::complete()) {
5480       trampoline_call(zero_blocks);
5481     } else {
5482       bl(zero_blocks);
5483     }
5484   }
5485   bind(around);
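       // Zero the remaining 0..7 words by binary decomposition of cnt:
       // bit 2 set means 4 words (two stps), bit 1 means 2 words (one
       // stp), and bit 0 means a final single str.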
5486   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5487     Label l;
5488     tbz(cnt, exact_log2(i), l);
5489     for (int j = 0; j < i; j += 2) {
5490       stp(zr, zr, post(ptr, 16));
5491     }
5492     bind(l);
5493   }
5494   {
5495     Label l;
5496     tbz(cnt, 0, l);
5497     str(zr, Address(ptr));
5498     bind(l);
5499   }
5500   BLOCK_COMMENT("} zero_words");
5501 }
5502 
5503 // base:         Address of a buffer to be zeroed, 8-byte aligned.
5504 // cnt:          Immediate count in HeapWords.
5505 #define SmallArraySize (18 * BytesPerLong)
5506 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
5507 {
5508   BLOCK_COMMENT("zero_words {");
5509   int i = cnt & 1;  // store any odd word to start
5510   if (i) str(zr, Address(base));
5511 
5512   if (cnt <= SmallArraySize / BytesPerLong) {
5513     for (; i < (int)cnt; i += 2)
5514       stp(zr, zr, Address(base, i * wordSize));
5515   } else {
5516     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
5517     int remainder = cnt % (2 * unroll);
5518     for (; i < remainder; i += 2)
5519       stp(zr, zr, Address(base, i * wordSize));
5520 
5521     Label loop;
5522     Register cnt_reg = rscratch1;
5523     Register loop_base = rscratch2;
5524     cnt = cnt - remainder;
5525     mov(cnt_reg, cnt);
5526     // adjust base and prebias by -2 * wordSize so we can pre-increment
5527     add(loop_base, base, (remainder - 2) * wordSize);
5528     bind(loop);
5529     sub(cnt_reg, cnt_reg, 2 * unroll);
5530     for (i = 1; i < unroll; i++)
5531       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
5532     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
5533     cbnz(cnt_reg, loop);
5534   }
5535   BLOCK_COMMENT("} zero_words");
5536 }
5537 
5538 // Zero blocks of memory by using DC ZVA.
5539 //
5540 // First aligns the base address sufficiently for DC ZVA, then uses
5541 // DC ZVA repeatedly for every full block.  cnt is the size to be
5542 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5543 // in cnt.
5544 //
5545 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5546 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
5547 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5548   Register tmp = rscratch1;
5549   Register tmp2 = rscratch2;
5550   int zva_length = VM_Version::zva_length();
5551   Label initial_table_end, loop_zva;
5552   Label fini;
5553 
5554   // Base must be 16-byte aligned. If not, just return and let the caller handle it
5555   tst(base, 0x0f);
5556   br(Assembler::NE, fini);
5557   // Align base with ZVA length.
5558   neg(tmp, base);
5559   andr(tmp, tmp, zva_length - 1);
5560 
5561   // tmp: the number of bytes to be filled to align the base with ZVA length.
5562   add(base, base, tmp);
5563   sub(cnt, cnt, tmp, Assembler::ASR, 3);
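       // Computed branch into the stp table below: the tmp alignment bytes
       // need tmp / 16 stps, and each stp is 4 bytes of code, hence the
       // LSR 2 step back from initial_table_end.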
5564   adr(tmp2, initial_table_end);
5565   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5566   br(tmp2);
5567 
5568   for (int i = -zva_length + 16; i < 0; i += 16)
5569     stp(zr, zr, Address(base, i));
5570   bind(initial_table_end);
5571 
5572   sub(cnt, cnt, zva_length >> 3);
5573   bind(loop_zva);
5574   dc(Assembler::ZVA, base);
5575   subs(cnt, cnt, zva_length >> 3);
5576   add(base, base, zva_length);
5577   br(Assembler::GE, loop_zva);
5578   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5579   bind(fini);
5580 }
5581 
5582 // base:   Address of a buffer to be filled, 8-byte aligned.
5583 // cnt:    Count in 8-byte units.
5584 // value:  Value to fill with.
5585 // base will point to the end of the buffer after filling.
5586 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5587 {
5588 //  Algorithm:
5589 //
5590 //    scratch1 = cnt & 7;
5591 //    cnt -= scratch1;
5592 //    p += scratch1;
5593 //    switch (scratch1) {
5594 //      do {
5595 //        cnt -= 8;
5596 //          p[-8] = v;
5597 //        case 7:
5598 //          p[-7] = v;
5599 //        case 6:
5600 //          p[-6] = v;
5601 //          // ...
5602 //        case 1:
5603 //          p[-1] = v;
5604 //        case 0:
5605 //          p += 8;
5606 //      } while (cnt);
5607 //    }
5608 
5609   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5610 
5611   Label fini, skip, entry, loop;
5612   const int unroll = 8; // Number of stp instructions we'll unroll
5613 
5614   cbz(cnt, fini);
5615   tbz(base, 3, skip);
5616   str(value, Address(post(base, 8)));
5617   sub(cnt, cnt, 1);
5618   bind(skip);
5619 
5620   andr(rscratch1, cnt, (unroll-1) * 2);
5621   sub(cnt, cnt, rscratch1);
5622   add(base, base, rscratch1, Assembler::LSL, 3);
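       // Computed branch (Duff's device): rscratch1 is the even part of
       // cnt % 16; those words take rscratch1 / 2 stps, each 4 bytes of
       // code, hence the LSL 1 step back from `entry`.  base was already
       // advanced so the negative-offset stps fill the leading words.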
5623   adr(rscratch2, entry);
5624   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5625   br(rscratch2);
5626 
5627   bind(loop);
5628   add(base, base, unroll * 16);
5629   for (int i = -unroll; i < 0; i++)
5630     stp(value, value, Address(base, i * 16));
5631   bind(entry);
5632   subs(cnt, cnt, unroll * 2);
5633   br(Assembler::GE, loop);
5634 
5635   tbz(cnt, 0, fini);
5636   str(value, Address(post(base, 8)));
5637   bind(fini);
5638 }
5639 
5640 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5641 // java/lang/StringUTF16.compress.
5642 void MacroAssembler::encode_iso_array(Register src, Register dst,
5643                       Register len, Register result,
5644                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5645                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5646 {
5647     Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
5648         NEXT_32_START, NEXT_32_PRFM_START;
5649     Register tmp1 = rscratch1, tmp2 = rscratch2;
5650 
5651       mov(result, len); // Save initial len
5652 
5653 #ifndef BUILTIN_SIM
5654       cmp(len, (u1)8); // handle shortest strings first
5655       br(LT, LOOP_1);
5656       cmp(len, (u1)32);
5657       br(LT, NEXT_8);
5658       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
5659       // to convert chars to bytes
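           // uzp1 keeps the low byte of every char (the Latin1 value);
           // uzp2 gathers the high bytes, which must all be zero for the
           // encode to succeed, so orr-ing them together and testing the
           // result catches any non-Latin1 char in the block.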
5660       if (SoftwarePrefetchHintDistance >= 0) {
5661         ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5662         subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5663         br(LE, NEXT_32_START);
5664         b(NEXT_32_PRFM_START);
5665         BIND(NEXT_32_PRFM);
5666           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5667         BIND(NEXT_32_PRFM_START);
5668           prfm(Address(src, SoftwarePrefetchHintDistance));
5669           orr(v4, T16B, Vtmp1, Vtmp2);
5670           orr(v5, T16B, Vtmp3, Vtmp4);
5671           uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
5672           uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
5673           uzp2(v5, T16B, v4, v5); // high bytes
5674           umov(tmp2, v5, D, 1);
5675           fmovd(tmp1, v5);
5676           orr(tmp1, tmp1, tmp2);
5677           cbnz(tmp1, LOOP_8);
5678           stpq(Vtmp1, Vtmp3, dst);
5679           sub(len, len, 32);
5680           add(dst, dst, 32);
5681           add(src, src, 64);
5682           subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5683           br(GE, NEXT_32_PRFM);
5684           cmp(len, (u1)32);
5685           br(LT, LOOP_8);
5686         BIND(NEXT_32);
5687           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5688         BIND(NEXT_32_START);
5689       } else {
5690         BIND(NEXT_32);
5691           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5692       }
5693       prfm(Address(src, SoftwarePrefetchHintDistance));
5694       uzp1(v4, T16B, Vtmp1, Vtmp2);
5695       uzp1(v5, T16B, Vtmp3, Vtmp4);
5696       orr(Vtmp1, T16B, Vtmp1, Vtmp2);
5697       orr(Vtmp3, T16B, Vtmp3, Vtmp4);
5698       uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
5699       umov(tmp2, Vtmp1, D, 1);
5700       fmovd(tmp1, Vtmp1);
5701       orr(tmp1, tmp1, tmp2);
5702       cbnz(tmp1, LOOP_8);
5703       stpq(v4, v5, dst);
5704       sub(len, len, 32);
5705       add(dst, dst, 32);
5706       add(src, src, 64);
5707       cmp(len, (u1)32);
5708       br(GE, NEXT_32);
5709       cbz(len, DONE);
5710 
5711     BIND(LOOP_8);
5712       cmp(len, (u1)8);
5713       br(LT, LOOP_1);
5714     BIND(NEXT_8);
5715       ld1(Vtmp1, T8H, src);
5716       uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
5717       uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
5718       fmovd(tmp1, Vtmp3);
5719       cbnz(tmp1, NEXT_1);
5720       strd(Vtmp2, dst);
5721 
5722       sub(len, len, 8);
5723       add(dst, dst, 8);
5724       add(src, src, 16);
5725       cmp(len, (u1)8);
5726       br(GE, NEXT_8);
5727 
5728     BIND(LOOP_1);
5729 #endif
5730     cbz(len, DONE);
5731     BIND(NEXT_1);
5732       ldrh(tmp1, Address(post(src, 2)));
5733       tst(tmp1, 0xff00);
5734       br(NE, SET_RESULT);
5735       strb(tmp1, Address(post(dst, 1)));
5736       subs(len, len, 1);
5737       br(GT, NEXT_1);
5738 
5739     BIND(SET_RESULT);
5740       sub(result, result, len); // Return the index where we stopped;
5741                                 // len == 0 here iff we processed all
5742                                 // the characters
5743     BIND(DONE);
5744 }
5745 
5746 
5747 // Inflate byte[] array to char[].
5748 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5749                                         FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
5750                                         Register tmp4) {
5751   Label big, done, after_init, to_stub;
5752 
5753   assert_different_registers(src, dst, len, tmp4, rscratch1);
5754 
5755   fmovd(vtmp1, zr);
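       // vtmp1 stays zero throughout: zip1 with it interleaves a zero byte
       // after each data byte, widening Latin1 bytes to UTF-16 chars.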
5756   lsrw(tmp4, len, 3);
5757   bind(after_init);
5758   cbnzw(tmp4, big);
5759   // Short string: less than 8 bytes.
5760   {
5761     Label loop, tiny;
5762 
5763     cmpw(len, 4);
5764     br(LT, tiny);
5765     // Use SIMD to do 4 bytes.
5766     ldrs(vtmp2, post(src, 4));
5767     zip1(vtmp3, T8B, vtmp2, vtmp1);
5768     subw(len, len, 4);
5769     strd(vtmp3, post(dst, 8));
5770 
5771     cbzw(len, done);
5772 
5773     // Do the remaining bytes by steam.
5774     bind(loop);
5775     ldrb(tmp4, post(src, 1));
5776     strh(tmp4, post(dst, 2));
5777     subw(len, len, 1);
5778 
5779     bind(tiny);
5780     cbnz(len, loop);
5781 
5782     b(done);
5783   }
5784 
5785   if (SoftwarePrefetchHintDistance >= 0) {
5786     bind(to_stub);
5787       RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5788       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5789       trampoline_call(stub);
5790       b(after_init);
5791   }
5792 
5793   // Unpack the bytes 8 at a time.
5794   bind(big);
5795   {
5796     Label loop, around, loop_last, loop_start;
5797 
5798     if (SoftwarePrefetchHintDistance >= 0) {
5799       const int large_loop_threshold = (64 + 16)/8;
5800       ldrd(vtmp2, post(src, 8));
5801       andw(len, len, 7);
5802       cmp(tmp4, (u1)large_loop_threshold);
5803       br(GE, to_stub);
5804       b(loop_start);
5805 
5806       bind(loop);
5807       ldrd(vtmp2, post(src, 8));
5808       bind(loop_start);
5809       subs(tmp4, tmp4, 1);
5810       br(EQ, loop_last);
5811       zip1(vtmp2, T16B, vtmp2, vtmp1);
5812       ldrd(vtmp3, post(src, 8));
5813       st1(vtmp2, T8H, post(dst, 16));
5814       subs(tmp4, tmp4, 1);
5815       zip1(vtmp3, T16B, vtmp3, vtmp1);
5816       st1(vtmp3, T8H, post(dst, 16));
5817       br(NE, loop);
5818       b(around);
5819       bind(loop_last);
5820       zip1(vtmp2, T16B, vtmp2, vtmp1);
5821       st1(vtmp2, T8H, post(dst, 16));
5822       bind(around);
5823       cbz(len, done);
5824     } else {
5825       andw(len, len, 7);
5826       bind(loop);
5827       ldrd(vtmp2, post(src, 8));
5828       sub(tmp4, tmp4, 1);
5829       zip1(vtmp3, T16B, vtmp2, vtmp1);
5830       st1(vtmp3, T8H, post(dst, 16));
5831       cbnz(tmp4, loop);
5832     }
5833   }
5834 
5835   // Do the tail of up to 8 bytes.
5836   add(src, src, len);
5837   ldrd(vtmp3, Address(src, -8));
5838   add(dst, dst, len, ext::uxtw, 1);
5839   zip1(vtmp3, T16B, vtmp3, vtmp1);
5840   strq(vtmp3, Address(dst, -16));
5841 
5842   bind(done);
5843 }
5844 
5845 // Compress char[] array to byte[].
5846 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5847                                          FloatRegister tmp1Reg, FloatRegister tmp2Reg,
5848                                          FloatRegister tmp3Reg, FloatRegister tmp4Reg,
5849                                          Register result) {
5850   encode_iso_array(src, dst, len, result,
5851                    tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
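       // encode_iso_array leaves len == 0 iff every char was compressed;
       // keep the length as the result in that case, else return 0.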
5852   cmp(len, zr);
5853   csel(result, result, zr, EQ);
5854 }
5855 
5856 // get_thread() can be called anywhere inside generated code so we
5857 // need to save whatever non-callee-save context might get clobbered
5858 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5859 // the call setup code.
5860 //
5861 // aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5862 //
5863 void MacroAssembler::get_thread(Register dst) {
5864   RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
5865   push(saved_regs, sp);
5866 
5867   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5868   blrt(lr, 1, 0, 1);
5869   if (dst != c_rarg0) {
5870     mov(dst, c_rarg0);
5871   }
5872 
5873   pop(saved_regs, sp);
5874 }