/*
 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"
#ifdef COMPILER1
#include "c1/c1_LIRAssembler.hpp"
#endif
#ifdef COMPILER2
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
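// Branch-immediate offsets are encoded in units of 4-byte instruction
// words (hence the ">> 2" below), while ADRP page offsets are in units
// of 4KB pages (">> 12").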
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target - branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      uint64_t dest = (uint64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                   Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    uint64_t dest = (uint64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}
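
// Worked example: patching an unconditional branch with branch ==
// 0x4000 and target == 0x5000 computes offset == (0x5000 - 0x4000) >> 2
// == 0x400 instruction words, which is sign-patched into bits 25:0 of
// the B instruction; the 26-bit signed word offset gives B its
// +/-128MB reach.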

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}
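
// In both cases above the immediate is distributed 16 bits at a time
// across the movz/movk sequence: a narrow oop puts bits 31:16 in the
// first instruction and bits 15:0 in the following movk, while a wide
// oop spreads bits 47:0 across movz/movk/movk.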

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

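// target_addr_for_insn is the inverse of pd_patch_instruction_size:
// given a (possibly multi-instruction) patched sequence, it recovers
// the target address.  pd_patch_instruction_size relies on this for
// its move-wide-constant sanity check above.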
address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
               Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    uint32_t *insns = (uint32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(uint64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (uint64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (uint64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp & sp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {
  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  if (last_java_pc != NULL) {
    adr(scratch, last_java_pc);
  } else {
    // FIXME: This is almost never correct.  We should delete all
    // cases of set_last_Java_frame with last_java_pc=NULL and use the
    // correct return address instead.
    adr(scratch, pc());
  }

  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}
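
// (Usage note, an assumption based on JEP 270: this check is emitted in
// the prologue of methods annotated with @ReservedStackAccess, so that
// the reserved zone is re-enabled and a delayed StackOverflowError is
// raised on behalf of our caller before the stack is exhausted.)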

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}
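
// For reference, the 64-bit mark word of a biased object is laid out as
//   [JavaThread*:54 | epoch:2 | unused:1 | age:4 | biased_lock:1 | lock:2]
// and biased_lock_pattern is 0b101 in the low three bits, which is what
// the andr/cmp pair at the top of biased_locking_enter tests for.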

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result, "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.
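// (far_branches() is true when ReservedCodeCacheSize exceeds the
// +/-128MB reach of a direct BL, in which case the call may have to be
// routed through a trampoline in the stub section.)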

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)
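//
// Concretely, the stub emitted below is:
//   ldr  rscratch1, <dest>   ; pc-relative literal load of the address
//   br   rscratch1
//   <64-bit destination address>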

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                   + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}
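
// Example: if x holds 0x100 its least-significant byte is zero, so
// tst(x, 0xff) sets Z and c2bool produces 0; any nonzero low byte
// produces 1.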

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


void MacroAssembler::notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler::notify(type);
    // reset_last_Java_frame(true);
  } else {
    Assembler::notify(type);
  }
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel) break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}
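
// repne_scan is used by check_klass_subtype_slow_path below to search
// the secondary-supers array for an occurrence of super_klass.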

// scans count 4 byte words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                 Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}
1216 
1217 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1218                                                    Register super_klass,
1219                                                    Register temp_reg,
1220                                                    Register temp2_reg,
1221                                                    Label* L_success,
1222                                                    Label* L_failure,
1223                                                    bool set_cond_codes) {
1224   assert_different_registers(sub_klass, super_klass, temp_reg);
1225   if (temp2_reg != noreg)
1226     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1227 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1228 
1229   Label L_fallthrough;
1230   int label_nulls = 0;
1231   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1232   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1233   assert(label_nulls <= 1, "at most one NULL in the batch");
1234 
1235   // a couple of useful fields in sub_klass:
1236   int ss_offset = in_bytes(Klass::secondary_supers_offset());
1237   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1238   Address secondary_supers_addr(sub_klass, ss_offset);
1239   Address super_cache_addr(     sub_klass, sc_offset);
1240 
1241   BLOCK_COMMENT("check_klass_subtype_slow_path");
1242 
1243   // Do a linear scan of the secondary super-klass chain.
1244   // This code is rarely used, so simplicity is a virtue here.
1245   // The repne_scan instruction uses fixed registers, which we must spill.
1246   // Don't worry too much about pre-existing connections with the input regs.
1247 
1248   assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
1249   assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)
1250 
1251   RegSet pushed_registers;
1252   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1253   if (!IS_A_TEMP(r5))    pushed_registers += r5;
1254 
1255   if (super_klass != r0 || UseCompressedOops) {
1256     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1257   }
1258 
1259   push(pushed_registers, sp);
1260 
1261   // Get super_klass value into r0 (even if it was in r5 or r2).
1262   if (super_klass != r0) {
1263     mov(r0, super_klass);
1264   }
1265 
1266 #ifndef PRODUCT
1267   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1268   Address pst_counter_addr(rscratch2);
1269   ldr(rscratch1, pst_counter_addr);
1270   add(rscratch1, rscratch1, 1);
1271   str(rscratch1, pst_counter_addr);
1272 #endif //PRODUCT
1273 
1274   // We will consult the secondary-super array.
1275   ldr(r5, secondary_supers_addr);
1276   // Load the array length.
1277   ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1278   // Skip to start of data.
1279   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1280 
1281   cmp(sp, zr); // Clear Z flag; SP is never zero
1282   // Scan R2 words at [R5] for an occurrence of R0.
1283   // Set NZ/Z based on last compare.
1284   repne_scan(r5, r0, r2, rscratch1);
1285 
1286   // Unspill the temp. registers:
1287   pop(pushed_registers, sp);
1288 
1289   br(Assembler::NE, *L_failure);
1290 
1291   // Success.  Cache the super we found and proceed in triumph.
1292   str(super_klass, super_cache_addr);
1293 
1294   if (L_success != &L_fallthrough) {
1295     b(*L_success);
1296   }
1297 
1298 #undef IS_A_TEMP
1299 
1300   bind(L_fallthrough);
1301 }
1302 
1303 
1304 void MacroAssembler::verify_oop(Register reg, const char* s) {
1305   if (!VerifyOops) return;
1306 
1307   // Pass register number to verify_oop_subroutine
1308   const char* b = NULL;
1309   {
1310     ResourceMark rm;
1311     stringStream ss;
1312     ss.print("verify_oop: %s: %s", reg->name(), s);
1313     b = code_string(ss.as_string());
1314   }
1315   BLOCK_COMMENT("verify_oop {");
1316 
1317   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1318   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1319 
1320   mov(r0, reg);
1321   mov(rscratch1, (address)b);
1322 
1323   // call indirectly to solve generation ordering problem
1324   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1325   ldr(rscratch2, Address(rscratch2));
1326   blr(rscratch2);
1327 
1328   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1329   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1330 
1331   BLOCK_COMMENT("} verify_oop");
1332 }
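
// Typical use, as a sketch (the register and message are illustrative):
//
//   ldr(r10, Address(robj, field_offset));
//   verify_oop(r10, "broken oop in getfield");
//
// This expands to nothing unless -XX:+VerifyOops is enabled, so it is
// cheap to sprinkle through oop-manipulating code.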
1333 
1334 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1335   if (!VerifyOops) return;
1336 
1337   const char* b = NULL;
1338   {
1339     ResourceMark rm;
1340     stringStream ss;
1341     ss.print("verify_oop_addr: %s", s);
1342     b = code_string(ss.as_string());
1343   }
1344   BLOCK_COMMENT("verify_oop_addr {");
1345 
1346   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1347   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1348 
1349   // addr may contain sp so we will have to adjust it based on the
1350   // pushes that we just did.
1351   if (addr.uses(sp)) {
1352     lea(r0, addr);
1353     ldr(r0, Address(r0, 4 * wordSize));
1354   } else {
1355     ldr(r0, addr);
1356   }
1357   mov(rscratch1, (address)b);
1358 
1359   // call indirectly to solve generation ordering problem
1360   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1361   ldr(rscratch2, Address(rscratch2));
1362   blr(rscratch2);
1363 
1364   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1365   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1366 
1367   BLOCK_COMMENT("} verify_oop_addr");
1368 }
1369 
1370 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1371                                          int extra_slot_offset) {
1372   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1373   int stackElementSize = Interpreter::stackElementSize;
1374   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1375 #ifdef ASSERT
1376   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1377   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1378 #endif
1379   if (arg_slot.is_constant()) {
1380     return Address(esp, arg_slot.as_constant() * stackElementSize
1381                    + offset);
1382   } else {
1383     add(rscratch1, esp, arg_slot.as_register(),
1384         ext::uxtx, exact_log2(stackElementSize));
1385     return Address(rscratch1, offset);
1386   }
1387 }
1388 
1389 void MacroAssembler::call_VM_leaf_base(address entry_point,
1390                                        int number_of_arguments,
1391                                        Label *retaddr) {
1392   call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
1393 }
1394 
1395 void MacroAssembler::call_VM_leaf_base1(address entry_point,
1396                                         int number_of_gp_arguments,
1397                                         int number_of_fp_arguments,
1398                                         ret_type type,
1399                                         Label *retaddr) {
1400   Label E, L;
1401 
1402   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1403 
  // We add 1 to number_of_gp_arguments because the thread in arg0 is
  // not counted
1406   mov(rscratch1, entry_point);
1407   blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1408   if (retaddr)
1409     bind(*retaddr);
1410 
1411   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1412   maybe_isb();
1413 }
1414 
1415 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1416   call_VM_leaf_base(entry_point, number_of_arguments);
1417 }
1418 
1419 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1420   pass_arg0(this, arg_0);
1421   call_VM_leaf_base(entry_point, 1);
1422 }
1423 
1424 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1425   pass_arg0(this, arg_0);
1426   pass_arg1(this, arg_1);
1427   call_VM_leaf_base(entry_point, 2);
1428 }
1429 
1430 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1431                                   Register arg_1, Register arg_2) {
1432   pass_arg0(this, arg_0);
1433   pass_arg1(this, arg_1);
1434   pass_arg2(this, arg_2);
1435   call_VM_leaf_base(entry_point, 3);
1436 }
1437 
1438 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1439   pass_arg0(this, arg_0);
1440   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1441 }
1442 
1443 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1444 
1445   assert(arg_0 != c_rarg1, "smashed arg");
1446   pass_arg1(this, arg_1);
1447   pass_arg0(this, arg_0);
1448   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1449 }
1450 
1451 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1452   assert(arg_0 != c_rarg2, "smashed arg");
1453   assert(arg_1 != c_rarg2, "smashed arg");
1454   pass_arg2(this, arg_2);
1455   assert(arg_0 != c_rarg1, "smashed arg");
1456   pass_arg1(this, arg_1);
1457   pass_arg0(this, arg_0);
1458   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1459 }
1460 
1461 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1462   assert(arg_0 != c_rarg3, "smashed arg");
1463   assert(arg_1 != c_rarg3, "smashed arg");
1464   assert(arg_2 != c_rarg3, "smashed arg");
1465   pass_arg3(this, arg_3);
1466   assert(arg_0 != c_rarg2, "smashed arg");
1467   assert(arg_1 != c_rarg2, "smashed arg");
1468   pass_arg2(this, arg_2);
1469   assert(arg_0 != c_rarg1, "smashed arg");
1470   pass_arg1(this, arg_1);
1471   pass_arg0(this, arg_0);
1472   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1473 }
1474 
1475 void MacroAssembler::null_check(Register reg, int offset) {
1476   if (needs_explicit_null_check(offset)) {
    // provoke an OS NULL exception if reg == NULL by
1478     // accessing M[reg] w/o changing any registers
1479     // NOTE: this is plenty to provoke a segv
1480     ldr(zr, Address(reg));
1481   } else {
    // nothing to do; a (later) access of M[reg + offset]
    // will provoke an OS NULL exception if reg == NULL
1484   }
1485 }
1486 
1487 // MacroAssembler protected routines needed to implement
1488 // public methods
1489 
1490 void MacroAssembler::mov(Register r, Address dest) {
1491   code_section()->relocate(pc(), dest.rspec());
1492   u_int64_t imm64 = (u_int64_t)dest.target();
1493   movptr(r, imm64);
1494 }
1495 
1496 // Move a constant pointer into r.  In AArch64 mode the virtual
1497 // address space is 48 bits in size, so we only need three
1498 // instructions to create a patchable instruction sequence that can
1499 // reach anywhere.
1500 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1501 #ifndef PRODUCT
1502   {
1503     char buffer[64];
1504     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1505     block_comment(buffer);
1506   }
1507 #endif
1508   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1509   movz(r, imm64 & 0xffff);
1510   imm64 >>= 16;
1511   movk(r, imm64 & 0xffff, 16);
1512   imm64 >>= 16;
1513   movk(r, imm64 & 0xffff, 32);
1514 }
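
// For example (a sketch), movptr(r0, 0x123456789ab) emits the fixed
// three-instruction pattern
//
//   movz x0, #0x89ab
//   movk x0, #0x4567, lsl #16
//   movk x0, #0x0123, lsl #32
//
// always using all three instructions, so the sequence can later be
// patched in place to any other 48-bit address.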
1515 
1516 // Macro to mov replicated immediate to vector register.
1517 //  Vd will get the following values for different arrangements in T
1518 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1519 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1520 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1521 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1522 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1523 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1524 //   T1D/T2D: invalid
1525 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1526   assert(T != T1D && T != T2D, "invalid arrangement");
1527   if (T == T8B || T == T16B) {
1528     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1529     movi(Vd, T, imm32 & 0xff, 0);
1530     return;
1531   }
1532   u_int32_t nimm32 = ~imm32;
1533   if (T == T4H || T == T8H) {
1534     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1535     imm32 &= 0xffff;
1536     nimm32 &= 0xffff;
1537   }
1538   u_int32_t x = imm32;
1539   int movi_cnt = 0;
1540   int movn_cnt = 0;
1541   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1542   x = nimm32;
1543   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1544   if (movn_cnt < movi_cnt) imm32 = nimm32;
1545   unsigned lsl = 0;
1546   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1547   if (movn_cnt < movi_cnt)
1548     mvni(Vd, T, imm32 & 0xff, lsl);
1549   else
1550     movi(Vd, T, imm32 & 0xff, lsl);
1551   imm32 >>= 8; lsl += 8;
1552   while (imm32) {
1553     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1554     if (movn_cnt < movi_cnt)
1555       bici(Vd, T, imm32 & 0xff, lsl);
1556     else
1557       orri(Vd, T, imm32 & 0xff, lsl);
1558     lsl += 8; imm32 >>= 8;
1559   }
1560 }
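
// For example (a sketch), mov(v0, T4S, 0x00ab00cd) has two non-zero
// bytes, versus four in its complement, so the movi/orri path is taken:
//
//   movi v0.4s, #0xcd
//   orr  v0.4s, #0xab, lsl #16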
1561 
1562 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1563 {
1564 #ifndef PRODUCT
1565   {
1566     char buffer[64];
1567     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1568     block_comment(buffer);
1569   }
1570 #endif
1571   if (operand_valid_for_logical_immediate(false, imm64)) {
1572     orr(dst, zr, imm64);
1573   } else {
1574     // we can use a combination of MOVZ or MOVN with
1575     // MOVK to build up the constant
1576     u_int64_t imm_h[4];
1577     int zero_count = 0;
1578     int neg_count = 0;
1579     int i;
1580     for (i = 0; i < 4; i++) {
1581       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1582       if (imm_h[i] == 0) {
1583         zero_count++;
1584       } else if (imm_h[i] == 0xffffL) {
1585         neg_count++;
1586       }
1587     }
1588     if (zero_count == 4) {
1589       // one MOVZ will do
1590       movz(dst, 0);
1591     } else if (neg_count == 4) {
1592       // one MOVN will do
1593       movn(dst, 0);
1594     } else if (zero_count == 3) {
1595       for (i = 0; i < 4; i++) {
1596         if (imm_h[i] != 0L) {
1597           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1598           break;
1599         }
1600       }
1601     } else if (neg_count == 3) {
1602       // one MOVN will do
1603       for (int i = 0; i < 4; i++) {
1604         if (imm_h[i] != 0xffffL) {
1605           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1606           break;
1607         }
1608       }
1609     } else if (zero_count == 2) {
1610       // one MOVZ and one MOVK will do
1611       for (i = 0; i < 3; i++) {
1612         if (imm_h[i] != 0L) {
1613           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1614           i++;
1615           break;
1616         }
1617       }
1618       for (;i < 4; i++) {
1619         if (imm_h[i] != 0L) {
1620           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1621         }
1622       }
1623     } else if (neg_count == 2) {
1624       // one MOVN and one MOVK will do
1625       for (i = 0; i < 4; i++) {
1626         if (imm_h[i] != 0xffffL) {
1627           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1628           i++;
1629           break;
1630         }
1631       }
1632       for (;i < 4; i++) {
1633         if (imm_h[i] != 0xffffL) {
1634           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1635         }
1636       }
1637     } else if (zero_count == 1) {
1638       // one MOVZ and two MOVKs will do
1639       for (i = 0; i < 4; i++) {
1640         if (imm_h[i] != 0L) {
1641           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1642           i++;
1643           break;
1644         }
1645       }
1646       for (;i < 4; i++) {
1647         if (imm_h[i] != 0x0L) {
1648           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1649         }
1650       }
1651     } else if (neg_count == 1) {
1652       // one MOVN and two MOVKs will do
1653       for (i = 0; i < 4; i++) {
1654         if (imm_h[i] != 0xffffL) {
1655           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1656           i++;
1657           break;
1658         }
1659       }
1660       for (;i < 4; i++) {
1661         if (imm_h[i] != 0xffffL) {
1662           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1663         }
1664       }
1665     } else {
1666       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1667       movz(dst, (u_int32_t)imm_h[0], 0);
1668       for (i = 1; i < 4; i++) {
1669         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1670       }
1671     }
1672   }
1673 }
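
// For example (a sketch, with dst == r0), 0x0000ffff00001234 is not a
// valid logical immediate; its halfwords are {0x1234, 0x0000, 0xffff,
// 0x0000}, so zero_count == 2 and a MOVZ/MOVK pair is chosen:
//
//   movz x0, #0x1234
//   movk x0, #0xffff, lsl #32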
1674 
1675 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1676 {
1677 #ifndef PRODUCT
1678     {
1679       char buffer[64];
1680       snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1681       block_comment(buffer);
1682     }
1683 #endif
1684   if (operand_valid_for_logical_immediate(true, imm32)) {
1685     orrw(dst, zr, imm32);
1686   } else {
    // we can use MOVZ, MOVN or a MOVZ/MOVK pair to build up the
    // constant
1689     u_int32_t imm_h[2];
1690     imm_h[0] = imm32 & 0xffff;
1691     imm_h[1] = ((imm32 >> 16) & 0xffff);
1692     if (imm_h[0] == 0) {
1693       movzw(dst, imm_h[1], 16);
1694     } else if (imm_h[0] == 0xffff) {
1695       movnw(dst, imm_h[1] ^ 0xffff, 16);
1696     } else if (imm_h[1] == 0) {
1697       movzw(dst, imm_h[0], 0);
1698     } else if (imm_h[1] == 0xffff) {
1699       movnw(dst, imm_h[0] ^ 0xffff, 0);
1700     } else {
1701       // use a MOVZ and MOVK (makes it easier to debug)
1702       movzw(dst, imm_h[0], 0);
1703       movkw(dst, imm_h[1], 16);
1704     }
1705   }
1706 }
1707 
1708 // Form an address from base + offset in Rd.  Rd may or may
1709 // not actually be used: you must use the Address that is returned.
1710 // It is up to you to ensure that the shift provided matches the size
1711 // of your data.
1712 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1713   if (Address::offset_ok_for_immed(byte_offset, shift))
1714     // It fits; no need for any heroics
1715     return Address(base, byte_offset);
1716 
1717   // Don't do anything clever with negative or misaligned offsets
1718   unsigned mask = (1 << shift) - 1;
1719   if (byte_offset < 0 || byte_offset & mask) {
1720     mov(Rd, byte_offset);
1721     add(Rd, base, Rd);
1722     return Address(Rd);
1723   }
1724 
1725   // See if we can do this with two 12-bit offsets
1726   {
1727     unsigned long word_offset = byte_offset >> shift;
1728     unsigned long masked_offset = word_offset & 0xfff000;
1729     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1730         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1731       add(Rd, base, masked_offset << shift);
1732       word_offset -= masked_offset;
1733       return Address(Rd, word_offset << shift);
1734     }
1735   }
1736 
1737   // Do it the hard way
1738   mov(Rd, byte_offset);
1739   add(Rd, base, Rd);
1740   return Address(Rd);
1741 }
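
// For example (a sketch), form_address(rscratch1, rheapbase, 0x101008, 3)
// is too large for a scaled 12-bit immediate, but is positive and 8-byte
// aligned, so the two-part path applies:
//
//   add rscratch1, rheapbase, #0x100000
//
// and the returned Address is (rscratch1, #0x1008).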
1742 
1743 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1744   if (UseLSE) {
1745     mov(tmp, 1);
1746     ldadd(Assembler::word, tmp, zr, counter_addr);
1747     return;
1748   }
1749   Label retry_load;
1750   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1751     prfm(Address(counter_addr), PSTL1STRM);
1752   bind(retry_load);
1753   // flush and load exclusive from the memory location
1754   ldxrw(tmp, counter_addr);
1755   addw(tmp, tmp, 1);
  // if the store-exclusive succeeds with no intervening write, tmp2 will be zero
1757   stxrw(tmp2, tmp, counter_addr);
1758   cbnzw(tmp2, retry_load);
1759 }
1760 
1761 
1762 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1763                                     bool want_remainder, Register scratch)
1764 {
1765   // Full implementation of Java idiv and irem.  The function
1766   // returns the (pc) offset of the div instruction - may be needed
1767   // for implicit exceptions.
1768   //
1769   // constraint : ra/rb =/= scratch
1770   //         normal case
1771   //
1772   // input : ra: dividend
1773   //         rb: divisor
1774   //
1775   // result: either
1776   //         quotient  (= ra idiv rb)
1777   //         remainder (= ra irem rb)
1778 
1779   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1780 
1781   int idivl_offset = offset();
1782   if (! want_remainder) {
1783     sdivw(result, ra, rb);
1784   } else {
1785     sdivw(scratch, ra, rb);
1786     Assembler::msubw(result, scratch, rb, ra);
1787   }
1788 
1789   return idivl_offset;
1790 }
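
// For example (a sketch), corrected_idivl(r0, r1, r2, true, r3) computes
// the Java remainder as
//
//   sdivw w3, w1, w2       // w3 = w1 / w2, rounding toward zero
//   msubw w0, w3, w2, w1   // w0 = w1 - w3 * w2 == w1 % w2
//
// Unlike x86, no fix-up branch is needed: AArch64 sdiv yields MinValue
// for MinValue / -1, which already matches Java semantics.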
1791 
1792 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1793                                     bool want_remainder, Register scratch)
1794 {
1795   // Full implementation of Java ldiv and lrem.  The function
1796   // returns the (pc) offset of the div instruction - may be needed
1797   // for implicit exceptions.
1798   //
1799   // constraint : ra/rb =/= scratch
1800   //         normal case
1801   //
1802   // input : ra: dividend
1803   //         rb: divisor
1804   //
1805   // result: either
1806   //         quotient  (= ra idiv rb)
1807   //         remainder (= ra irem rb)
1808 
1809   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1810 
1811   int idivq_offset = offset();
1812   if (! want_remainder) {
1813     sdiv(result, ra, rb);
1814   } else {
1815     sdiv(scratch, ra, rb);
1816     Assembler::msub(result, scratch, rb, ra);
1817   }
1818 
1819   return idivq_offset;
1820 }
1821 
1822 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1823   address prev = pc() - NativeMembar::instruction_size;
1824   address last = code()->last_insn();
1825   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1826     NativeMembar *bar = NativeMembar_at(prev);
1827     // We are merging two memory barrier instructions.  On AArch64 we
1828     // can do this simply by ORing them together.
1829     bar->set_kind(bar->get_kind() | order_constraint);
1830     BLOCK_COMMENT("merged membar");
1831   } else {
1832     code()->set_last_insn(pc());
1833     dmb(Assembler::barrier(order_constraint));
1834   }
1835 }
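
// For example (a sketch), emitting
//
//   membar(Assembler::StoreStore);
//   membar(Assembler::LoadStore);
//
// back to back produces one dmb whose kind is the OR of the two
// constraints, rather than two consecutive barrier instructions.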
1836 
1837 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1838   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1839     merge_ldst(rt, adr, size_in_bytes, is_store);
1840     code()->clear_last_insn();
1841     return true;
1842   } else {
1843     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
1844     const unsigned mask = size_in_bytes - 1;
1845     if (adr.getMode() == Address::base_plus_offset &&
1846         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1847       code()->set_last_insn(pc());
1848     }
1849     return false;
1850   }
1851 }
1852 
1853 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1854   // We always try to merge two adjacent loads into one ldp.
1855   if (!try_merge_ldst(Rx, adr, 8, false)) {
1856     Assembler::ldr(Rx, adr);
1857   }
1858 }
1859 
1860 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1861   // We always try to merge two adjacent loads into one ldp.
1862   if (!try_merge_ldst(Rw, adr, 4, false)) {
1863     Assembler::ldrw(Rw, adr);
1864   }
1865 }
1866 
1867 void MacroAssembler::str(Register Rx, const Address &adr) {
1868   // We always try to merge two adjacent stores into one stp.
1869   if (!try_merge_ldst(Rx, adr, 8, true)) {
1870     Assembler::str(Rx, adr);
1871   }
1872 }
1873 
1874 void MacroAssembler::strw(Register Rw, const Address &adr) {
1875   // We always try to merge two adjacent stores into one stp.
1876   if (!try_merge_ldst(Rw, adr, 4, true)) {
1877     Assembler::strw(Rw, adr);
1878   }
1879 }
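
// For example (a sketch), the sequence
//
//   ldr(r0, Address(sp, 16));
//   ldr(r1, Address(sp, 24));
//
// is emitted as a single "ldp x0, x1, [sp, #16]", provided the
// adjacency, aliasing and alignment checks in ldst_can_merge() pass.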
1880 
1881 // MacroAssembler routines found actually to be needed
1882 
1883 void MacroAssembler::push(Register src)
1884 {
1885   str(src, Address(pre(esp, -1 * wordSize)));
1886 }
1887 
1888 void MacroAssembler::pop(Register dst)
1889 {
1890   ldr(dst, Address(post(esp, 1 * wordSize)));
1891 }
1892 
1893 // Note: load_unsigned_short used to be called load_unsigned_word.
1894 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1895   int off = offset();
1896   ldrh(dst, src);
1897   return off;
1898 }
1899 
1900 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1901   int off = offset();
1902   ldrb(dst, src);
1903   return off;
1904 }
1905 
1906 int MacroAssembler::load_signed_short(Register dst, Address src) {
1907   int off = offset();
1908   ldrsh(dst, src);
1909   return off;
1910 }
1911 
1912 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1913   int off = offset();
1914   ldrsb(dst, src);
1915   return off;
1916 }
1917 
1918 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1919   int off = offset();
1920   ldrshw(dst, src);
1921   return off;
1922 }
1923 
1924 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1925   int off = offset();
1926   ldrsbw(dst, src);
1927   return off;
1928 }
1929 
1930 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1931   switch (size_in_bytes) {
1932   case  8:  ldr(dst, src); break;
1933   case  4:  ldrw(dst, src); break;
1934   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1935   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1936   default:  ShouldNotReachHere();
1937   }
1938 }
1939 
1940 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1941   switch (size_in_bytes) {
1942   case  8:  str(src, dst); break;
1943   case  4:  strw(src, dst); break;
1944   case  2:  strh(src, dst); break;
1945   case  1:  strb(src, dst); break;
1946   default:  ShouldNotReachHere();
1947   }
1948 }
1949 
1950 void MacroAssembler::decrementw(Register reg, int value)
1951 {
1952   if (value < 0)  { incrementw(reg, -value);      return; }
1953   if (value == 0) {                               return; }
1954   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1955   /* else */ {
1956     guarantee(reg != rscratch2, "invalid dst for register decrement");
1957     movw(rscratch2, (unsigned)value);
1958     subw(reg, reg, rscratch2);
1959   }
1960 }
1961 
1962 void MacroAssembler::decrement(Register reg, int value)
1963 {
1964   if (value < 0)  { increment(reg, -value);      return; }
1965   if (value == 0) {                              return; }
1966   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1967   /* else */ {
1968     assert(reg != rscratch2, "invalid dst for register decrement");
1969     mov(rscratch2, (unsigned long)value);
1970     sub(reg, reg, rscratch2);
1971   }
1972 }
1973 
1974 void MacroAssembler::decrementw(Address dst, int value)
1975 {
1976   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1977   if (dst.getMode() == Address::literal) {
1978     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1979     lea(rscratch2, dst);
1980     dst = Address(rscratch2);
1981   }
1982   ldrw(rscratch1, dst);
1983   decrementw(rscratch1, value);
1984   strw(rscratch1, dst);
1985 }
1986 
1987 void MacroAssembler::decrement(Address dst, int value)
1988 {
1989   assert(!dst.uses(rscratch1), "invalid address for decrement");
1990   if (dst.getMode() == Address::literal) {
1991     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1992     lea(rscratch2, dst);
1993     dst = Address(rscratch2);
1994   }
1995   ldr(rscratch1, dst);
1996   decrement(rscratch1, value);
1997   str(rscratch1, dst);
1998 }
1999 
2000 void MacroAssembler::incrementw(Register reg, int value)
2001 {
2002   if (value < 0)  { decrementw(reg, -value);      return; }
2003   if (value == 0) {                               return; }
2004   if (value < (1 << 12)) { addw(reg, reg, value); return; }
2005   /* else */ {
2006     assert(reg != rscratch2, "invalid dst for register increment");
2007     movw(rscratch2, (unsigned)value);
2008     addw(reg, reg, rscratch2);
2009   }
2010 }
2011 
2012 void MacroAssembler::increment(Register reg, int value)
2013 {
2014   if (value < 0)  { decrement(reg, -value);      return; }
2015   if (value == 0) {                              return; }
2016   if (value < (1 << 12)) { add(reg, reg, value); return; }
2017   /* else */ {
2018     assert(reg != rscratch2, "invalid dst for register increment");
2019     movw(rscratch2, (unsigned)value);
2020     add(reg, reg, rscratch2);
2021   }
2022 }
2023 
2024 void MacroAssembler::incrementw(Address dst, int value)
2025 {
2026   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2027   if (dst.getMode() == Address::literal) {
2028     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2029     lea(rscratch2, dst);
2030     dst = Address(rscratch2);
2031   }
2032   ldrw(rscratch1, dst);
2033   incrementw(rscratch1, value);
2034   strw(rscratch1, dst);
2035 }
2036 
2037 void MacroAssembler::increment(Address dst, int value)
2038 {
2039   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2040   if (dst.getMode() == Address::literal) {
2041     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2042     lea(rscratch2, dst);
2043     dst = Address(rscratch2);
2044   }
2045   ldr(rscratch1, dst);
2046   increment(rscratch1, value);
2047   str(rscratch1, dst);
2048 }
2049 
2050 
2051 void MacroAssembler::pusha() {
2052   push(0x7fffffff, sp);
2053 }
2054 
2055 void MacroAssembler::popa() {
2056   pop(0x7fffffff, sp);
2057 }
2058 
2059 // Push lots of registers in the bit set supplied.  Don't push sp.
2060 // Return the number of words pushed
2061 int MacroAssembler::push(unsigned int bitset, Register stack) {
2062   int words_pushed = 0;
2063 
2064   // Scan bitset to accumulate register pairs
2065   unsigned char regs[32];
2066   int count = 0;
2067   for (int reg = 0; reg <= 30; reg++) {
2068     if (1 & bitset)
2069       regs[count++] = reg;
2070     bitset >>= 1;
2071   }
2072   regs[count++] = zr->encoding_nocheck();
  count &= ~1;  // Only push an even number of regs
2074 
2075   if (count) {
2076     stp(as_Register(regs[0]), as_Register(regs[1]),
2077        Address(pre(stack, -count * wordSize)));
2078     words_pushed += 2;
2079   }
2080   for (int i = 2; i < count; i += 2) {
2081     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2082        Address(stack, i * wordSize));
2083     words_pushed += 2;
2084   }
2085 
2086   assert(words_pushed == count, "oops, pushed != count");
2087 
2088   return count;
2089 }
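
// For example (a sketch), push(RegSet::of(r0, r1, r2), sp) pads the odd
// register with zr so sp stays 16-byte aligned:
//
//   stp x0, x1, [sp, #-32]!
//   stp x2, xzr, [sp, #16]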
2090 
2091 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2092   int words_pushed = 0;
2093 
2094   // Scan bitset to accumulate register pairs
2095   unsigned char regs[32];
2096   int count = 0;
2097   for (int reg = 0; reg <= 30; reg++) {
2098     if (1 & bitset)
2099       regs[count++] = reg;
2100     bitset >>= 1;
2101   }
2102   regs[count++] = zr->encoding_nocheck();
2103   count &= ~1;
2104 
2105   for (int i = 2; i < count; i += 2) {
2106     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2107        Address(stack, i * wordSize));
2108     words_pushed += 2;
2109   }
2110   if (count) {
2111     ldp(as_Register(regs[0]), as_Register(regs[1]),
2112        Address(post(stack, count * wordSize)));
2113     words_pushed += 2;
2114   }
2115 
2116   assert(words_pushed == count, "oops, pushed != count");
2117 
2118   return count;
2119 }
2120 #ifdef ASSERT
2121 void MacroAssembler::verify_heapbase(const char* msg) {
2122 #if 0
2123   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2124   assert (Universe::heap() != NULL, "java heap should be initialized");
2125   if (CheckCompressedOops) {
2126     Label ok;
2127     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2128     cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2129     br(Assembler::EQ, ok);
2130     stop(msg);
2131     bind(ok);
2132     pop(1 << rscratch1->encoding(), sp);
2133   }
2134 #endif
2135 }
2136 #endif
2137 
2138 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2139   Label done, not_weak;
2140   cbz(value, done);           // Use NULL as-is.
2141 
2142   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
  tbz(value, 0, not_weak); // Test for jweak tag.
2144 
2145   // Resolve jweak.
2146   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2147                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2148   verify_oop(value);
2149   b(done);
2150 
2151   bind(not_weak);
2152   // Resolve (untagged) jobject.
2153   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2154   verify_oop(value);
2155   bind(done);
2156 }
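
// Handle layout, as a sketch: with weak_tag_mask == 1 a jweak handle
// carries its tag in the low bit and its oop slot lives at handle - 1,
// while an ordinary (untagged) jobject is loaded directly at offset 0.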
2157 
2158 void MacroAssembler::stop(const char* msg) {
2159   address ip = pc();
2160   pusha();
2161   mov(c_rarg0, (address)msg);
2162   mov(c_rarg1, (address)ip);
2163   mov(c_rarg2, sp);
2164   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2165   // call(c_rarg3);
2166   blrt(c_rarg3, 3, 0, 1);
2167   hlt(0);
2168 }
2169 
2170 void MacroAssembler::unimplemented(const char* what) {
2171   const char* buf = NULL;
2172   {
2173     ResourceMark rm;
2174     stringStream ss;
2175     ss.print("unimplemented: %s", what);
2176     buf = code_string(ss.as_string());
2177   }
2178   stop(buf);
2179 }
2180 
2181 // If a constant does not fit in an immediate field, generate some
2182 // number of MOV instructions and then perform the operation.
2183 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2184                                            add_sub_imm_insn insn1,
2185                                            add_sub_reg_insn insn2) {
2186   assert(Rd != zr, "Rd = zr and not setting flags?");
2187   if (operand_valid_for_add_sub_immediate((int)imm)) {
2188     (this->*insn1)(Rd, Rn, imm);
2189   } else {
2190     if (uabs(imm) < (1 << 24)) {
2191        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2192        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2193     } else {
2194        assert_different_registers(Rd, Rn);
2195        mov(Rd, (uint64_t)imm);
2196        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2197     }
2198   }
2199 }
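
// For example (a sketch), an add of 0x123456 does not fit a 12-bit
// (optionally shifted) immediate but is below 2^24, so it is split:
//
//   add x0, x1, #0x123000
//   add x0, x0, #0x456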
2200 
// Separate version which sets the flags. Optimisations are more
// restricted because we must set the flags correctly.
2203 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2204                                            add_sub_imm_insn insn1,
2205                                            add_sub_reg_insn insn2) {
2206   if (operand_valid_for_add_sub_immediate((int)imm)) {
2207     (this->*insn1)(Rd, Rn, imm);
2208   } else {
2209     assert_different_registers(Rd, Rn);
2210     assert(Rd != zr, "overflow in immediate operand");
2211     mov(Rd, (uint64_t)imm);
2212     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2213   }
2214 }
2215 
2216 
2217 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2218   if (increment.is_register()) {
2219     add(Rd, Rn, increment.as_register());
2220   } else {
2221     add(Rd, Rn, increment.as_constant());
2222   }
2223 }
2224 
2225 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2226   if (increment.is_register()) {
2227     addw(Rd, Rn, increment.as_register());
2228   } else {
2229     addw(Rd, Rn, increment.as_constant());
2230   }
2231 }
2232 
2233 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2234   if (decrement.is_register()) {
2235     sub(Rd, Rn, decrement.as_register());
2236   } else {
2237     sub(Rd, Rn, decrement.as_constant());
2238   }
2239 }
2240 
2241 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2242   if (decrement.is_register()) {
2243     subw(Rd, Rn, decrement.as_register());
2244   } else {
2245     subw(Rd, Rn, decrement.as_constant());
2246   }
2247 }
2248 
2249 void MacroAssembler::reinit_heapbase()
2250 {
2251   if (UseCompressedOops) {
2252     if (Universe::is_fully_initialized()) {
2253       mov(rheapbase, Universe::narrow_ptrs_base());
2254     } else {
2255       lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2256       ldr(rheapbase, Address(rheapbase));
2257     }
2258   }
2259 }
2260 
2261 // this simulates the behaviour of the x86 cmpxchg instruction using a
2262 // load linked/store conditional pair. we use the acquire/release
2263 // versions of these instructions so that we flush pending writes as
2264 // per Java semantics.
2265 
// n.b. the x86 version assumes the old value to be compared against is
2267 // in rax and updates rax with the value located in memory if the
2268 // cmpxchg fails. we supply a register for the old value explicitly
2269 
2270 // the aarch64 load linked/store conditional instructions do not
2271 // accept an offset. so, unlike x86, we must provide a plain register
2272 // to identify the memory word to be compared/exchanged rather than a
2273 // register+offset Address.
2274 
2275 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2276                                 Label &succeed, Label *fail) {
2277   // oldv holds comparison value
2278   // newv holds value to write in exchange
2279   // addr identifies memory word to compare against/update
2280   if (UseLSE) {
2281     mov(tmp, oldv);
2282     casal(Assembler::xword, oldv, newv, addr);
2283     cmp(tmp, oldv);
2284     br(Assembler::EQ, succeed);
2285     membar(AnyAny);
2286   } else {
2287     Label retry_load, nope;
2288     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2289       prfm(Address(addr), PSTL1STRM);
2290     bind(retry_load);
2291     // flush and load exclusive from the memory location
2292     // and fail if it is not what we expect
2293     ldaxr(tmp, addr);
2294     cmp(tmp, oldv);
2295     br(Assembler::NE, nope);
    // if the store-exclusive succeeds with no intervening write, tmp will be zero
2297     stlxr(tmp, newv, addr);
2298     cbzw(tmp, succeed);
    // Retry so we only ever return after a load fails to compare;
    // this ensures we don't return a stale value after a failed write.
2301     b(retry_load);
2302     // if the memory word differs we return it in oldv and signal a fail
2303     bind(nope);
2304     membar(AnyAny);
2305     mov(oldv, tmp);
2306   }
2307   if (fail)
2308     b(*fail);
2309 }
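
// Pseudo-C sketch of the LL/SC path above (illustrative only):
//
//   for (;;) {
//     tmp = load_acquire_exclusive(addr);
//     if (tmp != oldv) break;                    // comparison failed
//     if (store_release_exclusive(addr, newv) == 0)
//       goto succeed;                            // exchange done
//   }
//   membar(AnyAny);
//   oldv = tmp;            // report the value actually seen
//   if (fail) goto *fail;  // otherwise fall through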
2310 
2311 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2312                                         Label &succeed, Label *fail) {
2313   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2314   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2315 }
2316 
2317 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2318                                 Label &succeed, Label *fail) {
2319   // oldv holds comparison value
2320   // newv holds value to write in exchange
2321   // addr identifies memory word to compare against/update
2322   // tmp returns 0/1 for success/failure
2323   if (UseLSE) {
2324     mov(tmp, oldv);
2325     casal(Assembler::word, oldv, newv, addr);
2326     cmp(tmp, oldv);
2327     br(Assembler::EQ, succeed);
2328     membar(AnyAny);
2329   } else {
2330     Label retry_load, nope;
2331     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2332       prfm(Address(addr), PSTL1STRM);
2333     bind(retry_load);
2334     // flush and load exclusive from the memory location
2335     // and fail if it is not what we expect
2336     ldaxrw(tmp, addr);
2337     cmp(tmp, oldv);
2338     br(Assembler::NE, nope);
    // if the store-exclusive succeeds with no intervening write, tmp will be zero
2340     stlxrw(tmp, newv, addr);
2341     cbzw(tmp, succeed);
    // Retry so we only ever return after a load fails to compare;
    // this ensures we don't return a stale value after a failed write.
2344     b(retry_load);
2345     // if the memory word differs we return it in oldv and signal a fail
2346     bind(nope);
2347     membar(AnyAny);
2348     mov(oldv, tmp);
2349   }
2350   if (fail)
2351     b(*fail);
2352 }
2353 
// A generic CAS; success or failure is in the EQ flag.  A weak CAS
// doesn't retry and may fail spuriously.  If the old value is wanted,
// pass a register for the result; otherwise pass noreg.
2357 
2358 // Clobbers rscratch1
2359 void MacroAssembler::cmpxchg(Register addr, Register expected,
2360                              Register new_val,
2361                              enum operand_size size,
2362                              bool acquire, bool release,
2363                              bool weak,
2364                              Register result) {
2365   if (result == noreg)  result = rscratch1;
2366   BLOCK_COMMENT("cmpxchg {");
2367   if (UseLSE) {
2368     mov(result, expected);
2369     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2370     compare_eq(result, expected, size);
2371   } else {
2372     Label retry_load, done;
2373     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2374       prfm(Address(addr), PSTL1STRM);
2375     bind(retry_load);
2376     load_exclusive(result, addr, size, acquire);
2377     compare_eq(result, expected, size);
2378     br(Assembler::NE, done);
2379     store_exclusive(rscratch1, new_val, addr, size, release);
2380     if (weak) {
2381       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2382     } else {
2383       cbnzw(rscratch1, retry_load);
2384     }
2385     bind(done);
2386   }
2387   BLOCK_COMMENT("} cmpxchg");
2388 }
2389 
2390 // A generic comparison. Only compares for equality, clobbers rscratch1.
2391 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2392   if (size == xword) {
2393     cmp(rm, rn);
2394   } else if (size == word) {
2395     cmpw(rm, rn);
2396   } else if (size == halfword) {
2397     eorw(rscratch1, rm, rn);
2398     ands(zr, rscratch1, 0xffff);
2399   } else if (size == byte) {
2400     eorw(rscratch1, rm, rn);
2401     ands(zr, rscratch1, 0xff);
2402   } else {
2403     ShouldNotReachHere();
2404   }
2405 }
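
// For example (a sketch), compare_eq(r0, r1, halfword) leaves EQ set
// exactly when (w0 ^ w1) & 0xffff == 0, i.e. when the low 16 bits
// agree; a plain cmpw would also compare the upper 16 bits, which may
// hold junk.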
2406 
2407 
2408 static bool different(Register a, RegisterOrConstant b, Register c) {
2409   if (b.is_constant())
2410     return a != c;
2411   else
2412     return a != b.as_register() && a != c && b.as_register() != c;
2413 }
2414 
2415 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2416 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2417   if (UseLSE) {                                                         \
2418     prev = prev->is_valid() ? prev : zr;                                \
2419     if (incr.is_register()) {                                           \
2420       AOP(sz, incr.as_register(), prev, addr);                          \
2421     } else {                                                            \
2422       mov(rscratch2, incr.as_constant());                               \
2423       AOP(sz, rscratch2, prev, addr);                                   \
2424     }                                                                   \
2425     return;                                                             \
2426   }                                                                     \
2427   Register result = rscratch2;                                          \
2428   if (prev->is_valid())                                                 \
2429     result = different(prev, incr, addr) ? prev : rscratch2;            \
2430                                                                         \
2431   Label retry_load;                                                     \
2432   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2433     prfm(Address(addr), PSTL1STRM);                                     \
2434   bind(retry_load);                                                     \
2435   LDXR(result, addr);                                                   \
2436   OP(rscratch1, result, incr);                                          \
2437   STXR(rscratch2, rscratch1, addr);                                     \
2438   cbnzw(rscratch2, retry_load);                                         \
2439   if (prev->is_valid() && prev != result) {                             \
2440     IOP(prev, rscratch1, incr);                                         \
2441   }                                                                     \
2442 }
2443 
2444 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2445 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2446 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2447 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2448 
2449 #undef ATOMIC_OP
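
// For example (a sketch), atomic_add(r0, 1, r1) expands with LSE to
//
//   mov   rscratch2, #1
//   ldadd rscratch2, x0, [x1]
//
// and without LSE to an ldxr/add/stxr retry loop; either way r0 ends up
// holding the value that was in memory before the increment.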
2450 
2451 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2452 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2453   if (UseLSE) {                                                         \
2454     prev = prev->is_valid() ? prev : zr;                                \
2455     AOP(sz, newv, prev, addr);                                          \
2456     return;                                                             \
2457   }                                                                     \
2458   Register result = rscratch2;                                          \
2459   if (prev->is_valid())                                                 \
2460     result = different(prev, newv, addr) ? prev : rscratch2;            \
2461                                                                         \
2462   Label retry_load;                                                     \
2463   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2464     prfm(Address(addr), PSTL1STRM);                                     \
2465   bind(retry_load);                                                     \
2466   LDXR(result, addr);                                                   \
2467   STXR(rscratch1, newv, addr);                                          \
2468   cbnzw(rscratch1, retry_load);                                         \
2469   if (prev->is_valid() && prev != result)                               \
2470     mov(prev, result);                                                  \
2471 }
2472 
2473 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2474 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2475 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2476 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2477 
2478 #undef ATOMIC_XCHG
2479 
2480 #ifndef PRODUCT
2481 extern "C" void findpc(intptr_t x);
2482 #endif
2483 
2484 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2485 {
  // In order to get locks to work, we need to fake an in_VM state
2487   if (ShowMessageBoxOnError ) {
2488     JavaThread* thread = JavaThread::current();
2489     JavaThreadState saved_state = thread->thread_state();
2490     thread->set_thread_state(_thread_in_vm);
2491 #ifndef PRODUCT
2492     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2493       ttyLocker ttyl;
2494       BytecodeCounter::print();
2495     }
2496 #endif
2497     if (os::message_box(msg, "Execution stopped, print registers?")) {
2498       ttyLocker ttyl;
2499       tty->print_cr(" pc = 0x%016lx", pc);
2500 #ifndef PRODUCT
2501       tty->cr();
2502       findpc(pc);
2503       tty->cr();
2504 #endif
2505       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2506       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2507       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2508       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2509       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2510       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2511       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2512       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2513       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2514       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2515       tty->print_cr("r10 = 0x%016lx", regs[10]);
2516       tty->print_cr("r11 = 0x%016lx", regs[11]);
2517       tty->print_cr("r12 = 0x%016lx", regs[12]);
2518       tty->print_cr("r13 = 0x%016lx", regs[13]);
2519       tty->print_cr("r14 = 0x%016lx", regs[14]);
2520       tty->print_cr("r15 = 0x%016lx", regs[15]);
2521       tty->print_cr("r16 = 0x%016lx", regs[16]);
2522       tty->print_cr("r17 = 0x%016lx", regs[17]);
2523       tty->print_cr("r18 = 0x%016lx", regs[18]);
2524       tty->print_cr("r19 = 0x%016lx", regs[19]);
2525       tty->print_cr("r20 = 0x%016lx", regs[20]);
2526       tty->print_cr("r21 = 0x%016lx", regs[21]);
2527       tty->print_cr("r22 = 0x%016lx", regs[22]);
2528       tty->print_cr("r23 = 0x%016lx", regs[23]);
2529       tty->print_cr("r24 = 0x%016lx", regs[24]);
2530       tty->print_cr("r25 = 0x%016lx", regs[25]);
2531       tty->print_cr("r26 = 0x%016lx", regs[26]);
2532       tty->print_cr("r27 = 0x%016lx", regs[27]);
2533       tty->print_cr("r28 = 0x%016lx", regs[28]);
2534       tty->print_cr("r30 = 0x%016lx", regs[30]);
2535       tty->print_cr("r31 = 0x%016lx", regs[31]);
2536       BREAKPOINT;
2537     }
2538     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2539   } else {
2540     ttyLocker ttyl;
2541     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2542                     msg);
2543     assert(false, "DEBUG MESSAGE: %s", msg);
2544   }
2545 }
2546 
2547 #ifdef BUILTIN_SIM
2548 // routine to generate an x86 prolog for a stub function which
2549 // bootstraps into the generated ARM code which directly follows the
2550 // stub
2551 //
2552 // the argument encodes the number of general and fp registers
// passed by the caller and the calling convention (currently just
2554 // the number of general registers and assumes C argument passing)
2555 
2556 extern "C" {
2557 int aarch64_stub_prolog_size();
2558 void aarch64_stub_prolog();
2559 void aarch64_prolog();
2560 }
2561 
2562 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
2563                                    address *prolog_ptr)
2564 {
2565   int calltype = (((ret_type & 0x3) << 8) |
2566                   ((fp_arg_count & 0xf) << 4) |
2567                   (gp_arg_count & 0xf));
2568 
2569   // the addresses for the x86 to ARM entry code we need to use
2570   address start = pc();
2571   // printf("start = %lx\n", start);
2572   int byteCount =  aarch64_stub_prolog_size();
2573   // printf("byteCount = %x\n", byteCount);
2574   int instructionCount = (byteCount + 3)/ 4;
2575   // printf("instructionCount = %x\n", instructionCount);
2576   for (int i = 0; i < instructionCount; i++) {
2577     nop();
2578   }
2579 
2580   memcpy(start, (void*)aarch64_stub_prolog, byteCount);
2581 
2582   // write the address of the setup routine and the call format at the
// end of the copied code
2584   u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
2585   if (prolog_ptr)
2586     patch_end[-2] = (u_int64_t)prolog_ptr;
2587   patch_end[-1] = calltype;
2588 }
2589 #endif
2590 
2591 void MacroAssembler::push_call_clobbered_registers() {
2592   int step = 4 * wordSize;
2593   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2594   sub(sp, sp, step);
2595   mov(rscratch1, -step);
2596   // Push v0-v7, v16-v31.
2597   for (int i = 31; i>= 4; i -= 4) {
2598     if (i <= v7->encoding() || i >= v16->encoding())
2599       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2600           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2601   }
2602   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2603       as_FloatRegister(3), T1D, Address(sp));
2604 }
2605 
2606 void MacroAssembler::pop_call_clobbered_registers() {
2607   for (int i = 0; i < 32; i += 4) {
2608     if (i <= v7->encoding() || i >= v16->encoding())
2609       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2610           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2611   }
2612 
2613   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2614 }
2615 
2616 void MacroAssembler::push_CPU_state(bool save_vectors) {
2617   int step = (save_vectors ? 8 : 4) * wordSize;
2618   push(0x3fffffff, sp);         // integer registers except lr & sp
2619   mov(rscratch1, -step);
2620   sub(sp, sp, step);
2621   for (int i = 28; i >= 4; i -= 4) {
2622     st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2623         as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2624   }
2625   st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2626 }
2627 
2628 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2629   int step = (restore_vectors ? 8 : 4) * wordSize;
2630   for (int i = 0; i <= 28; i += 4)
2631     ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2632         as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2633   pop(0x3fffffff, sp);         // integer registers except lr & sp
2634 }
2635 
2636 /**
2637  * Helpers for multiply_to_len().
2638  */
2639 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2640                                      Register src1, Register src2) {
2641   adds(dest_lo, dest_lo, src1);
2642   adc(dest_hi, dest_hi, zr);
2643   adds(dest_lo, dest_lo, src2);
2644   adc(final_dest_hi, dest_hi, zr);
2645 }
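
// In effect (final_dest_hi:dest_lo) = (dest_hi:dest_lo) + src1 + src2
// as a 128-bit sum, each adds/adc pair propagating the carry from the
// low word into the high word.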
2646 
2647 // Generate an address from (r + r1 extend offset).  "size" is the
2648 // size of the operand.  The result may be in rscratch2.
2649 Address MacroAssembler::offsetted_address(Register r, Register r1,
2650                                           Address::extend ext, int offset, int size) {
2651   if (offset || (ext.shift() % size != 0)) {
2652     lea(rscratch2, Address(r, r1, ext));
2653     return Address(rscratch2, offset);
2654   } else {
2655     return Address(r, r1, ext);
2656   }
2657 }
2658 
2659 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2660 {
2661   assert(offset >= 0, "spill to negative address?");
2662   // Offset reachable ?
2663   //   Not aligned - 9 bits signed offset
2664   //   Aligned - 12 bits unsigned offset shifted
2665   Register base = sp;
2666   if ((offset & (size-1)) && offset >= (1<<8)) {
2667     add(tmp, base, offset & ((1<<12)-1));
2668     base = tmp;
2669     offset &= -1<<12;
2670   }
2671 
2672   if (offset >= (1<<12) * size) {
2673     add(tmp, base, offset & (((1<<12)-1)<<12));
2674     base = tmp;
2675     offset &= ~(((1<<12)-1)<<12);
2676   }
2677 
2678   return Address(base, offset);
2679 }
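
// For example (a sketch), spill_address(8, 0x41000, rscratch1) exceeds
// the scaled 12-bit range, so the upper bits migrate into the base:
//
//   add rscratch1, sp, #0x41000
//
// and the returned Address is (rscratch1, #0).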
2680 
2681 // Checks whether offset is aligned.
2682 // Returns true if it is, else false.
2683 bool MacroAssembler::merge_alignment_check(Register base,
2684                                            size_t size,
2685                                            long cur_offset,
2686                                            long prev_offset) const {
2687   if (AvoidUnalignedAccesses) {
2688     if (base == sp) {
      // Checks whether the low offset is aligned for a register pair.
2690       long pair_mask = size * 2 - 1;
2691       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2692       return (offset & pair_mask) == 0;
2693     } else { // If base is not sp, we can't guarantee the access is aligned.
2694       return false;
2695     }
2696   } else {
2697     long mask = size - 1;
    // Load/store pair instructions only support element-size-aligned offsets.
2699     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2700   }
2701 }
2702 
2703 // Checks whether current and previous loads/stores can be merged.
2704 // Returns true if it can be merged, else false.
2705 bool MacroAssembler::ldst_can_merge(Register rt,
2706                                     const Address &adr,
2707                                     size_t cur_size_in_bytes,
2708                                     bool is_store) const {
2709   address prev = pc() - NativeInstruction::instruction_size;
2710   address last = code()->last_insn();
2711 
2712   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2713     return false;
2714   }
2715 
2716   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2717     return false;
2718   }
2719 
2720   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2721   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2722 
2723   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2724   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2725 
2726   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2727     return false;
2728   }
2729 
2730   long max_offset = 63 * prev_size_in_bytes;
2731   long min_offset = -64 * prev_size_in_bytes;
2732 
  assert(prev_ldst->is_not_pre_post_index(), "merging pre- or post-indexed accesses is not supported.");
2734 
2735   // Only same base can be merged.
2736   if (adr.base() != prev_ldst->base()) {
2737     return false;
2738   }
2739 
2740   long cur_offset = adr.offset();
2741   long prev_offset = prev_ldst->offset();
2742   size_t diff = abs(cur_offset - prev_offset);
2743   if (diff != prev_size_in_bytes) {
2744     return false;
2745   }
2746 
  // The following cases cannot be merged:
2748   // ldr x2, [x2, #8]
2749   // ldr x3, [x2, #16]
2750   // or:
2751   // ldr x2, [x3, #8]
2752   // ldr x2, [x3, #16]
  // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2754   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2755     return false;
2756   }
2757 
2758   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2759   // Offset range must be in ldp/stp instruction's range.
2760   if (low_offset > max_offset || low_offset < min_offset) {
2761     return false;
2762   }
2763 
2764   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2765     return true;
2766   }
2767 
2768   return false;
2769 }
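
     // A mergeable pair, for illustration (assuming the alignment check above
     // passes, e.g. when AvoidUnalignedAccesses is off):
     //   ldr x1, [x5, #8]
     //   ldr x2, [x5, #16]
     // Same base, equal sizes, adjacent offsets and distinct targets, so the
     // pair can be rewritten as "ldp x1, x2, [x5, #8]".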
2770 
2771 // Merge current load/store with previous load/store into ldp/stp.
2772 void MacroAssembler::merge_ldst(Register rt,
2773                                 const Address &adr,
2774                                 size_t cur_size_in_bytes,
2775                                 bool is_store) {
2776 
2777   assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "cur and prev must be mergeable.");
2778 
2779   Register rt_low, rt_high;
2780   address prev = pc() - NativeInstruction::instruction_size;
2781   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2782 
2783   long offset;
2784 
2785   if (adr.offset() < prev_ldst->offset()) {
2786     offset = adr.offset();
2787     rt_low = rt;
2788     rt_high = prev_ldst->target();
2789   } else {
2790     offset = prev_ldst->offset();
2791     rt_low = prev_ldst->target();
2792     rt_high = rt;
2793   }
2794 
2795   Address adr_p = Address(prev_ldst->base(), offset);
2796   // Overwrite previous generated binary.
2797   code_section()->set_end(prev);
2798 
2799   const int sz = prev_ldst->size_in_bytes();
2800   assert(sz == 8 || sz == 4, "only supports 64/32-bit merging.");
2801   if (!is_store) {
2802     BLOCK_COMMENT("merged ldr pair");
2803     if (sz == 8) {
2804       ldp(rt_low, rt_high, adr_p);
2805     } else {
2806       ldpw(rt_low, rt_high, adr_p);
2807     }
2808   } else {
2809     BLOCK_COMMENT("merged str pair");
2810     if (sz == 8) {
2811       stp(rt_low, rt_high, adr_p);
2812     } else {
2813       stpw(rt_low, rt_high, adr_p);
2814     }
2815   }
2816 }
2817 
2818 /**
2819  * Multiply 64 bit by 64 bit first loop.
2820  */
2821 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2822                                            Register y, Register y_idx, Register z,
2823                                            Register carry, Register product,
2824                                            Register idx, Register kdx) {
2825   //
2826   //  jlong carry, x[], y[], z[];
2827   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2828   //    huge_128 product = y[idx] * x[xstart] + carry;
2829   //    z[kdx] = (jlong)product;
2830   //    carry  = (jlong)(product >>> 64);
2831   //  }
2832   //  z[xstart] = carry;
2833   //
2834 
2835   Label L_first_loop, L_first_loop_exit;
2836   Label L_one_x, L_one_y, L_multiply;
2837 
2838   subsw(xstart, xstart, 1);
2839   br(Assembler::MI, L_one_x);
2840 
2841   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2842   ldr(x_xstart, Address(rscratch1));
2843   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2844 
2845   bind(L_first_loop);
2846   subsw(idx, idx, 1);
2847   br(Assembler::MI, L_first_loop_exit);
2848   subsw(idx, idx, 1);
2849   br(Assembler::MI, L_one_y);
2850   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2851   ldr(y_idx, Address(rscratch1));
2852   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2853   bind(L_multiply);
2854 
2855   // AArch64 has a multiply-accumulate instruction that we can't use
2856   // here because it has no way to process carries, so we have to use
2857   // separate add and adc instructions.  Bah.
2858   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2859   mul(product, x_xstart, y_idx);
2860   adds(product, product, carry);
2861   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
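       // In C terms, the four instructions above compute (a sketch, assuming a
       // 128-bit intermediate type):
       //   unsigned __int128 p = (unsigned __int128)x_xstart * y_idx + carry;
       //   product = (uint64_t)p; carry = (uint64_t)(p >> 64);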
2862 
2863   subw(kdx, kdx, 2);
2864   ror(product, product, 32); // back to big-endian
2865   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2866 
2867   b(L_first_loop);
2868 
2869   bind(L_one_y);
2870   ldrw(y_idx, Address(y,  0));
2871   b(L_multiply);
2872 
2873   bind(L_one_x);
2874   ldrw(x_xstart, Address(x,  0));
2875   b(L_first_loop);
2876 
2877   bind(L_first_loop_exit);
2878 }
2879 
2880 /**
2881  * Multiply 128 bit by 128 bit. Unrolled inner loop.
2882  *
2883  */
2884 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2885                                              Register carry, Register carry2,
2886                                              Register idx, Register jdx,
2887                                              Register yz_idx1, Register yz_idx2,
2888                                              Register tmp, Register tmp3, Register tmp4,
2889                                              Register tmp6, Register product_hi) {
2890 
2891   //   jlong carry, x[], y[], z[];
2892   //   int kdx = ystart+1;
2893   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2894   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2895   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2896   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2897   //     carry  = (jlong)(tmp4 >>> 64);
2898   //     z[kdx+idx+1] = (jlong)tmp3;
2899   //     z[kdx+idx] = (jlong)tmp4;
2900   //   }
2901   //   idx += 2;
2902   //   if (idx > 0) {
2903   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2904   //     z[kdx+idx] = (jlong)yz_idx1;
2905   //     carry  = (jlong)(yz_idx1 >>> 64);
2906   //   }
2907   //
2908 
2909   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2910 
2911   lsrw(jdx, idx, 2);
2912 
2913   bind(L_third_loop);
2914 
2915   subsw(jdx, jdx, 1);
2916   br(Assembler::MI, L_third_loop_exit);
2917   subw(idx, idx, 4);
2918 
2919   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2920 
2921   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2922 
2923   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2924 
2925   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2926   ror(yz_idx2, yz_idx2, 32);
2927 
2928   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2929 
2930   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2931   umulh(tmp4, product_hi, yz_idx1);
2932 
2933   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2934   ror(rscratch2, rscratch2, 32);
2935 
2936   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2937   umulh(carry2, product_hi, yz_idx2);
2938 
2939   // propagate sum of both multiplications into carry:tmp4:tmp3
2940   adds(tmp3, tmp3, carry);
2941   adc(tmp4, tmp4, zr);
2942   adds(tmp3, tmp3, rscratch1);
2943   adcs(tmp4, tmp4, tmp);
2944   adc(carry, carry2, zr);
2945   adds(tmp4, tmp4, rscratch2);
2946   adc(carry, carry, zr);
2947 
2948   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2949   ror(tmp4, tmp4, 32);
2950   stp(tmp4, tmp3, Address(tmp6, 0));
2951 
2952   b(L_third_loop);
2953   bind (L_third_loop_exit);
2954 
2955   andw (idx, idx, 0x3);
2956   cbz(idx, L_post_third_loop_done);
2957 
2958   Label L_check_1;
2959   subsw(idx, idx, 2);
2960   br(Assembler::MI, L_check_1);
2961 
2962   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2963   ldr(yz_idx1, Address(rscratch1, 0));
2964   ror(yz_idx1, yz_idx1, 32);
2965   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2966   umulh(tmp4, product_hi, yz_idx1);
2967   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2968   ldr(yz_idx2, Address(rscratch1, 0));
2969   ror(yz_idx2, yz_idx2, 32);
2970 
2971   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2972 
2973   ror(tmp3, tmp3, 32);
2974   str(tmp3, Address(rscratch1, 0));
2975 
2976   bind (L_check_1);
2977 
2978   andw (idx, idx, 0x1);
2979   subsw(idx, idx, 1);
2980   br(Assembler::MI, L_post_third_loop_done);
2981   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2982   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
2983   umulh(carry2, tmp4, product_hi);
2984   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2985 
2986   add2_with_carry(carry2, tmp3, tmp4, carry);
2987 
2988   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2989   extr(carry, carry2, tmp3, 32);
2990 
2991   bind(L_post_third_loop_done);
2992 }
2993 
2994 /**
2995  * Code for BigInteger::multiplyToLen() intrinsic.
2996  *
2997  * r0: x
2998  * r1: xlen
2999  * r2: y
3000  * r3: ylen
3001  * r4: z
3002  * r5: zlen
3003  * r10: tmp1
3004  * r11: tmp2
3005  * r12: tmp3
3006  * r13: tmp4
3007  * r14: tmp5
3008  * r15: tmp6
3009  * r16: tmp7
3010  *
3011  */
3012 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3013                                      Register z, Register zlen,
3014                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3015                                      Register tmp5, Register tmp6, Register product_hi) {
3016 
3017   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3018 
3019   const Register idx = tmp1;
3020   const Register kdx = tmp2;
3021   const Register xstart = tmp3;
3022 
3023   const Register y_idx = tmp4;
3024   const Register carry = tmp5;
3025   const Register product  = xlen;
3026   const Register x_xstart = zlen;  // reuse register
3027 
3028   // First Loop.
3029   //
3030   //  final static long LONG_MASK = 0xffffffffL;
3031   //  int xstart = xlen - 1;
3032   //  int ystart = ylen - 1;
3033   //  long carry = 0;
3034   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3035   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3036   //    z[kdx] = (int)product;
3037   //    carry = product >>> 32;
3038   //  }
3039   //  z[xstart] = (int)carry;
3040   //
3041 
3042   movw(idx, ylen);      // idx = ylen;
3043   movw(kdx, zlen);      // kdx = xlen+ylen;
3044   mov(carry, zr);       // carry = 0;
3045 
3046   Label L_done;
3047 
3048   movw(xstart, xlen);
3049   subsw(xstart, xstart, 1);
3050   br(Assembler::MI, L_done);
3051 
3052   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3053 
3054   Label L_second_loop;
3055   cbzw(kdx, L_second_loop);
3056 
3057   Label L_carry;
3058   subw(kdx, kdx, 1);
3059   cbzw(kdx, L_carry);
3060 
3061   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3062   lsr(carry, carry, 32);
3063   subw(kdx, kdx, 1);
3064 
3065   bind(L_carry);
3066   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3067 
3068   // Second and third (nested) loops.
3069   //
3070   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3071   //   carry = 0;
3072   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3073   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3074   //                    (z[k] & LONG_MASK) + carry;
3075   //     z[k] = (int)product;
3076   //     carry = product >>> 32;
3077   //   }
3078   //   z[i] = (int)carry;
3079   // }
3080   //
3081   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3082 
3083   const Register jdx = tmp1;
3084 
3085   bind(L_second_loop);
3086   mov(carry, zr);                // carry = 0;
3087   movw(jdx, ylen);               // j = ystart+1
3088 
3089   subsw(xstart, xstart, 1);      // i = xstart-1;
3090   br(Assembler::MI, L_done);
3091 
3092   str(z, Address(pre(sp, -4 * wordSize)));
3093 
3094   Label L_last_x;
3095   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3096   subsw(xstart, xstart, 1);       // i = xstart-1;
3097   br(Assembler::MI, L_last_x);
3098 
3099   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3100   ldr(product_hi, Address(rscratch1));
3101   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3102 
3103   Label L_third_loop_prologue;
3104   bind(L_third_loop_prologue);
3105 
3106   str(ylen, Address(sp, wordSize));
3107   stp(x, xstart, Address(sp, 2 * wordSize));
3108   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3109                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3110   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3111   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3112 
3113   addw(tmp3, xlen, 1);
3114   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3115   subsw(tmp3, tmp3, 1);
3116   br(Assembler::MI, L_done);
3117 
3118   lsr(carry, carry, 32);
3119   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3120   b(L_second_loop);
3121 
3122   // The following infrequently executed code is placed outside the loops.
3123   bind(L_last_x);
3124   ldrw(product_hi, Address(x,  0));
3125   b(L_third_loop_prologue);
3126 
3127   bind(L_done);
3128 }
3129 
3130 // Code for BigInteger::mulAdd intrinsic
3131 // out     = r0
3132 // in      = r1
3133 // offset  = r2  (already out.length-offset)
3134 // len     = r3
3135 // k       = r4
3136 //
3137 // Pseudocode from the Java implementation:
3138 // carry = 0;
3139 // offset = out.length-offset - 1;
3140 // for (int j=len-1; j >= 0; j--) {
3141 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3142 //     out[offset--] = (int)product;
3143 //     carry = product >>> 32;
3144 // }
3145 // return (int)carry;
3146 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3147       Register len, Register k) {
3148     Label LOOP, END;
3149     // pre-loop
3150     cmp(len, zr); // cmp, not cbz/cbnz: the condition is used twice, giving fewer branches
3151     csel(out, zr, out, Assembler::EQ);
3152     br(Assembler::EQ, END);
3153     add(in, in, len, LSL, 2); // in[j+1] address
3154     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3155     mov(out, zr); // used to keep carry now
3156     BIND(LOOP);
3157     ldrw(rscratch1, Address(pre(in, -4)));
3158     madd(rscratch1, rscratch1, k, out);
3159     ldrw(rscratch2, Address(pre(offset, -4)));
3160     add(rscratch1, rscratch1, rscratch2);
3161     strw(rscratch1, Address(offset));
3162     lsr(out, rscratch1, 32);
3163     subs(len, len, 1);
3164     br(Assembler::NE, LOOP);
3165     BIND(END);
3166 }
3167 
3168 /**
3169  * Emits code to update CRC-32 with a byte value according to the constants in the table
3170  *
3171  * @param [in,out]crc   Register containing the crc.
3172  * @param [in]val       Register containing the byte to fold into the CRC.
3173  * @param [in]table     Register containing the table of crc constants.
3174  *
3175  * uint32_t crc;
3176  * val = crc_table[(val ^ crc) & 0xFF];
3177  * crc = val ^ (crc >> 8);
3178  *
3179  */
3180 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3181   eor(val, val, crc);
3182   andr(val, val, 0xff);
3183   ldrw(val, Address(table, val, Address::lsl(2)));
3184   eor(crc, val, crc, Assembler::LSR, 8);
3185 }
3186 
3187 /**
3188  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3189  *
3190  * @param [in,out]crc   Register containing the crc.
3191  * @param [in]v         Register containing the 32-bit value to fold into the CRC.
3192  * @param [in]table0    Register containing table 0 of crc constants.
3193  * @param [in]table1    Register containing table 1 of crc constants.
3194  * @param [in]table2    Register containing table 2 of crc constants.
3195  * @param [in]table3    Register containing table 3 of crc constants.
3196  *
3197  * uint32_t crc;
3198  *   v = crc ^ v
3199  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3200  *
3201  */
3202 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3203         Register table0, Register table1, Register table2, Register table3,
3204         bool upper) {
3205   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3206   uxtb(tmp, v);
3207   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3208   ubfx(tmp, v, 8, 8);
3209   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3210   eor(crc, crc, tmp);
3211   ubfx(tmp, v, 16, 8);
3212   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3213   eor(crc, crc, tmp);
3214   ubfx(tmp, v, 24, 8);
3215   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3216   eor(crc, crc, tmp);
3217 }
3218 
3219 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3220         Register len, Register tmp0, Register tmp1, Register tmp2,
3221         Register tmp3) {
3222     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3223     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
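         // The buffer is consumed in 64-byte blocks using the crc32x
         // instructions, with 32-, 4- and 1-byte tail loops below.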
3224 
3225     mvnw(crc, crc);
3226 
3227     subs(len, len, 128);
3228     br(Assembler::GE, CRC_by64_pre);
3229   BIND(CRC_less64);
3230     adds(len, len, 128-32);
3231     br(Assembler::GE, CRC_by32_loop);
3232   BIND(CRC_less32);
3233     adds(len, len, 32-4);
3234     br(Assembler::GE, CRC_by4_loop);
3235     adds(len, len, 4);
3236     br(Assembler::GT, CRC_by1_loop);
3237     b(L_exit);
3238 
3239   BIND(CRC_by32_loop);
3240     ldp(tmp0, tmp1, Address(post(buf, 16)));
3241     subs(len, len, 32);
3242     crc32x(crc, crc, tmp0);
3243     ldr(tmp2, Address(post(buf, 8)));
3244     crc32x(crc, crc, tmp1);
3245     ldr(tmp3, Address(post(buf, 8)));
3246     crc32x(crc, crc, tmp2);
3247     crc32x(crc, crc, tmp3);
3248     br(Assembler::GE, CRC_by32_loop);
3249     cmn(len, 32);
3250     br(Assembler::NE, CRC_less32);
3251     b(L_exit);
3252 
3253   BIND(CRC_by4_loop);
3254     ldrw(tmp0, Address(post(buf, 4)));
3255     subs(len, len, 4);
3256     crc32w(crc, crc, tmp0);
3257     br(Assembler::GE, CRC_by4_loop);
3258     adds(len, len, 4);
3259     br(Assembler::LE, L_exit);
3260   BIND(CRC_by1_loop);
3261     ldrb(tmp0, Address(post(buf, 1)));
3262     subs(len, len, 1);
3263     crc32b(crc, crc, tmp0);
3264     br(Assembler::GT, CRC_by1_loop);
3265     b(L_exit);
3266 
3267   BIND(CRC_by64_pre);
3268     sub(buf, buf, 8);
3269     ldp(tmp0, tmp1, Address(buf, 8));
3270     crc32x(crc, crc, tmp0);
3271     ldr(tmp2, Address(buf, 24));
3272     crc32x(crc, crc, tmp1);
3273     ldr(tmp3, Address(buf, 32));
3274     crc32x(crc, crc, tmp2);
3275     ldr(tmp0, Address(buf, 40));
3276     crc32x(crc, crc, tmp3);
3277     ldr(tmp1, Address(buf, 48));
3278     crc32x(crc, crc, tmp0);
3279     ldr(tmp2, Address(buf, 56));
3280     crc32x(crc, crc, tmp1);
3281     ldr(tmp3, Address(pre(buf, 64)));
3282 
3283     b(CRC_by64_loop);
3284 
3285     align(CodeEntryAlignment);
3286   BIND(CRC_by64_loop);
3287     subs(len, len, 64);
3288     crc32x(crc, crc, tmp2);
3289     ldr(tmp0, Address(buf, 8));
3290     crc32x(crc, crc, tmp3);
3291     ldr(tmp1, Address(buf, 16));
3292     crc32x(crc, crc, tmp0);
3293     ldr(tmp2, Address(buf, 24));
3294     crc32x(crc, crc, tmp1);
3295     ldr(tmp3, Address(buf, 32));
3296     crc32x(crc, crc, tmp2);
3297     ldr(tmp0, Address(buf, 40));
3298     crc32x(crc, crc, tmp3);
3299     ldr(tmp1, Address(buf, 48));
3300     crc32x(crc, crc, tmp0);
3301     ldr(tmp2, Address(buf, 56));
3302     crc32x(crc, crc, tmp1);
3303     ldr(tmp3, Address(pre(buf, 64)));
3304     br(Assembler::GE, CRC_by64_loop);
3305 
3306     // post-loop
3307     crc32x(crc, crc, tmp2);
3308     crc32x(crc, crc, tmp3);
3309 
3310     sub(len, len, 64);
3311     add(buf, buf, 8);
3312     cmn(len, 128);
3313     br(Assembler::NE, CRC_less64);
3314   BIND(L_exit);
3315     mvnw(crc, crc);
3316 }
3317 
3318 /**
3319  * @param crc   register containing existing CRC (32-bit)
3320  * @param buf   register pointing to input byte buffer (byte*)
3321  * @param len   register containing number of bytes
3322  * @param table0..table3 registers that will contain the addresses of the CRC tables
3323  * @param tmp, tmp2, tmp3  scratch registers
3324  */
3325 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3326         Register table0, Register table1, Register table2, Register table3,
3327         Register tmp, Register tmp2, Register tmp3) {
3328   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3329   unsigned long offset;
3330 
3331   if (UseCRC32) {
3332       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3333       return;
3334   }
3335 
3336     mvnw(crc, crc);
3337 
3338     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3339     if (offset) add(table0, table0, offset);
3340     add(table1, table0, 1*256*sizeof(juint));
3341     add(table2, table0, 2*256*sizeof(juint));
3342     add(table3, table0, 3*256*sizeof(juint));
3343 
3344   if (UseNeon) {
3345       cmp(len, (u1)64);
3346       br(Assembler::LT, L_by16);
3347       eor(v16, T16B, v16, v16);
3348 
3349     Label L_fold;
3350 
3351       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3352 
3353       ld1(v0, v1, T2D, post(buf, 32));
3354       ld1r(v4, T2D, post(tmp, 8));
3355       ld1r(v5, T2D, post(tmp, 8));
3356       ld1r(v6, T2D, post(tmp, 8));
3357       ld1r(v7, T2D, post(tmp, 8));
3358       mov(v16, T4S, 0, crc);
3359 
3360       eor(v0, T16B, v0, v16);
3361       sub(len, len, 64);
3362 
3363     BIND(L_fold);
3364       pmull(v22, T8H, v0, v5, T8B);
3365       pmull(v20, T8H, v0, v7, T8B);
3366       pmull(v23, T8H, v0, v4, T8B);
3367       pmull(v21, T8H, v0, v6, T8B);
3368 
3369       pmull2(v18, T8H, v0, v5, T16B);
3370       pmull2(v16, T8H, v0, v7, T16B);
3371       pmull2(v19, T8H, v0, v4, T16B);
3372       pmull2(v17, T8H, v0, v6, T16B);
3373 
3374       uzp1(v24, T8H, v20, v22);
3375       uzp2(v25, T8H, v20, v22);
3376       eor(v20, T16B, v24, v25);
3377 
3378       uzp1(v26, T8H, v16, v18);
3379       uzp2(v27, T8H, v16, v18);
3380       eor(v16, T16B, v26, v27);
3381 
3382       ushll2(v22, T4S, v20, T8H, 8);
3383       ushll(v20, T4S, v20, T4H, 8);
3384 
3385       ushll2(v18, T4S, v16, T8H, 8);
3386       ushll(v16, T4S, v16, T4H, 8);
3387 
3388       eor(v22, T16B, v23, v22);
3389       eor(v18, T16B, v19, v18);
3390       eor(v20, T16B, v21, v20);
3391       eor(v16, T16B, v17, v16);
3392 
3393       uzp1(v17, T2D, v16, v20);
3394       uzp2(v21, T2D, v16, v20);
3395       eor(v17, T16B, v17, v21);
3396 
3397       ushll2(v20, T2D, v17, T4S, 16);
3398       ushll(v16, T2D, v17, T2S, 16);
3399 
3400       eor(v20, T16B, v20, v22);
3401       eor(v16, T16B, v16, v18);
3402 
3403       uzp1(v17, T2D, v20, v16);
3404       uzp2(v21, T2D, v20, v16);
3405       eor(v28, T16B, v17, v21);
3406 
3407       pmull(v22, T8H, v1, v5, T8B);
3408       pmull(v20, T8H, v1, v7, T8B);
3409       pmull(v23, T8H, v1, v4, T8B);
3410       pmull(v21, T8H, v1, v6, T8B);
3411 
3412       pmull2(v18, T8H, v1, v5, T16B);
3413       pmull2(v16, T8H, v1, v7, T16B);
3414       pmull2(v19, T8H, v1, v4, T16B);
3415       pmull2(v17, T8H, v1, v6, T16B);
3416 
3417       ld1(v0, v1, T2D, post(buf, 32));
3418 
3419       uzp1(v24, T8H, v20, v22);
3420       uzp2(v25, T8H, v20, v22);
3421       eor(v20, T16B, v24, v25);
3422 
3423       uzp1(v26, T8H, v16, v18);
3424       uzp2(v27, T8H, v16, v18);
3425       eor(v16, T16B, v26, v27);
3426 
3427       ushll2(v22, T4S, v20, T8H, 8);
3428       ushll(v20, T4S, v20, T4H, 8);
3429 
3430       ushll2(v18, T4S, v16, T8H, 8);
3431       ushll(v16, T4S, v16, T4H, 8);
3432 
3433       eor(v22, T16B, v23, v22);
3434       eor(v18, T16B, v19, v18);
3435       eor(v20, T16B, v21, v20);
3436       eor(v16, T16B, v17, v16);
3437 
3438       uzp1(v17, T2D, v16, v20);
3439       uzp2(v21, T2D, v16, v20);
3440       eor(v16, T16B, v17, v21);
3441 
3442       ushll2(v20, T2D, v16, T4S, 16);
3443       ushll(v16, T2D, v16, T2S, 16);
3444 
3445       eor(v20, T16B, v22, v20);
3446       eor(v16, T16B, v16, v18);
3447 
3448       uzp1(v17, T2D, v20, v16);
3449       uzp2(v21, T2D, v20, v16);
3450       eor(v20, T16B, v17, v21);
3451 
3452       shl(v16, T2D, v28, 1);
3453       shl(v17, T2D, v20, 1);
3454 
3455       eor(v0, T16B, v0, v16);
3456       eor(v1, T16B, v1, v17);
3457 
3458       subs(len, len, 32);
3459       br(Assembler::GE, L_fold);
3460 
3461       mov(crc, 0);
3462       mov(tmp, v0, T1D, 0);
3463       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3464       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3465       mov(tmp, v0, T1D, 1);
3466       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3467       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3468       mov(tmp, v1, T1D, 0);
3469       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3470       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3471       mov(tmp, v1, T1D, 1);
3472       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3473       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3474 
3475       add(len, len, 32);
3476   }
3477 
3478   BIND(L_by16);
3479     subs(len, len, 16);
3480     br(Assembler::GE, L_by16_loop);
3481     adds(len, len, 16-4);
3482     br(Assembler::GE, L_by4_loop);
3483     adds(len, len, 4);
3484     br(Assembler::GT, L_by1_loop);
3485     b(L_exit);
3486 
3487   BIND(L_by4_loop);
3488     ldrw(tmp, Address(post(buf, 4)));
3489     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3490     subs(len, len, 4);
3491     br(Assembler::GE, L_by4_loop);
3492     adds(len, len, 4);
3493     br(Assembler::LE, L_exit);
3494   BIND(L_by1_loop);
3495     subs(len, len, 1);
3496     ldrb(tmp, Address(post(buf, 1)));
3497     update_byte_crc32(crc, tmp, table0);
3498     br(Assembler::GT, L_by1_loop);
3499     b(L_exit);
3500 
3501     align(CodeEntryAlignment);
3502   BIND(L_by16_loop);
3503     subs(len, len, 16);
3504     ldp(tmp, tmp3, Address(post(buf, 16)));
3505     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3506     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3507     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3508     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3509     br(Assembler::GE, L_by16_loop);
3510     adds(len, len, 16-4);
3511     br(Assembler::GE, L_by4_loop);
3512     adds(len, len, 4);
3513     br(Assembler::GT, L_by1_loop);
3514   BIND(L_exit);
3515     mvnw(crc, crc);
3516 }
3517 
3518 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3519         Register len, Register tmp0, Register tmp1, Register tmp2,
3520         Register tmp3) {
3521     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3522     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3523 
3524     subs(len, len, 128);
3525     br(Assembler::GE, CRC_by64_pre);
3526   BIND(CRC_less64);
3527     adds(len, len, 128-32);
3528     br(Assembler::GE, CRC_by32_loop);
3529   BIND(CRC_less32);
3530     adds(len, len, 32-4);
3531     br(Assembler::GE, CRC_by4_loop);
3532     adds(len, len, 4);
3533     br(Assembler::GT, CRC_by1_loop);
3534     b(L_exit);
3535 
3536   BIND(CRC_by32_loop);
3537     ldp(tmp0, tmp1, Address(post(buf, 16)));
3538     subs(len, len, 32);
3539     crc32cx(crc, crc, tmp0);
3540     ldr(tmp2, Address(post(buf, 8)));
3541     crc32cx(crc, crc, tmp1);
3542     ldr(tmp3, Address(post(buf, 8)));
3543     crc32cx(crc, crc, tmp2);
3544     crc32cx(crc, crc, tmp3);
3545     br(Assembler::GE, CRC_by32_loop);
3546     cmn(len, 32);
3547     br(Assembler::NE, CRC_less32);
3548     b(L_exit);
3549 
3550   BIND(CRC_by4_loop);
3551     ldrw(tmp0, Address(post(buf, 4)));
3552     subs(len, len, 4);
3553     crc32cw(crc, crc, tmp0);
3554     br(Assembler::GE, CRC_by4_loop);
3555     adds(len, len, 4);
3556     br(Assembler::LE, L_exit);
3557   BIND(CRC_by1_loop);
3558     ldrb(tmp0, Address(post(buf, 1)));
3559     subs(len, len, 1);
3560     crc32cb(crc, crc, tmp0);
3561     br(Assembler::GT, CRC_by1_loop);
3562     b(L_exit);
3563 
3564   BIND(CRC_by64_pre);
3565     sub(buf, buf, 8);
3566     ldp(tmp0, tmp1, Address(buf, 8));
3567     crc32cx(crc, crc, tmp0);
3568     ldr(tmp2, Address(buf, 24));
3569     crc32cx(crc, crc, tmp1);
3570     ldr(tmp3, Address(buf, 32));
3571     crc32cx(crc, crc, tmp2);
3572     ldr(tmp0, Address(buf, 40));
3573     crc32cx(crc, crc, tmp3);
3574     ldr(tmp1, Address(buf, 48));
3575     crc32cx(crc, crc, tmp0);
3576     ldr(tmp2, Address(buf, 56));
3577     crc32cx(crc, crc, tmp1);
3578     ldr(tmp3, Address(pre(buf, 64)));
3579 
3580     b(CRC_by64_loop);
3581 
3582     align(CodeEntryAlignment);
3583   BIND(CRC_by64_loop);
3584     subs(len, len, 64);
3585     crc32cx(crc, crc, tmp2);
3586     ldr(tmp0, Address(buf, 8));
3587     crc32cx(crc, crc, tmp3);
3588     ldr(tmp1, Address(buf, 16));
3589     crc32cx(crc, crc, tmp0);
3590     ldr(tmp2, Address(buf, 24));
3591     crc32cx(crc, crc, tmp1);
3592     ldr(tmp3, Address(buf, 32));
3593     crc32cx(crc, crc, tmp2);
3594     ldr(tmp0, Address(buf, 40));
3595     crc32cx(crc, crc, tmp3);
3596     ldr(tmp1, Address(buf, 48));
3597     crc32cx(crc, crc, tmp0);
3598     ldr(tmp2, Address(buf, 56));
3599     crc32cx(crc, crc, tmp1);
3600     ldr(tmp3, Address(pre(buf, 64)));
3601     br(Assembler::GE, CRC_by64_loop);
3602 
3603     // post-loop
3604     crc32cx(crc, crc, tmp2);
3605     crc32cx(crc, crc, tmp3);
3606 
3607     sub(len, len, 64);
3608     add(buf, buf, 8);
3609     cmn(len, 128);
3610     br(Assembler::NE, CRC_less64);
3611   BIND(L_exit);
3612 }
3613 
3614 /**
3615  * @param crc   register containing existing CRC (32-bit)
3616  * @param buf   register pointing to input byte buffer (byte*)
3617  * @param len   register containing number of bytes
3618  * @param table0..table3 registers that will contain the addresses of the CRC tables
3619  * @param tmp, tmp2, tmp3  scratch registers
3620  */
3621 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3622         Register table0, Register table1, Register table2, Register table3,
3623         Register tmp, Register tmp2, Register tmp3) {
3624   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3625 }
3626 
3627 
3628 SkipIfEqual::SkipIfEqual(
3629     MacroAssembler* masm, const bool* flag_addr, bool value) {
3630   _masm = masm;
3631   unsigned long offset;
3632   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3633   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3634   _masm->cbzw(rscratch1, _label);
3635 }
3636 
3637 SkipIfEqual::~SkipIfEqual() {
3638   _masm->bind(_label);
3639 }
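
     // Typical use (illustrative; mirrors how HotSpot stubs use this helper):
     //   {
     //     SkipIfEqual skip(masm, &DTraceAllocProbes, false);
     //     // ... code emitted here is skipped when the flag is false ...
     //   } // ~SkipIfEqual binds the label, so execution rejoins here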
3640 
3641 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3642   Address adr;
3643   switch(dst.getMode()) {
3644   case Address::base_plus_offset:
3645     // This is the expected mode, although we allow all the other
3646     // forms below.
3647     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3648     break;
3649   default:
3650     lea(rscratch2, dst);
3651     adr = Address(rscratch2);
3652     break;
3653   }
3654   ldr(rscratch1, adr);
3655   add(rscratch1, rscratch1, src);
3656   str(rscratch1, adr);
3657 }
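
     // Illustrative use (the offset is hypothetical): addptr(Address(rthread, 0x100), 1)
     // performs a non-atomic 64-bit read-modify-write of the word at
     // rthread+0x100, clobbering rscratch1 and rscratch2.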
3658 
3659 void MacroAssembler::cmpptr(Register src1, Address src2) {
3660   unsigned long offset;
3661   adrp(rscratch1, src2, offset);
3662   ldr(rscratch1, Address(rscratch1, offset));
3663   cmp(src1, rscratch1);
3664 }
3665 
3666 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3667   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3668   bs->obj_equals(this, obj1, obj2);
3669 }
3670 
3671 void MacroAssembler::load_klass(Register dst, Register src) {
3672   if (UseCompressedClassPointers) {
3673     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3674     decode_klass_not_null(dst);
3675   } else {
3676     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3677   }
3678 }
3679 
3680 // ((OopHandle)result).resolve();
3681 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3682   // OopHandle::resolve is an indirection.
3683   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3684 }
3685 
3686 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3687   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3688   ldr(dst, Address(rmethod, Method::const_offset()));
3689   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3690   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3691   ldr(dst, Address(dst, mirror_offset));
3692   resolve_oop_handle(dst, tmp);
3693 }
3694 
3695 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3696   if (UseCompressedClassPointers) {
3697     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3698     if (Universe::narrow_klass_base() == NULL) {
3699       cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
3700       return;
3701     } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3702                && Universe::narrow_klass_shift() == 0) {
3703       // Only the bottom 32 bits matter
3704       cmpw(trial_klass, tmp);
3705       return;
3706     }
3707     decode_klass_not_null(tmp);
3708   } else {
3709     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3710   }
3711   cmp(trial_klass, tmp);
3712 }
3713 
3714 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3715   load_klass(dst, src);
3716   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3717 }
3718 
3719 void MacroAssembler::store_klass(Register dst, Register src) {
3720   // FIXME: Should this be a store release?  Concurrent GCs assume the
3721   // klass length is valid if the klass field is not null.
3722   if (UseCompressedClassPointers) {
3723     encode_klass_not_null(src);
3724     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3725   } else {
3726     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3727   }
3728 }
3729 
3730 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3731   if (UseCompressedClassPointers) {
3732     // Store to klass gap in destination
3733     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3734   }
3735 }
3736 
3737 // Algorithm must match CompressedOops::encode.
3738 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3739 #ifdef ASSERT
3740   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3741 #endif
3742   verify_oop(s, "broken oop in encode_heap_oop");
3743   if (Universe::narrow_oop_base() == NULL) {
3744     if (Universe::narrow_oop_shift() != 0) {
3745       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3746       lsr(d, s, LogMinObjAlignmentInBytes);
3747     } else {
3748       mov(d, s);
3749     }
3750   } else {
3751     subs(d, s, rheapbase);
3752     csel(d, d, zr, Assembler::HS);
3753     lsr(d, d, LogMinObjAlignmentInBytes);
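         // Branch-free encoding (a sketch of the effect): any s below the heap
         // base (only NULL) encodes to zero; otherwise
         // d = (s - heap_base) >> LogMinObjAlignmentInBytes.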
3754 
3755     /*  Old algorithm: is this any worse?
3756     Label nonnull;
3757     cbnz(r, nonnull);
3758     sub(r, r, rheapbase);
3759     bind(nonnull);
3760     lsr(r, r, LogMinObjAlignmentInBytes);
3761     */
3762   }
3763 }
3764 
3765 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3766 #ifdef ASSERT
3767   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3768   if (CheckCompressedOops) {
3769     Label ok;
3770     cbnz(r, ok);
3771     stop("null oop passed to encode_heap_oop_not_null");
3772     bind(ok);
3773   }
3774 #endif
3775   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3776   if (Universe::narrow_oop_base() != NULL) {
3777     sub(r, r, rheapbase);
3778   }
3779   if (Universe::narrow_oop_shift() != 0) {
3780     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3781     lsr(r, r, LogMinObjAlignmentInBytes);
3782   }
3783 }
3784 
3785 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3786 #ifdef ASSERT
3787   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3788   if (CheckCompressedOops) {
3789     Label ok;
3790     cbnz(src, ok);
3791     stop("null oop passed to encode_heap_oop_not_null2");
3792     bind(ok);
3793   }
3794 #endif
3795   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3796 
3797   Register data = src;
3798   if (Universe::narrow_oop_base() != NULL) {
3799     sub(dst, src, rheapbase);
3800     data = dst;
3801   }
3802   if (Universe::narrow_oop_shift() != 0) {
3803     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3804     lsr(dst, data, LogMinObjAlignmentInBytes);
3805     data = dst;
3806   }
3807   if (data == src)
3808     mov(dst, src);
3809 }
3810 
3811 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3812 #ifdef ASSERT
3813   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3814 #endif
3815   if (Universe::narrow_oop_base() == NULL) {
3816     if (Universe::narrow_oop_shift() != 0 || d != s) {
3817       lsl(d, s, Universe::narrow_oop_shift());
3818     }
3819   } else {
3820     Label done;
3821     if (d != s)
3822       mov(d, s);
3823     cbz(s, done);
3824     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3825     bind(done);
3826   }
3827   verify_oop(d, "broken oop in decode_heap_oop");
3828 }
3829 
3830 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3831   assert (UseCompressedOops, "should only be used for compressed headers");
3832   assert (Universe::heap() != NULL, "java heap should be initialized");
3833   // Cannot assert, unverified entry point counts instructions (see .ad file)
3834   // vtableStubs also counts instructions in pd_code_size_limit.
3835   // Also do not verify_oop as this is called by verify_oop.
3836   if (Universe::narrow_oop_shift() != 0) {
3837     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3838     if (Universe::narrow_oop_base() != NULL) {
3839       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3840     } else {
3841       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3842     }
3843   } else {
3844     assert (Universe::narrow_oop_base() == NULL, "sanity");
3845   }
3846 }
3847 
3848 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3849   assert (UseCompressedOops, "should only be used for compressed headers");
3850   assert (Universe::heap() != NULL, "java heap should be initialized");
3851   // Cannot assert, unverified entry point counts instructions (see .ad file)
3852   // vtableStubs also counts instructions in pd_code_size_limit.
3853   // Also do not verify_oop as this is called by verify_oop.
3854   if (Universe::narrow_oop_shift() != 0) {
3855     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3856     if (Universe::narrow_oop_base() != NULL) {
3857       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3858     } else {
3859       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3860     }
3861   } else {
3862     assert (Universe::narrow_oop_base() == NULL, "sanity");
3863     if (dst != src) {
3864       mov(dst, src);
3865     }
3866   }
3867 }
3868 
3869 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3870   if (Universe::narrow_klass_base() == NULL) {
3871     if (Universe::narrow_klass_shift() != 0) {
3872       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3873       lsr(dst, src, LogKlassAlignmentInBytes);
3874     } else {
3875       if (dst != src) mov(dst, src);
3876     }
3877     return;
3878   }
3879 
3880   if (use_XOR_for_compressed_class_base) {
3881     if (Universe::narrow_klass_shift() != 0) {
3882       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3883       lsr(dst, dst, LogKlassAlignmentInBytes);
3884     } else {
3885       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3886     }
3887     return;
3888   }
3889 
3890   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3891       && Universe::narrow_klass_shift() == 0) {
3892     movw(dst, src);
3893     return;
3894   }
3895 
3896 #ifdef ASSERT
3897   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3898 #endif
3899 
3900   Register rbase = dst;
3901   if (dst == src) rbase = rheapbase;
3902   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3903   sub(dst, src, rbase);
3904   if (Universe::narrow_klass_shift() != 0) {
3905     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3906     lsr(dst, dst, LogKlassAlignmentInBytes);
3907   }
3908   if (dst == src) reinit_heapbase();
3909 }
3910 
3911 void MacroAssembler::encode_klass_not_null(Register r) {
3912   encode_klass_not_null(r, r);
3913 }
3914 
3915 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3916   Register rbase = dst;
3917   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3918 
3919   if (Universe::narrow_klass_base() == NULL) {
3920     if (Universe::narrow_klass_shift() != 0) {
3921       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3922       lsl(dst, src, LogKlassAlignmentInBytes);
3923     } else {
3924       if (dst != src) mov(dst, src);
3925     }
3926     return;
3927   }
3928 
3929   if (use_XOR_for_compressed_class_base) {
3930     if (Universe::narrow_klass_shift() != 0) {
3931       lsl(dst, src, LogKlassAlignmentInBytes);
3932       eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
3933     } else {
3934       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3935     }
3936     return;
3937   }
3938 
3939   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3940       && Universe::narrow_klass_shift() == 0) {
3941     if (dst != src)
3942       movw(dst, src);
3943     movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
3944     return;
3945   }
3946 
3947   // Cannot assert, unverified entry point counts instructions (see .ad file)
3948   // vtableStubs also counts instructions in pd_code_size_limit.
3949   // Also do not verify_oop as this is called by verify_oop.
3950   if (dst == src) rbase = rheapbase;
3951   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3952   if (Universe::narrow_klass_shift() != 0) {
3953     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3954     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
3955   } else {
3956     add(dst, rbase, src);
3957   }
3958   if (dst == src) reinit_heapbase();
3959 }
3960 
3961 void  MacroAssembler::decode_klass_not_null(Register r) {
3962   decode_klass_not_null(r, r);
3963 }
3964 
3965 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3966 #ifdef ASSERT
3967   {
3968     ThreadInVMfromUnknown tiv;
3969     assert (UseCompressedOops, "should only be used for compressed oops");
3970     assert (Universe::heap() != NULL, "java heap should be initialized");
3971     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3972     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3973   }
3974 #endif
3975   int oop_index = oop_recorder()->find_index(obj);
3976   InstructionMark im(this);
3977   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3978   code_section()->relocate(inst_mark(), rspec);
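       // The 0xdead/0xbeef payload is a placeholder; the oop relocation
       // recorded above lets the runtime patch in the real narrow oop later.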
3979   movz(dst, 0xDEAD, 16);
3980   movk(dst, 0xBEEF);
3981 }
3982 
3983 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3984   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3985   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3986   int index = oop_recorder()->find_index(k);
3987   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
3988 
3989   InstructionMark im(this);
3990   RelocationHolder rspec = metadata_Relocation::spec(index);
3991   code_section()->relocate(inst_mark(), rspec);
3992   narrowKlass nk = Klass::encode_klass(k);
3993   movz(dst, (nk >> 16), 16);
3994   movk(dst, nk & 0xffff);
3995 }
3996 
3997 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
3998                                     Register dst, Address src,
3999                                     Register tmp1, Register thread_tmp) {
4000   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4001   decorators = AccessInternal::decorator_fixup(decorators);
4002   bool as_raw = (decorators & AS_RAW) != 0;
4003   if (as_raw) {
4004     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4005   } else {
4006     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4007   }
4008 }
4009 
4010 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4011                                      Address dst, Register src,
4012                                      Register tmp1, Register thread_tmp) {
4013   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4014   decorators = AccessInternal::decorator_fixup(decorators);
4015   bool as_raw = (decorators & AS_RAW) != 0;
4016   if (as_raw) {
4017     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4018   } else {
4019     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4020   }
4021 }
4022 
4023 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
4024   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
4025   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
4026     decorators |= ACCESS_READ | ACCESS_WRITE;
4027   }
4028   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4029   return bs->resolve(this, decorators, obj);
4030 }
4031 
4032 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4033                                    Register thread_tmp, DecoratorSet decorators) {
4034   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4035 }
4036 
4037 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4038                                             Register thread_tmp, DecoratorSet decorators) {
4039   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4040 }
4041 
4042 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4043                                     Register thread_tmp, DecoratorSet decorators) {
4044   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4045 }
4046 
4047 // Used for storing NULLs.
4048 void MacroAssembler::store_heap_oop_null(Address dst) {
4049   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4050 }
4051 
4052 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4053   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4054   int index = oop_recorder()->allocate_metadata_index(obj);
4055   RelocationHolder rspec = metadata_Relocation::spec(index);
4056   return Address((address)obj, rspec);
4057 }
4058 
4059 // Move an oop into a register.  immediate is true if we want
4060 // immediate instructions, i.e. we are not going to patch this
4061 // instruction while the code is being executed by another thread.  In
4062 // that case we can use move immediates rather than the constant pool.
4063 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4064   int oop_index;
4065   if (obj == NULL) {
4066     oop_index = oop_recorder()->allocate_oop_index(obj);
4067   } else {
4068 #ifdef ASSERT
4069     {
4070       ThreadInVMfromUnknown tiv;
4071       assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
4072     }
4073 #endif
4074     oop_index = oop_recorder()->find_index(obj);
4075   }
4076   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4077   if (! immediate) {
4078     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4079     ldr_constant(dst, Address(dummy, rspec));
4080   } else
4081     mov(dst, Address((address)obj, rspec));
4082 }
4083 
4084 // Move a metadata address into a register.
4085 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4086   int oop_index;
4087   if (obj == NULL) {
4088     oop_index = oop_recorder()->allocate_metadata_index(obj);
4089   } else {
4090     oop_index = oop_recorder()->find_index(obj);
4091   }
4092   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4093   mov(dst, Address((address)obj, rspec));
4094 }
4095 
4096 Address MacroAssembler::constant_oop_address(jobject obj) {
4097 #ifdef ASSERT
4098   {
4099     ThreadInVMfromUnknown tiv;
4100     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4101     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
4102   }
4103 #endif
4104   int oop_index = oop_recorder()->find_index(obj);
4105   return Address((address)obj, oop_Relocation::spec(oop_index));
4106 }
4107 
4108 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4109 void MacroAssembler::tlab_allocate(Register obj,
4110                                    Register var_size_in_bytes,
4111                                    int con_size_in_bytes,
4112                                    Register t1,
4113                                    Register t2,
4114                                    Label& slow_case) {
4115   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4116   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4117 }
4118 
4119 // Defines obj, preserves var_size_in_bytes
4120 void MacroAssembler::eden_allocate(Register obj,
4121                                    Register var_size_in_bytes,
4122                                    int con_size_in_bytes,
4123                                    Register t1,
4124                                    Label& slow_case) {
4125   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4126   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4127 }
4128 
4129 // Zero words; len is in bytes
4130 // Destroys all registers except addr
4131 // len must be a nonzero multiple of wordSize
4132 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4133   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4134 
4135 #ifdef ASSERT
4136   { Label L;
4137     tst(len, BytesPerWord - 1);
4138     br(Assembler::EQ, L);
4139     stop("len is not a multiple of BytesPerWord");
4140     bind(L);
4141   }
4142 #endif
4143 
4144 #ifndef PRODUCT
4145   block_comment("zero memory");
4146 #endif
4147 
4148   Label loop;
4149   Label entry;
4150 
4151 //  Algorithm:
4152 //
4153 //    scratch1 = cnt & 7;
4154 //    cnt -= scratch1;
4155 //    p += scratch1;
4156 //    switch (scratch1) {
4157 //      do {
4158 //        cnt -= 8;
4159 //          p[-8] = 0;
4160 //        case 7:
4161 //          p[-7] = 0;
4162 //        case 6:
4163 //          p[-6] = 0;
4164 //          // ...
4165 //        case 1:
4166 //          p[-1] = 0;
4167 //        case 0:
4168 //          p += 8;
4169 //      } while (cnt);
4170 //    }
4171 
4172   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4173 
4174   lsr(len, len, LogBytesPerWord);
4175   andr(rscratch1, len, unroll - 1);  // rscratch1 = cnt % unroll
4176   sub(len, len, rscratch1);      // cnt -= cnt % unroll
4177   // t1 always points to the end of the region we're about to zero
4178   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
4179   adr(rscratch2, entry);
4180   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4181   br(rscratch2);
4182   bind(loop);
4183   sub(len, len, unroll);
4184   for (int i = -unroll; i < 0; i++)
4185     Assembler::str(zr, Address(t1, i * wordSize));
4186   bind(entry);
4187   add(t1, t1, unroll * wordSize);
4188   cbnz(len, loop);
4189 }
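
     // Illustrative trace (not generated code): for len == 88 bytes (11 words),
     // rscratch1 = 11 & 7 = 3, so the computed branch enters the unrolled body
     // three stores before "entry", zeroing words 0..2; the remaining 8 words
     // are cleared by one full pass of the loop.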
4190 
4191 void MacroAssembler::verify_tlab() {
4192 #ifdef ASSERT
4193   if (UseTLAB && VerifyOops) {
4194     Label next, ok;
4195 
4196     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4197 
4198     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4199     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4200     cmp(rscratch2, rscratch1);
4201     br(Assembler::HS, next);
4202     STOP("assert(top >= start)");
4203     should_not_reach_here();
4204 
4205     bind(next);
4206     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4207     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4208     cmp(rscratch2, rscratch1);
4209     br(Assembler::HS, ok);
4210     STOP("assert(top <= end)");
4211     should_not_reach_here();
4212 
4213     bind(ok);
4214     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4215   }
4216 #endif
4217 }
4218 
4219 // Writes to stack successive pages until offset reached to check for
4220 // stack overflow + shadow pages.  This clobbers tmp.
4221 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4222   assert_different_registers(tmp, size, rscratch1);
4223   mov(tmp, sp);
4224   // Bang stack for total size given plus shadow page size.
4225   // Bang one page at a time because large size can bang beyond yellow and
4226   // red zones.
4227   Label loop;
4228   mov(rscratch1, os::vm_page_size());
4229   bind(loop);
4230   lea(tmp, Address(tmp, -os::vm_page_size()));
4231   subsw(size, size, rscratch1);
4232   str(size, Address(tmp));
4233   br(Assembler::GT, loop);
4234 
4235   // Bang down shadow pages too.
4236   // At this point, (tmp-0) is the last address touched, so don't
4237   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4238   // was post-decremented.)  Skip that address and instead
4239   // touch a few more pages below it.  N.B.  It is important to touch all
4240   // the way down to and including i=StackShadowPages.
4241   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
4242     // This could be any sized move, but since it can serve as a debugging
4243     // crumb, the bigger the better.
4244     lea(tmp, Address(tmp, -os::vm_page_size()));
4245     str(size, Address(tmp));
4246   }
4247 }
4248 
4249 
4250 // Move the address of the polling page into dest.
4251 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4252   if (SafepointMechanism::uses_thread_local_poll()) {
4253     ldr(dest, Address(rthread, Thread::polling_page_offset()));
4254   } else {
4255     unsigned long off;
4256     adrp(dest, Address(page, rtype), off);
4257     assert(off == 0, "polling page must be page aligned");
4258   }
4259 }
4260 
4261 // Move the address of the polling page into r, then read the polling
4262 // page.
4263 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4264   get_polling_page(r, page, rtype);
4265   return read_polling_page(r, rtype);
4266 }
4267 
4268 // Read the polling page.  The address of the polling page must
4269 // already be in r.
4270 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4271   InstructionMark im(this);
4272   code_section()->relocate(inst_mark(), rtype);
4273   ldrw(zr, Address(r, 0));
4274   return inst_mark();
4275 }
4276 
4277 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4278   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4279   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4280   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4281   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4282   long offset_low = dest_page - low_page;
4283   long offset_high = dest_page - high_page;
4284 
4285   assert(is_valid_AArch64_address(dest.target()), "bad address");
4286   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4287 
4288   InstructionMark im(this);
4289   code_section()->relocate(inst_mark(), dest.rspec());
4290   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4291   // the code cache so that if it is relocated we know it will still reach
4292   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4293     _adrp(reg1, dest.target());
4294   } else {
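         // Out of direct adrp range.  Materialize the page address in two
         // steps: adrp to an address that shares dest's low 32 bits but
         // borrows bits 32..47 from the current pc (always within +-4G of
         // pc, hence reachable), then patch bits 32..47 with a movk.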
4295     unsigned long target = (unsigned long)dest.target();
4296     unsigned long adrp_target
4297       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4298 
4299     _adrp(reg1, (address)adrp_target);
4300     movk(reg1, target >> 32, 32);
4301   }
4302   byte_offset = (unsigned long)dest.target() & 0xfff;
4303 }
4304 
4305 void MacroAssembler::load_byte_map_base(Register reg) {
4306   jbyte *byte_map_base =
4307     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4308 
4309   if (is_valid_AArch64_address((address)byte_map_base)) {
4310     // Strictly speaking the byte_map_base isn't an address at all,
4311     // and it might even be negative.
4312     unsigned long offset;
4313     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4314     // We expect offset to be zero with most collectors.
4315     if (offset != 0) {
4316       add(reg, reg, offset);
4317     }
4318   } else {
4319     mov(reg, (uint64_t)byte_map_base);
4320   }
4321 }
4322 
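     // The split below reflects instruction-encoding limits: an stp scaled
     // offset is a signed 7-bit immediate, so framesize - 2 * wordSize must
     // stay below 1 << 9; add/sub take a 12-bit unsigned immediate, good
     // below 1 << 12; anything larger goes through rscratch1.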
4323 void MacroAssembler::build_frame(int framesize) {
4324   assert(framesize > 0, "framesize must be > 0");
4325   if (framesize < ((1 << 9) + 2 * wordSize)) {
4326     sub(sp, sp, framesize);
4327     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4328     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4329   } else {
4330     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4331     if (PreserveFramePointer) mov(rfp, sp);
4332     if (framesize < ((1 << 12) + 2 * wordSize))
4333       sub(sp, sp, framesize - 2 * wordSize);
4334     else {
4335       mov(rscratch1, framesize - 2 * wordSize);
4336       sub(sp, sp, rscratch1);
4337     }
4338   }
4339 }
4340 
4341 void MacroAssembler::remove_frame(int framesize) {
4342   assert(framesize > 0, "framesize must be > 0");
4343   if (framesize < ((1 << 9) + 2 * wordSize)) {
4344     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4345     add(sp, sp, framesize);
4346   } else {
4347     if (framesize < ((1 << 12) + 2 * wordSize))
4348       add(sp, sp, framesize - 2 * wordSize);
4349     else {
4350       mov(rscratch1, framesize - 2 * wordSize);
4351       add(sp, sp, rscratch1);
4352     }
4353     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4354   }
4355 }
4356 
4357 #ifdef COMPILER2
4358 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4359 
4360 // Search for str1 in str2 and return index or -1
4361 void MacroAssembler::string_indexof(Register str2, Register str1,
4362                                     Register cnt2, Register cnt1,
4363                                     Register tmp1, Register tmp2,
4364                                     Register tmp3, Register tmp4,
4365                                     Register tmp5, Register tmp6,
4366                                     int icnt1, Register result, int ae) {
4367   // NOTE: tmp5 and tmp6 can be zr depending on the specific method version
4368   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4369 
4370   Register ch1 = rscratch1;
4371   Register ch2 = rscratch2;
4372   Register cnt1tmp = tmp1;
4373   Register cnt2tmp = tmp2;
4374   Register cnt1_neg = cnt1;
4375   Register cnt2_neg = cnt2;
4376   Register result_tmp = tmp4;
4377 
4378   bool isL = ae == StrIntrinsicNode::LL;
4379 
4380   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4381   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4382   int str1_chr_shift = str1_isL ? 0:1;
4383   int str2_chr_shift = str2_isL ? 0:1;
4384   int str1_chr_size = str1_isL ? 1:2;
4385   int str2_chr_size = str2_isL ? 1:2;
4386   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4387                                       (chr_insn)&MacroAssembler::ldrh;
4388   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4389                                       (chr_insn)&MacroAssembler::ldrh;
4390   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4391   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4392 
4393   // Note, inline_string_indexOf() generates checks:
4394   // if (substr.count > string.count) return -1;
4395   // if (substr.count == 0) return 0;
4396 
4397   // We have two strings, a source string in str2, cnt2 and a pattern string
4398 // in str1, cnt1. Find the first occurrence of the pattern in the source or return -1.
4399 
4400 // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
4401   // With a small pattern and source we use linear scan.
4402 
4403   if (icnt1 == -1) {
4404     sub(result_tmp, cnt2, cnt1);
4405     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4406     br(LT, LINEARSEARCH);
4407     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
4408     subs(zr, cnt1, 256);
4409     lsr(tmp1, cnt2, 2);
4410     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
4411     br(GE, LINEARSTUB);
4412   }
4413 
4414 // The Boyer-Moore algorithm is based on the description here:
4415 //
4416 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4417 //
4418 // This describes an algorithm with 2 shift rules: the 'Bad Character' rule
4419 // and the 'Good Suffix' rule.
4420 //
4421 // These rules are essentially heuristics for how far we can shift the
4422 // pattern along the search string.
4423 //
4424 // The implementation here uses the 'Bad Character' rule only because of the
4425 // complexity of initialisation for the 'Good Suffix' rule.
4426 //
4427 // This is also known as the Boyer-Moore-Horspool algorithm:
4428 //
4429 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4430 //
4431 // This particular implementation has a few Java-specific optimizations.
4432 //
4433 // #define ASIZE 256
4434 //
4435 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4436 //       int i, j;
4437 //       unsigned c;
4438 //       unsigned char bc[ASIZE];
4439 //
4440 //       /* Preprocessing */
4441 //       for (i = 0; i < ASIZE; ++i)
4442 //          bc[i] = m;
4443 //       for (i = 0; i < m - 1; ) {
4444 //          c = x[i];
4445 //          ++i;
4446 //          // c < 256 for Latin1 string, so no need for a branch
4447 //          #ifdef PATTERN_STRING_IS_LATIN1
4448 //          bc[c] = m - i;
4449 //          #else
4450 //          if (c < ASIZE) bc[c] = m - i;
4451 //          #endif
4452 //       }
4453 //
4454 //       /* Searching */
4455 //       j = 0;
4456 //       while (j <= n - m) {
4457 //          c = y[j+m-1];
4458 //          i = m - 1;
4459 //          while (i >= 0 && x[i] == y[i + j]) --i;
4460 //          if (i < 0) return j;
4461 //          // c < 256 for Latin1 string, so no need for a branch
4462 //          #ifdef SOURCE_STRING_IS_LATIN1
4463 //          // LL case: (c < 256) is always true, so the branch is removed
4464 //          j += bc[y[j+m-1]];
4465 //          #endif
4466 //          #ifndef PATTERN_STRING_IS_UTF
4467 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
4468 //          if (c < ASIZE)
4469 //            j += bc[y[j+m-1]];
4470 //          else
4471 //            j += 1
4472 //          #endif
4473 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
4474 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
4475 //          if (c < ASIZE)
4476 //            j += bc[y[j+m-1]];
4477 //          else
4478 //            j += m
4479 //          #endif
4480 //       }
4481 //    }
4482 
4483   if (icnt1 == -1) {
4484     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4485         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4486     Register cnt1end = tmp2;
4487     Register str2end = cnt2;
4488     Register skipch = tmp2;
4489 
4490     // str1 length is >= 8, so we can read at least one register for the cases
4491     // when UTF->Latin1 conversion is not needed (8 LL or 4 UU chars), and half
4492     // a register for the UL case. We re-read the last character in the inner
4493     // pre-loop code so that the outer pre-loop needs only a single load
4494     const int firstStep = isL ? 7 : 3;
4495 
4496     const int ASIZE = 256;
4497     const int STORED_BYTES = 32; // number of bytes stored per instruction
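     // v0 was filled with cnt1 in every byte lane by the dup above, so the
     // loop below initializes the whole bad-character table bc[0..ASIZE)
     // on the stack to the default shift m (== cnt1), 32 bytes per stpq.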
4498     sub(sp, sp, ASIZE);
4499     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4500     mov(ch1, sp);
4501     BIND(BM_INIT_LOOP);
4502       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4503       subs(tmp5, tmp5, 1);
4504       br(GT, BM_INIT_LOOP);
4505 
4506       sub(cnt1tmp, cnt1, 1);
4507       mov(tmp5, str2);
4508       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4509       sub(ch2, cnt1, 1);
4510       mov(tmp3, str1);
4511     BIND(BCLOOP);
4512       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4513       if (!str1_isL) {
4514         subs(zr, ch1, ASIZE);
4515         br(HS, BCSKIP);
4516       }
4517       strb(ch2, Address(sp, ch1));
4518     BIND(BCSKIP);
4519       subs(ch2, ch2, 1);
4520       br(GT, BCLOOP);
4521 
4522       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4523       if (str1_isL == str2_isL) {
4524         // load last 8 bytes (8LL/4UU symbols)
4525         ldr(tmp6, Address(tmp6, -wordSize));
4526       } else {
4527         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
4528         // convert Latin1 to UTF. We'll have to wait until the load completes,
4529         // but it's still faster than per-character loads + checks
4530         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4531         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4532         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4533         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4534         orr(ch2, ch1, ch2, LSL, 16);
4535         orr(tmp6, tmp6, tmp3, LSL, 48);
4536         orr(tmp6, tmp6, ch2, LSL, 16);
4537       }
4538     BIND(BMLOOPSTR2);
4539       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4540       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4541       if (str1_isL == str2_isL) {
4542         // re-init tmp3. It's free because it executes in parallel with the
4543         // load above. Initializing it before the loop instead would hurt
4544         // performance on in-order systems with 2 or more ld/st pipelines
4545         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4546       }
4547       if (!isL) { // UU/UL case
4548         lsl(ch2, cnt1tmp, 1); // offset in bytes
4549       }
4550       cmp(tmp3, skipch);
4551       br(NE, BMSKIP);
4552       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4553       mov(ch1, tmp6);
4554       if (isL) {
4555         b(BMLOOPSTR1_AFTER_LOAD);
4556       } else {
4557         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4558         b(BMLOOPSTR1_CMP);
4559       }
4560     BIND(BMLOOPSTR1);
4561       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4562       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4563     BIND(BMLOOPSTR1_AFTER_LOAD);
4564       subs(cnt1tmp, cnt1tmp, 1);
4565       br(LT, BMLOOPSTR1_LASTCMP);
4566     BIND(BMLOOPSTR1_CMP);
4567       cmp(ch1, ch2);
4568       br(EQ, BMLOOPSTR1);
4569     BIND(BMSKIP);
4570       if (!isL) {
4571         // if we've met a UTF symbol while searching for a Latin1 pattern, then
4572         // we can skip cnt1 symbols
4573         if (str1_isL != str2_isL) {
4574           mov(result_tmp, cnt1);
4575         } else {
4576           mov(result_tmp, 1);
4577         }
4578         subs(zr, skipch, ASIZE);
4579         br(HS, BMADV);
4580       }
4581       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4582     BIND(BMADV);
4583       sub(cnt1tmp, cnt1, 1);
4584       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4585       cmp(str2, str2end);
4586       br(LE, BMLOOPSTR2);
4587       add(sp, sp, ASIZE);
4588       b(NOMATCH);
4589     BIND(BMLOOPSTR1_LASTCMP);
4590       cmp(ch1, ch2);
4591       br(NE, BMSKIP);
4592     BIND(BMMATCH);
4593       sub(result, str2, tmp5);
4594       if (!str2_isL) lsr(result, result, 1);
4595       add(sp, sp, ASIZE);
4596       b(DONE);
4597 
4598     BIND(LINEARSTUB);
4599     cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
4600     br(LT, LINEAR_MEDIUM);
4601     mov(result, zr);
4602     RuntimeAddress stub = NULL;
4603     if (isL) {
4604       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4605       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4606     } else if (str1_isL) {
4607       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
4608       assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4609     } else {
4610       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4611       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4612     }
4613     trampoline_call(stub);
4614     b(DONE);
4615   }
4616 
4617   BIND(LINEARSEARCH);
4618   {
4619     Label DO1, DO2, DO3;
4620 
4621     Register str2tmp = tmp2;
4622     Register first = tmp3;
4623 
4624     if (icnt1 == -1)
4625     {
4626         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4627 
4628         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
4629         br(LT, DOSHORT);
4630       BIND(LINEAR_MEDIUM);
4631         (this->*str1_load_1chr)(first, Address(str1));
4632         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4633         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4634         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4635         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4636 
4637       BIND(FIRST_LOOP);
4638         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4639         cmp(first, ch2);
4640         br(EQ, STR1_LOOP);
4641       BIND(STR2_NEXT);
4642         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4643         br(LE, FIRST_LOOP);
4644         b(NOMATCH);
4645 
4646       BIND(STR1_LOOP);
4647         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4648         add(cnt2tmp, cnt2_neg, str2_chr_size);
4649         br(GE, MATCH);
4650 
4651       BIND(STR1_NEXT);
4652         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4653         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4654         cmp(ch1, ch2);
4655         br(NE, STR2_NEXT);
4656         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4657         add(cnt2tmp, cnt2tmp, str2_chr_size);
4658         br(LT, STR1_NEXT);
4659         b(MATCH);
4660 
4661       BIND(DOSHORT);
4662       if (str1_isL == str2_isL) {
4663         cmp(cnt1, (u1)2);
4664         br(LT, DO1);
4665         br(GT, DO3);
4666       }
4667     }
4668 
4669     if (icnt1 == 4) {
4670       Label CH1_LOOP;
4671 
4672         (this->*load_4chr)(ch1, str1);
4673         sub(result_tmp, cnt2, 4);
4674         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4675         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4676 
4677       BIND(CH1_LOOP);
4678         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4679         cmp(ch1, ch2);
4680         br(EQ, MATCH);
4681         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4682         br(LE, CH1_LOOP);
4683         b(NOMATCH);
4684     }
4685 
4686     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4687       Label CH1_LOOP;
4688 
4689       BIND(DO2);
4690         (this->*load_2chr)(ch1, str1);
4691         if (icnt1 == 2) {
4692           sub(result_tmp, cnt2, 2);
4693         }
4694         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4695         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4696       BIND(CH1_LOOP);
4697         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4698         cmp(ch1, ch2);
4699         br(EQ, MATCH);
4700         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4701         br(LE, CH1_LOOP);
4702         b(NOMATCH);
4703     }
4704 
4705     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4706       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4707 
4708       BIND(DO3);
4709         (this->*load_2chr)(first, str1);
4710         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4711         if (icnt1 == 3) {
4712           sub(result_tmp, cnt2, 3);
4713         }
4714         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4715         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4716       BIND(FIRST_LOOP);
4717         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4718         cmpw(first, ch2);
4719         br(EQ, STR1_LOOP);
4720       BIND(STR2_NEXT);
4721         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4722         br(LE, FIRST_LOOP);
4723         b(NOMATCH);
4724 
4725       BIND(STR1_LOOP);
4726         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4727         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4728         cmp(ch1, ch2);
4729         br(NE, STR2_NEXT);
4730         b(MATCH);
4731     }
4732 
4733     if (icnt1 == -1 || icnt1 == 1) {
4734       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4735 
4736       BIND(DO1);
4737         (this->*str1_load_1chr)(ch1, str1);
4738         cmp(cnt2, (u1)8);
4739         br(LT, DO1_SHORT);
4740 
4741         sub(result_tmp, cnt2, 8/str2_chr_size);
4742         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4743         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4744         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4745 
4746         if (str2_isL) {
4747           orr(ch1, ch1, ch1, LSL, 8);
4748         }
4749         orr(ch1, ch1, ch1, LSL, 16);
4750         orr(ch1, ch1, ch1, LSL, 32);
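             // ch1 now holds the first pattern character replicated in every
             // lane.  The loop below applies the classic "haszero" SWAR test
             // to x = ch2 ^ ch1: (x - 0x01..01) & ~x & 0x80..80 (or the
             // 16-bit lane variant for UTF) is non-zero iff some lane of x
             // is zero, i.e. iff one of the packed characters matched.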
4751       BIND(CH1_LOOP);
4752         ldr(ch2, Address(str2, cnt2_neg));
4753         eor(ch2, ch1, ch2);
4754         sub(tmp1, ch2, tmp3);
4755         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4756         bics(tmp1, tmp1, tmp2);
4757         br(NE, HAS_ZERO);
4758         adds(cnt2_neg, cnt2_neg, 8);
4759         br(LT, CH1_LOOP);
4760 
4761         cmp(cnt2_neg, (u1)8);
4762         mov(cnt2_neg, 0);
4763         br(LT, CH1_LOOP);
4764         b(NOMATCH);
4765 
4766       BIND(HAS_ZERO);
4767         rev(tmp1, tmp1);
4768         clz(tmp1, tmp1);
4769         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4770         b(MATCH);
4771 
4772       BIND(DO1_SHORT);
4773         mov(result_tmp, cnt2);
4774         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4775         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4776       BIND(DO1_LOOP);
4777         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4778         cmpw(ch1, ch2);
4779         br(EQ, MATCH);
4780         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4781         br(LT, DO1_LOOP);
4782     }
4783   }
4784   BIND(NOMATCH);
4785     mov(result, -1);
4786     b(DONE);
4787   BIND(MATCH);
4788     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4789   BIND(DONE);
4790 }
4791 
4792 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4793 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4794 
4795 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4796                                          Register ch, Register result,
4797                                          Register tmp1, Register tmp2, Register tmp3)
4798 {
4799   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4800   Register cnt1_neg = cnt1;
4801   Register ch1 = rscratch1;
4802   Register result_tmp = rscratch2;
4803 
4804   cmp(cnt1, (u1)4);
4805   br(LT, DO1_SHORT);
4806 
4807   orr(ch, ch, ch, LSL, 16);
4808   orr(ch, ch, ch, LSL, 32);
4809 
4810   sub(cnt1, cnt1, 4);
4811   mov(result_tmp, cnt1);
4812   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4813   sub(cnt1_neg, zr, cnt1, LSL, 1);
4814 
4815   mov(tmp3, 0x0001000100010001);
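       // tmp3 and the constants below implement the same "haszero" SWAR
       // test as in string_indexof above, on 16-bit lanes: a zero lane in
       // ch ^ ch1 marks a matching character.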
4816 
4817   BIND(CH1_LOOP);
4818     ldr(ch1, Address(str1, cnt1_neg));
4819     eor(ch1, ch, ch1);
4820     sub(tmp1, ch1, tmp3);
4821     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4822     bics(tmp1, tmp1, tmp2);
4823     br(NE, HAS_ZERO);
4824     adds(cnt1_neg, cnt1_neg, 8);
4825     br(LT, CH1_LOOP);
4826 
4827     cmp(cnt1_neg, (u1)8);
4828     mov(cnt1_neg, 0);
4829     br(LT, CH1_LOOP);
4830     b(NOMATCH);
4831 
4832   BIND(HAS_ZERO);
4833     rev(tmp1, tmp1);
4834     clz(tmp1, tmp1);
4835     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4836     b(MATCH);
4837 
4838   BIND(DO1_SHORT);
4839     mov(result_tmp, cnt1);
4840     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4841     sub(cnt1_neg, zr, cnt1, LSL, 1);
4842   BIND(DO1_LOOP);
4843     ldrh(ch1, Address(str1, cnt1_neg));
4844     cmpw(ch, ch1);
4845     br(EQ, MATCH);
4846     adds(cnt1_neg, cnt1_neg, 2);
4847     br(LT, DO1_LOOP);
4848   BIND(NOMATCH);
4849     mov(result, -1);
4850     b(DONE);
4851   BIND(MATCH);
4852     add(result, result_tmp, cnt1_neg, ASR, 1);
4853   BIND(DONE);
4854 }
4855 
4856 // Compare strings.
4857 void MacroAssembler::string_compare(Register str1, Register str2,
4858     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4859     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4860   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4861       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4862       SHORT_LOOP_START, TAIL_CHECK;
4863 
4864   const u1 STUB_THRESHOLD = 64 + 8;
4865   bool isLL = ae == StrIntrinsicNode::LL;
4866   bool isLU = ae == StrIntrinsicNode::LU;
4867   bool isUL = ae == StrIntrinsicNode::UL;
4868 
4869   bool str1_isL = isLL || isLU;
4870   bool str2_isL = isLL || isUL;
4871 
4872   int str1_chr_shift = str1_isL ? 0 : 1;
4873   int str2_chr_shift = str2_isL ? 0 : 1;
4874   int str1_chr_size = str1_isL ? 1 : 2;
4875   int str2_chr_size = str2_isL ? 1 : 2;
4876   int minCharsInWord = isLL ? wordSize : wordSize/2;
4877 
4878   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4879   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4880                                       (chr_insn)&MacroAssembler::ldrh;
4881   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4882                                       (chr_insn)&MacroAssembler::ldrh;
4883   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4884                             (uxt_insn)&MacroAssembler::uxthw;
4885 
4886   BLOCK_COMMENT("string_compare {");
4887 
4888   // Bizarrely, the counts are passed in bytes, regardless of whether they
4889   // are L or U strings; the result, however, is always in characters.
4890   if (!str1_isL) asrw(cnt1, cnt1, 1);
4891   if (!str2_isL) asrw(cnt2, cnt2, 1);
4892 
4893   // Compute the minimum of the string lengths and save the difference.
4894   subsw(result, cnt1, cnt2);
4895   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4896 
4897   // A very short string
4898   cmpw(cnt2, minCharsInWord);
4899   br(Assembler::LT, SHORT_STRING);
4900 
4901   // Compare longwords
4902   // load first parts of strings and finish initialization while loading
4903   {
4904     if (str1_isL == str2_isL) { // LL or UU
4905       ldr(tmp1, Address(str1));
4906       cmp(str1, str2);
4907       br(Assembler::EQ, DONE);
4908       ldr(tmp2, Address(str2));
4909       cmp(cnt2, STUB_THRESHOLD);
4910       br(GE, STUB);
4911       subsw(cnt2, cnt2, minCharsInWord);
4912       br(EQ, TAIL_CHECK);
4913       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4914       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4915       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4916     } else if (isLU) {
4917       ldrs(vtmp, Address(str1));
4918       cmp(str1, str2);
4919       br(Assembler::EQ, DONE);
4920       ldr(tmp2, Address(str2));
4921       cmp(cnt2, STUB_THRESHOLD);
4922       br(GE, STUB);
4923       subsw(cnt2, cnt2, 4);
4924       br(EQ, TAIL_CHECK);
4925       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4926       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4927       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4928       zip1(vtmp, T8B, vtmp, vtmpZ);
4929       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4930       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4931       add(cnt1, cnt1, 4);
4932       fmovd(tmp1, vtmp);
4933     } else { // UL case
4934       ldr(tmp1, Address(str1));
4935       cmp(str1, str2);
4936       br(Assembler::EQ, DONE);
4937       ldrs(vtmp, Address(str2));
4938       cmp(cnt2, STUB_THRESHOLD);
4939       br(GE, STUB);
4940       subsw(cnt2, cnt2, 4);
4941       br(EQ, TAIL_CHECK);
4942       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4943       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4944       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4945       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4946       zip1(vtmp, T8B, vtmp, vtmpZ);
4947       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4948       add(cnt1, cnt1, 8);
4949       fmovd(tmp2, vtmp);
4950     }
4951     adds(cnt2, cnt2, isUL ? 4 : 8);
4952     br(GE, TAIL);
4953     eor(rscratch2, tmp1, tmp2);
4954     cbnz(rscratch2, DIFFERENCE);
4955     // main loop
4956     bind(NEXT_WORD);
4957     if (str1_isL == str2_isL) {
4958       ldr(tmp1, Address(str1, cnt2));
4959       ldr(tmp2, Address(str2, cnt2));
4960       adds(cnt2, cnt2, 8);
4961     } else if (isLU) {
4962       ldrs(vtmp, Address(str1, cnt1));
4963       ldr(tmp2, Address(str2, cnt2));
4964       add(cnt1, cnt1, 4);
4965       zip1(vtmp, T8B, vtmp, vtmpZ);
4966       fmovd(tmp1, vtmp);
4967       adds(cnt2, cnt2, 8);
4968     } else { // UL
4969       ldrs(vtmp, Address(str2, cnt2));
4970       ldr(tmp1, Address(str1, cnt1));
4971       zip1(vtmp, T8B, vtmp, vtmpZ);
4972       add(cnt1, cnt1, 8);
4973       fmovd(tmp2, vtmp);
4974       adds(cnt2, cnt2, 4);
4975     }
4976     br(GE, TAIL);
4977 
4978     eor(rscratch2, tmp1, tmp2);
4979     cbz(rscratch2, NEXT_WORD);
4980     b(DIFFERENCE);
4981     bind(TAIL);
4982     eor(rscratch2, tmp1, tmp2);
4983     cbnz(rscratch2, DIFFERENCE);
4984     // Last longword.  In the case where length == 4 we compare the
4985     // same longword twice, but that's still faster than another
4986     // conditional branch.
4987     if (str1_isL == str2_isL) {
4988       ldr(tmp1, Address(str1));
4989       ldr(tmp2, Address(str2));
4990     } else if (isLU) {
4991       ldrs(vtmp, Address(str1));
4992       ldr(tmp2, Address(str2));
4993       zip1(vtmp, T8B, vtmp, vtmpZ);
4994       fmovd(tmp1, vtmp);
4995     } else { // UL
4996       ldrs(vtmp, Address(str2));
4997       ldr(tmp1, Address(str1));
4998       zip1(vtmp, T8B, vtmp, vtmpZ);
4999       fmovd(tmp2, vtmp);
5000     }
5001     bind(TAIL_CHECK);
5002     eor(rscratch2, tmp1, tmp2);
5003     cbz(rscratch2, DONE);
5004 
5005     // Find the first different characters in the longwords and
5006     // compute their difference.
5007     bind(DIFFERENCE);
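         // rscratch2 = tmp1 ^ tmp2 is non-zero here.  rev + clz yield the bit
         // index of the first differing byte in memory order; andr rounds it
         // down to a character boundary, giving the shift that brings the
         // first differing character of each word into the low bits.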
5008     rev(rscratch2, rscratch2);
5009     clz(rscratch2, rscratch2);
5010     andr(rscratch2, rscratch2, isLL ? -8 : -16);
5011     lsrv(tmp1, tmp1, rscratch2);
5012     (this->*ext_chr)(tmp1, tmp1);
5013     lsrv(tmp2, tmp2, rscratch2);
5014     (this->*ext_chr)(tmp2, tmp2);
5015     subw(result, tmp1, tmp2);
5016     b(DONE);
5017   }
5018 
5019   bind(STUB);
5020     RuntimeAddress stub = NULL;
5021     switch(ae) {
5022       case StrIntrinsicNode::LL:
5023         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
5024         break;
5025       case StrIntrinsicNode::UU:
5026         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
5027         break;
5028       case StrIntrinsicNode::LU:
5029         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
5030         break;
5031       case StrIntrinsicNode::UL:
5032         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
5033         break;
5034       default:
5035         ShouldNotReachHere();
5036      }
5037     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
5038     trampoline_call(stub);
5039     b(DONE);
5040 
5041   bind(SHORT_STRING);
5042   // Is the minimum length zero?
5043   cbz(cnt2, DONE);
5044   // Arrange the code so that most branches resolve while loads are in flight,
5045   // and the next characters load while the previous ones are being compared
5046   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5047   subs(cnt2, cnt2, 1);
5048   br(EQ, SHORT_LAST_INIT);
5049   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5050   b(SHORT_LOOP_START);
5051   bind(SHORT_LOOP);
5052   subs(cnt2, cnt2, 1);
5053   br(EQ, SHORT_LAST);
5054   bind(SHORT_LOOP_START);
5055   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
5056   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
5057   cmp(tmp1, cnt1);
5058   br(NE, SHORT_LOOP_TAIL);
5059   subs(cnt2, cnt2, 1);
5060   br(EQ, SHORT_LAST2);
5061   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5062   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5063   cmp(tmp2, rscratch1);
5064   br(EQ, SHORT_LOOP);
5065   sub(result, tmp2, rscratch1);
5066   b(DONE);
5067   bind(SHORT_LOOP_TAIL);
5068   sub(result, tmp1, cnt1);
5069   b(DONE);
5070   bind(SHORT_LAST2);
5071   cmp(tmp2, rscratch1);
5072   br(EQ, DONE);
5073   sub(result, tmp2, rscratch1);
5074 
5075   b(DONE);
5076   bind(SHORT_LAST_INIT);
5077   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5078   bind(SHORT_LAST);
5079   cmp(tmp1, cnt1);
5080   br(EQ, DONE);
5081   sub(result, tmp1, cnt1);
5082 
5083   bind(DONE);
5084 
5085   BLOCK_COMMENT("} string_compare");
5086 }
5087 #endif // COMPILER2
5088 
5089 // This method checks whether the provided byte array contains a byte with the highest bit set.
5090 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
5091     // The simple and most common case, a small aligned array that is not at
5092     // the end of a memory page, is handled here. All other cases are in the stub.
5093     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
5094     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5095     assert_different_registers(ary1, len, result);
5096 
5097     cmpw(len, 0);
5098     br(LE, SET_RESULT);
5099     cmpw(len, 4 * wordSize);
5100     br(GE, STUB_LONG); // size > 32 then go to stub
5101 
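         // Will the 32-byte read below cross a page boundary?  Shifting the
         // address left by 64 - log2(page_size) leaves just the in-page
         // offset, in the high bits; adding 32 << shift then sets the carry
         // flag iff offset + 32 would reach past the end of the page.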
5102     int shift = 64 - exact_log2(os::vm_page_size());
5103     lsl(rscratch1, ary1, shift);
5104     mov(rscratch2, (size_t)(4 * wordSize) << shift);
5105     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
5106     br(CS, STUB); // at the end of page then go to stub
5107     subs(len, len, wordSize);
5108     br(LT, END);
5109 
5110   BIND(LOOP);
5111     ldr(rscratch1, Address(post(ary1, wordSize)));
5112     tst(rscratch1, UPPER_BIT_MASK);
5113     br(NE, SET_RESULT);
5114     subs(len, len, wordSize);
5115     br(GE, LOOP);
5116     cmpw(len, -wordSize);
5117     br(EQ, SET_RESULT);
5118 
5119   BIND(END);
5120     ldr(result, Address(ary1));
5121     sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5122     lslv(result, result, len);
5123     tst(result, UPPER_BIT_MASK);
5124     b(SET_RESULT);
5125 
5126   BIND(STUB);
5127     RuntimeAddress has_neg =  RuntimeAddress(StubRoutines::aarch64::has_negatives());
5128     assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5129     trampoline_call(has_neg);
5130     b(DONE);
5131 
5132   BIND(STUB_LONG);
5133     RuntimeAddress has_neg_long =  RuntimeAddress(
5134             StubRoutines::aarch64::has_negatives_long());
5135     assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
5136     trampoline_call(has_neg_long);
5137     b(DONE);
5138 
5139   BIND(SET_RESULT);
5140     cset(result, NE); // set true or false
5141 
5142   BIND(DONE);
5143 }
5144 
5145 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5146                                    Register tmp4, Register tmp5, Register result,
5147                                    Register cnt1, int elem_size) {
5148   Label DONE, SAME;
5149   Register tmp1 = rscratch1;
5150   Register tmp2 = rscratch2;
5151   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5152   int elem_per_word = wordSize/elem_size;
5153   int log_elem_size = exact_log2(elem_size);
5154   int length_offset = arrayOopDesc::length_offset_in_bytes();
5155   int base_offset
5156     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5157   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5158 
5159   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5160   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5161 
5162 #ifndef PRODUCT
5163   {
5164     const char kind = (elem_size == 2) ? 'U' : 'L';
5165     char comment[64];
5166     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5167     BLOCK_COMMENT(comment);
5168   }
5169 #endif
5170 
5171   // if (a1 == a2)
5172   //     return true;
5173   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5174   br(EQ, SAME);
5175 
5176   if (UseSimpleArrayEquals) {
5177     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5178     // if (a1 == null || a2 == null)
5179     //     return false;
5180     // (a1 & a2) == 0 means that at least one pointer is null (or, very
5181     // rarely, that two non-null pointers share no set bits), so we can
5182     // save one branch in most cases
5183     tst(a1, a2);
5184     mov(result, false);
5185     br(EQ, A_MIGHT_BE_NULL);
5186     // if (a1.length != a2.length)
5187     //      return false;
5188     bind(A_IS_NOT_NULL);
5189     ldrw(cnt1, Address(a1, length_offset));
5190     ldrw(cnt2, Address(a2, length_offset));
5191     eorw(tmp5, cnt1, cnt2);
5192     cbnzw(tmp5, DONE);
5193     lea(a1, Address(a1, base_offset));
5194     lea(a2, Address(a2, base_offset));
5195     // Check for short arrays, i.e. smaller than wordSize.
5196     subs(cnt1, cnt1, elem_per_word);
5197     br(Assembler::LT, SHORT);
5198     // Main 8 byte comparison loop.
5199     bind(NEXT_WORD); {
5200       ldr(tmp1, Address(post(a1, wordSize)));
5201       ldr(tmp2, Address(post(a2, wordSize)));
5202       subs(cnt1, cnt1, elem_per_word);
5203       eor(tmp5, tmp1, tmp2);
5204       cbnz(tmp5, DONE);
5205     } br(GT, NEXT_WORD);
5206     // Last longword.  In the case where length == 4 we compare the
5207     // same longword twice, but that's still faster than another
5208     // conditional branch.
5209     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5210     // length == 4.
5211     if (log_elem_size > 0)
5212       lsl(cnt1, cnt1, log_elem_size);
5213     ldr(tmp3, Address(a1, cnt1));
5214     ldr(tmp4, Address(a2, cnt1));
5215     eor(tmp5, tmp3, tmp4);
5216     cbnz(tmp5, DONE);
5217     b(SAME);
5218     bind(A_MIGHT_BE_NULL);
5219     // in case both a1 and a2 are not-null, proceed with loads
5220     cbz(a1, DONE);
5221     cbz(a2, DONE);
5222     b(A_IS_NOT_NULL);
5223     bind(SHORT);
5224 
5225     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5226     {
5227       ldrw(tmp1, Address(post(a1, 4)));
5228       ldrw(tmp2, Address(post(a2, 4)));
5229       eorw(tmp5, tmp1, tmp2);
5230       cbnzw(tmp5, DONE);
5231     }
5232     bind(TAIL03);
5233     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5234     {
5235       ldrh(tmp3, Address(post(a1, 2)));
5236       ldrh(tmp4, Address(post(a2, 2)));
5237       eorw(tmp5, tmp3, tmp4);
5238       cbnzw(tmp5, DONE);
5239     }
5240     bind(TAIL01);
5241     if (elem_size == 1) { // Only needed when comparing byte arrays.
5242       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5243       {
5244         ldrb(tmp1, a1);
5245         ldrb(tmp2, a2);
5246         eorw(tmp5, tmp1, tmp2);
5247         cbnzw(tmp5, DONE);
5248       }
5249     }
5250   } else {
5251     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
5252         CSET_EQ, LAST_CHECK;
5253     mov(result, false);
5254     cbz(a1, DONE);
5255     ldrw(cnt1, Address(a1, length_offset));
5256     cbz(a2, DONE);
5257     ldrw(cnt2, Address(a2, length_offset));
5258     // on most CPUs a2 is (surprisingly) still "locked" by the ldrw, so it's
5259     // faster to perform another branch before comparing a1 and a2
5260     cmp(cnt1, (u1)elem_per_word);
5261     br(LE, SHORT); // short or same
5262     ldr(tmp3, Address(pre(a1, base_offset)));
5263     subs(zr, cnt1, stubBytesThreshold);
5264     br(GE, STUB);
5265     ldr(tmp4, Address(pre(a2, base_offset)));
5266     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
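         // tmp5 := -(cnt1 in bits).  It is used below as an lslv shift count
         // (taken mod 64) to shift out the lanes of the final, possibly
         // overlapping, word comparison that lie beyond the array length.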
5267     cmp(cnt2, cnt1);
5268     br(NE, DONE);
5269 
5270     // Main 16 byte comparison loop with 2 exits
5271     bind(NEXT_DWORD); {
5272       ldr(tmp1, Address(pre(a1, wordSize)));
5273       ldr(tmp2, Address(pre(a2, wordSize)));
5274       subs(cnt1, cnt1, 2 * elem_per_word);
5275       br(LE, TAIL);
5276       eor(tmp4, tmp3, tmp4);
5277       cbnz(tmp4, DONE);
5278       ldr(tmp3, Address(pre(a1, wordSize)));
5279       ldr(tmp4, Address(pre(a2, wordSize)));
5280       cmp(cnt1, (u1)elem_per_word);
5281       br(LE, TAIL2);
5282       cmp(tmp1, tmp2);
5283     } br(EQ, NEXT_DWORD);
5284     b(DONE);
5285 
5286     bind(TAIL);
5287     eor(tmp4, tmp3, tmp4);
5288     eor(tmp2, tmp1, tmp2);
5289     lslv(tmp2, tmp2, tmp5);
5290     orr(tmp5, tmp4, tmp2);
5291     cmp(tmp5, zr);
5292     b(CSET_EQ);
5293 
5294     bind(TAIL2);
5295     eor(tmp2, tmp1, tmp2);
5296     cbnz(tmp2, DONE);
5297     b(LAST_CHECK);
5298 
5299     bind(STUB);
5300     ldr(tmp4, Address(pre(a2, base_offset)));
5301     cmp(cnt2, cnt1);
5302     br(NE, DONE);
5303     if (elem_size == 2) { // convert to byte counter
5304       lsl(cnt1, cnt1, 1);
5305     }
5306     eor(tmp5, tmp3, tmp4);
5307     cbnz(tmp5, DONE);
5308     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5309     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5310     trampoline_call(stub);
5311     b(DONE);
5312 
5313     bind(EARLY_OUT);
5314     // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
5315     // so if a2 == null we return false (0), else true; hence we can simply return a2
5316     mov(result, a2);
5317     b(DONE);
5318     bind(SHORT);
5319     cmp(cnt2, cnt1);
5320     br(NE, DONE);
5321     cbz(cnt1, SAME);
5322     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5323     ldr(tmp3, Address(a1, base_offset));
5324     ldr(tmp4, Address(a2, base_offset));
5325     bind(LAST_CHECK);
5326     eor(tmp4, tmp3, tmp4);
5327     lslv(tmp5, tmp4, tmp5);
5328     cmp(tmp5, zr);
5329     bind(CSET_EQ);
5330     cset(result, EQ);
5331     b(DONE);
5332   }
5333 
5334   bind(SAME);
5335   mov(result, true);
5336   // That's it.
5337   bind(DONE);
5338 
5339   BLOCK_COMMENT("} array_equals");
5340 }
5341 
5342 // Compare Strings
5343 
5344 // For Strings we're passed the address of the first characters in a1
5345 // and a2 and the length in cnt1.
5346 // elem_size is the element size in bytes: either 1 or 2.
5347 // There are two implementations.  For strings >= 8 bytes, all
5348 // comparisons (including the final one, which may overlap) are
5349 // performed 8 bytes at a time.  For strings < 8 bytes, we compare a
5350 // word, then a halfword, and then a byte.
5351 
5352 void MacroAssembler::string_equals(Register a1, Register a2,
5353                                    Register result, Register cnt1, int elem_size)
5354 {
5355   Label SAME, DONE, SHORT, NEXT_WORD;
5356   Register tmp1 = rscratch1;
5357   Register tmp2 = rscratch2;
5358   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5359 
5360   assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
5361   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5362 
5363 #ifndef PRODUCT
5364   {
5365     const char kind = (elem_size == 2) ? 'U' : 'L';
5366     char comment[64];
5367     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5368     BLOCK_COMMENT(comment);
5369   }
5370 #endif
5371 
5372   mov(result, false);
5373 
5374   // Check for short strings, i.e. smaller than wordSize.
5375   subs(cnt1, cnt1, wordSize);
5376   br(Assembler::LT, SHORT);
5377   // Main 8 byte comparison loop.
5378   bind(NEXT_WORD); {
5379     ldr(tmp1, Address(post(a1, wordSize)));
5380     ldr(tmp2, Address(post(a2, wordSize)));
5381     subs(cnt1, cnt1, wordSize);
5382     eor(tmp1, tmp1, tmp2);
5383     cbnz(tmp1, DONE);
5384   } br(GT, NEXT_WORD);
5385   // Last longword.  In the case where length == 4 we compare the
5386   // same longword twice, but that's still faster than another
5387   // conditional branch.
5388   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5389   // length == 4.
5390   ldr(tmp1, Address(a1, cnt1));
5391   ldr(tmp2, Address(a2, cnt1));
5392   eor(tmp2, tmp1, tmp2);
5393   cbnz(tmp2, DONE);
5394   b(SAME);
5395 
5396   bind(SHORT);
5397   Label TAIL03, TAIL01;
5398 
5399   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5400   {
5401     ldrw(tmp1, Address(post(a1, 4)));
5402     ldrw(tmp2, Address(post(a2, 4)));
5403     eorw(tmp1, tmp1, tmp2);
5404     cbnzw(tmp1, DONE);
5405   }
5406   bind(TAIL03);
5407   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5408   {
5409     ldrh(tmp1, Address(post(a1, 2)));
5410     ldrh(tmp2, Address(post(a2, 2)));
5411     eorw(tmp1, tmp1, tmp2);
5412     cbnzw(tmp1, DONE);
5413   }
5414   bind(TAIL01);
5415   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5416     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5417     {
5418       ldrb(tmp1, a1);
5419       ldrb(tmp2, a2);
5420       eorw(tmp1, tmp1, tmp2);
5421       cbnzw(tmp1, DONE);
5422     }
5423   }
5424   // Arrays are equal.
5425   bind(SAME);
5426   mov(result, true);
5427 
5428   // That's it.
5429   bind(DONE);
5430   BLOCK_COMMENT("} string_equals");
5431 }
5432 
5433 
5434 // The size of the blocks erased by the zero_blocks stub.  We must
5435 // handle anything smaller than this ourselves in zero_words().
5436 const int MacroAssembler::zero_words_block_size = 8;
5437 
5438 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5439 // possible, handling small word counts locally and delegating
5440 // anything larger to the zero_blocks stub.  It is expanded many times
5441 // in compiled code, so it is important to keep it short.
5442 
5443 // ptr:   Address of a buffer to be zeroed.
5444 // cnt:   Count in HeapWords.
5445 //
5446 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5447 void MacroAssembler::zero_words(Register ptr, Register cnt)
5448 {
5449   assert(is_power_of_2(zero_words_block_size), "adjust this");
5450   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5451 
5452   BLOCK_COMMENT("zero_words {");
5453   cmp(cnt, (u1)zero_words_block_size);
5454   Label around;
5455   br(LO, around);
5456   {
5457     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5458     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5459     if (StubRoutines::aarch64::complete()) {
5460       trampoline_call(zero_blocks);
5461     } else {
5462       bl(zero_blocks);
5463     }
5464   }
5465   bind(around);
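       // Zero the remaining (cnt & 7) words by testing one bit of cnt at a
       // time: bit 2 -> four words, bit 1 -> two words, bit 0 -> one word.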
5466   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5467     Label l;
5468     tbz(cnt, exact_log2(i), l);
5469     for (int j = 0; j < i; j += 2) {
5470       stp(zr, zr, post(ptr, 16));
5471     }
5472     bind(l);
5473   }
5474   {
5475     Label l;
5476     tbz(cnt, 0, l);
5477     str(zr, Address(ptr));
5478     bind(l);
5479   }
5480   BLOCK_COMMENT("} zero_words");
5481 }
5482 
5483 // base:         Address of a buffer to be zeroed, 8 bytes aligned.
5484 // cnt:          Immediate count in HeapWords.
5485 #define SmallArraySize (18 * BytesPerLong)
5486 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
5487 {
5488   BLOCK_COMMENT("zero_words {");
5489   int i = cnt & 1;  // store any odd word to start
5490   if (i) str(zr, Address(base));
5491 
5492   if (cnt <= SmallArraySize / BytesPerLong) {
5493     for (; i < (int)cnt; i += 2)
5494       stp(zr, zr, Address(base, i * wordSize));
5495   } else {
5496     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
5497     int remainder = cnt % (2 * unroll);
5498     for (; i < remainder; i += 2)
5499       stp(zr, zr, Address(base, i * wordSize));
5500 
5501     Label loop;
5502     Register cnt_reg = rscratch1;
5503     Register loop_base = rscratch2;
5504     cnt = cnt - remainder;
5505     mov(cnt_reg, cnt);
5506     // adjust base and prebias by -2 * wordSize so we can pre-increment
5507     add(loop_base, base, (remainder - 2) * wordSize);
5508     bind(loop);
5509     sub(cnt_reg, cnt_reg, 2 * unroll);
5510     for (i = 1; i < unroll; i++)
5511       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
5512     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
5513     cbnz(cnt_reg, loop);
5514   }
5515   BLOCK_COMMENT("} zero_words");
5516 }
5517 
5518 // Zero blocks of memory by using DC ZVA.
5519 //
5520 // Aligns the base address first sufficiently for DC ZVA, then uses
5521 // DC ZVA repeatedly for every full block.  cnt is the size to be
5522 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5523 // in cnt.
5524 //
5525 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5526 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
5527 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5528   Register tmp = rscratch1;
5529   Register tmp2 = rscratch2;
5530   int zva_length = VM_Version::zva_length();
5531   Label initial_table_end, loop_zva;
5532   Label fini;
5533 
5534   // Base must be 16 byte aligned. If not, just return and let the caller handle it
5535   tst(base, 0x0f);
5536   br(Assembler::NE, fini);
5537   // Align base with ZVA length.
5538   neg(tmp, base);
5539   andr(tmp, tmp, zva_length - 1);
5540 
5541   // tmp: the number of bytes to be filled to align the base with ZVA length.
5542   add(base, base, tmp);
5543   sub(cnt, cnt, tmp, Assembler::ASR, 3);
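       // Branch into the table of stp instructions below: tmp bytes remain
       // to be zeroed to reach ZVA alignment, each stp covers 16 bytes and
       // occupies 4 bytes of code, so step back tmp >> 2 bytes from
       // initial_table_end.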
5544   adr(tmp2, initial_table_end);
5545   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5546   br(tmp2);
5547 
5548   for (int i = -zva_length + 16; i < 0; i += 16)
5549     stp(zr, zr, Address(base, i));
5550   bind(initial_table_end);
5551 
5552   sub(cnt, cnt, zva_length >> 3);
5553   bind(loop_zva);
5554   dc(Assembler::ZVA, base);
5555   subs(cnt, cnt, zva_length >> 3);
5556   add(base, base, zva_length);
5557   br(Assembler::GE, loop_zva);
5558   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5559   bind(fini);
5560 }
5561 
5562 // base:   Address of a buffer to be filled, 8 bytes aligned.
5563 // cnt:    Count in 8-byte units.
5564 // value:  Value to fill with.
5565 // base will point to the end of the buffer after filling.
5566 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5567 {
5568 //  Algorithm:
5569 //
5570 //    scratch1 = cnt & 7;
5571 //    cnt -= scratch1;
5572 //    p += scratch1;
5573 //    switch (scratch1) {
5574 //      do {
5575 //        cnt -= 8;
5576 //          p[-8] = v;
5577 //        case 7:
5578 //          p[-7] = v;
5579 //        case 6:
5580 //          p[-6] = v;
5581 //          // ...
5582 //        case 1:
5583 //          p[-1] = v;
5584 //        case 0:
5585 //          p += 8;
5586 //      } while (cnt);
5587 //    }
5588 
5589   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5590 
5591   Label fini, skip, entry, loop;
5592   const int unroll = 8; // Number of stp instructions we'll unroll
5593 
5594   cbz(cnt, fini);
5595   tbz(base, 3, skip);
5596   str(value, Address(post(base, 8)));
5597   sub(cnt, cnt, 1);
5598   bind(skip);
5599 
5600   andr(rscratch1, cnt, (unroll-1) * 2);
5601   sub(cnt, cnt, rscratch1);
5602   add(base, base, rscratch1, Assembler::LSL, 3);
5603   adr(rscratch2, entry);
5604   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5605   br(rscratch2);
5606 
5607   bind(loop);
5608   add(base, base, unroll * 16);
5609   for (int i = -unroll; i < 0; i++)
5610     stp(value, value, Address(base, i * 16));
5611   bind(entry);
5612   subs(cnt, cnt, unroll * 2);
5613   br(Assembler::GE, loop);
5614 
5615   tbz(cnt, 0, fini);
5616   str(value, Address(post(base, 8)));
5617   bind(fini);
5618 }
5619 
5620 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5621 // java/lang/StringUTF16.compress.
5622 void MacroAssembler::encode_iso_array(Register src, Register dst,
5623                       Register len, Register result,
5624                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5625                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5626 {
5627     Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
5628         NEXT_32_START, NEXT_32_PRFM_START;
5629     Register tmp1 = rscratch1, tmp2 = rscratch2;
5630 
5631       mov(result, len); // Save initial len
5632 
5633 #ifndef BUILTIN_SIM
5634       cmp(len, (u1)8); // handle shortest strings first
5635       br(LT, LOOP_1);
5636       cmp(len, (u1)32);
5637       br(LT, NEXT_8);
5638       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
5639       // to convert chars to bytes
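           // uzp1 gathers the even-numbered (low) bytes of each char, i.e. the
           // Latin-1 payload, while orr + uzp2 collect the odd (high) bytes;
           // any non-zero high byte flags a char that cannot be encoded.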
5640       if (SoftwarePrefetchHintDistance >= 0) {
5641         ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5642         subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5643         br(LE, NEXT_32_START);
5644         b(NEXT_32_PRFM_START);
5645         BIND(NEXT_32_PRFM);
5646           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5647         BIND(NEXT_32_PRFM_START);
5648           prfm(Address(src, SoftwarePrefetchHintDistance));
5649           orr(v4, T16B, Vtmp1, Vtmp2);
5650           orr(v5, T16B, Vtmp3, Vtmp4);
5651           uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
5652           uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
5653           stpq(Vtmp1, Vtmp3, dst);
5654           uzp2(v5, T16B, v4, v5); // high bytes
5655           umov(tmp2, v5, D, 1);
5656           fmovd(tmp1, v5);
5657           orr(tmp1, tmp1, tmp2);
5658           cbnz(tmp1, LOOP_8);
5659           sub(len, len, 32);
5660           add(dst, dst, 32);
5661           add(src, src, 64);
5662           subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5663           br(GE, NEXT_32_PRFM);
5664           cmp(len, (u1)32);
5665           br(LT, LOOP_8);
5666         BIND(NEXT_32);
5667           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5668         BIND(NEXT_32_START);
5669       } else {
5670         BIND(NEXT_32);
5671           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5672       }
5673       prfm(Address(src, SoftwarePrefetchHintDistance));
5674       uzp1(v4, T16B, Vtmp1, Vtmp2);
5675       uzp1(v5, T16B, Vtmp3, Vtmp4);
5676       stpq(v4, v5, dst);
5677       orr(Vtmp1, T16B, Vtmp1, Vtmp2);
5678       orr(Vtmp3, T16B, Vtmp3, Vtmp4);
5679       uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
5680       umov(tmp2, Vtmp1, D, 1);
5681       fmovd(tmp1, Vtmp1);
5682       orr(tmp1, tmp1, tmp2);
5683       cbnz(tmp1, LOOP_8);
5684       sub(len, len, 32);
5685       add(dst, dst, 32);
5686       add(src, src, 64);
5687       cmp(len, (u1)32);
5688       br(GE, NEXT_32);
5689       cbz(len, DONE);
5690 
5691     BIND(LOOP_8);
5692       cmp(len, (u1)8);
5693       br(LT, LOOP_1);
5694     BIND(NEXT_8);
5695       ld1(Vtmp1, T8H, src);
5696       uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
5697       uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
5698       strd(Vtmp2, dst);
5699       fmovd(tmp1, Vtmp3);
5700       cbnz(tmp1, NEXT_1);
5701 
5702       sub(len, len, 8);
5703       add(dst, dst, 8);
5704       add(src, src, 16);
5705       cmp(len, (u1)8);
5706       br(GE, NEXT_8);
5707 
5708     BIND(LOOP_1);
5709 #endif
5710     cbz(len, DONE);
5711     BIND(NEXT_1);
5712       ldrh(tmp1, Address(post(src, 2)));
5713       strb(tmp1, Address(post(dst, 1)));
5714       tst(tmp1, 0xff00);
5715       br(NE, SET_RESULT);
5716       subs(len, len, 1);
5717       br(GT, NEXT_1);
5718 
5719     BIND(SET_RESULT);
5720       sub(result, result, len); // Return index where we stopped
5721                                 // Return len == 0 if we processed all
5722                                 // characters
5723     BIND(DONE);
5724 }
5725 
5726 
5727 // Inflate byte[] array to char[].
5728 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5729                                         FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
5730                                         Register tmp4) {
5731   Label big, done, after_init, to_stub;
5732 
5733   assert_different_registers(src, dst, len, tmp4, rscratch1);
5734 
5735   fmovd(vtmp1, zr);
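       // vtmp1 is zero; zip1 against it interleaves a zero high byte with
       // each Latin-1 byte, widening bytes to 16-bit chars.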
5736   lsrw(tmp4, len, 3);
5737   bind(after_init);
5738   cbnzw(tmp4, big);
5739   // Short string: less than 8 bytes.
5740   {
5741     Label loop, tiny;
5742 
5743     cmpw(len, 4);
5744     br(LT, tiny);
5745     // Use SIMD to do 4 bytes.
5746     ldrs(vtmp2, post(src, 4));
5747     zip1(vtmp3, T8B, vtmp2, vtmp1);
5748     subw(len, len, 4);
5749     strd(vtmp3, post(dst, 8));
5750 
5751     cbzw(len, done);
5752 
5753     // Do the remaining bytes by steam.
5754     bind(loop);
5755     ldrb(tmp4, post(src, 1));
5756     strh(tmp4, post(dst, 2));
5757     subw(len, len, 1);
5758 
5759     bind(tiny);
5760     cbnz(len, loop);
5761 
5762     b(done);
5763   }
5764 
5765   if (SoftwarePrefetchHintDistance >= 0) {
5766     bind(to_stub);
5767       RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5768       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5769       trampoline_call(stub);
5770       b(after_init);
5771   }
5772 
5773   // Unpack the bytes 8 at a time.
5774   bind(big);
5775   {
5776     Label loop, around, loop_last, loop_start;
5777 
5778     if (SoftwarePrefetchHintDistance >= 0) {
5779       const int large_loop_threshold = (64 + 16)/8;
5780       ldrd(vtmp2, post(src, 8));
5781       andw(len, len, 7);
5782       cmp(tmp4, (u1)large_loop_threshold);
5783       br(GE, to_stub);
5784       b(loop_start);
5785 
5786       bind(loop);
5787       ldrd(vtmp2, post(src, 8));
5788       bind(loop_start);
5789       subs(tmp4, tmp4, 1);
5790       br(EQ, loop_last);
5791       zip1(vtmp2, T16B, vtmp2, vtmp1);
5792       ldrd(vtmp3, post(src, 8));
5793       st1(vtmp2, T8H, post(dst, 16));
5794       subs(tmp4, tmp4, 1);
5795       zip1(vtmp3, T16B, vtmp3, vtmp1);
5796       st1(vtmp3, T8H, post(dst, 16));
5797       br(NE, loop);
5798       b(around);
5799       bind(loop_last);
5800       zip1(vtmp2, T16B, vtmp2, vtmp1);
5801       st1(vtmp2, T8H, post(dst, 16));
5802       bind(around);
5803       cbz(len, done);
5804     } else {
5805       andw(len, len, 7);
5806       bind(loop);
5807       ldrd(vtmp2, post(src, 8));
5808       sub(tmp4, tmp4, 1);
5809       zip1(vtmp3, T16B, vtmp2, vtmp1);
5810       st1(vtmp3, T8H, post(dst, 16));
5811       cbnz(tmp4, loop);
5812     }
5813   }
5814 
5815   // Do the tail of up to 8 bytes.
5816   add(src, src, len);
5817   ldrd(vtmp3, Address(src, -8));
5818   add(dst, dst, len, ext::uxtw, 1);
5819   zip1(vtmp3, T16B, vtmp3, vtmp1);
5820   strq(vtmp3, Address(dst, -16));
5821 
5822   bind(done);
5823 }
5824 
5825 // Compress char[] array to byte[].
5826 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5827                                          FloatRegister tmp1Reg, FloatRegister tmp2Reg,
5828                                          FloatRegister tmp3Reg, FloatRegister tmp4Reg,
5829                                          Register result) {
5830   encode_iso_array(src, dst, len, result,
5831                    tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
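       // encode_iso_array leaves in len the number of characters it did not
       // process; a non-zero remainder means a char above 0xff was found, and
       // compress must then return zero.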
5832   cmp(len, zr);
5833   csel(result, result, zr, EQ);
5834 }
5835 
5836 // get_thread() can be called anywhere inside generated code so we
5837 // need to save whatever non-callee save context might get clobbered
5838 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5839 // the call setup code.
5840 //
5841 // aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5842 //
5843 void MacroAssembler::get_thread(Register dst) {
5844   RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
5845   push(saved_regs, sp);
5846 
5847   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5848   blrt(lr, 1, 0, 1);
5849   if (dst != c_rarg0) {
5850     mov(dst, c_rarg0);
5851   }
5852 
5853   pop(saved_regs, sp);
5854 }