/*
 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"
#ifdef COMPILER1
#include "c1/c1_LIRAssembler.hpp"
#endif
#ifdef COMPILER2
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target - branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
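      // An illustrative sketch (the register and offset are hypothetical):
      // after patching a type 2 sequence whose target has offset_in_page
      // 0x123, the code looks like
      //   adrp x0, <target_page>   // bits [23:5] and [30:29] hold the page delta
      //   add  x0, x0, #0x123      // bits [21:10] hold dest & 0xfff
      //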
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
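  //
  // An illustrative sketch of the two patched sequences (the register
  // choice is hypothetical):
  //   narrow: movz Rx, #(n >> 16), lsl #16
  //           movk Rx, #(n & 0xffff)
  //   wide:   movz Rx, #(dest & 0xffff)
  //           movk Rx, #((dest >> 16) & 0xffff), lsl #16
  //           movk Rx, #((dest >> 32) & 0xffff), lsl #32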
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
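      // An illustrative sketch (register and offset are hypothetical): given
      //   adrp x0, <page>          // at pc P
      //   add  x0, x0, #0x123
      // this returns align_down(P, 4096) + page_delta * 4096 + 0x123.
      //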
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
          Instruction_aarch64::extract(insn, 4, 0) ==
          Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                 Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
            Instruction_aarch64::extract(insn, 4, 0) ==
            Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                        ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp and sp of the last Java frame have to be
// recorded in the (thread-local) JavaThread object. When leaving C land,
// the last Java fp has to be reset to 0. This is required to allow proper
// stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  if (last_java_pc != NULL) {
    adr(scratch, last_java_pc);
  } else {
    // FIXME: This is almost never correct.  We should delete all
    // cases of set_last_Java_frame with last_java_pc=NULL and use the
    // correct return address instead.
    adr(scratch, pc());
  }

  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb (see the assert above).
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb (see the assert above).
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);
  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go into the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result,   "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)
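//
// An illustrative sketch of the emitted stub (the label is hypothetical):
//   ldr rscratch1, 0f   // load the 64-bit destination from the literal below
//   br  rscratch1       // LR still points at the original call site
// 0:
//   .quad <destination>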

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                              + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
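  // e.g. (hypothetical values) x == 0x0100 -> low byte 0x00 -> result 0;
  //      x == 0x0001 -> low byte 0x01 -> result 1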
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}

void MacroAssembler::notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler::notify(type);
    // reset_last_Java_frame(true);
  } else {
    Assembler::notify(type);
  }
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer-sized words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// scans count 4-byte words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                 Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  mov(r0, reg);
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    lea(r0, addr);
    ldr(r0, Address(r0, 4 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1361   ldr(rscratch2, Address(rscratch2));
1362   blr(rscratch2);
1363 
1364   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1365   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1366 
1367   BLOCK_COMMENT("} verify_oop_addr");
1368 }
1369 
1370 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1371                                          int extra_slot_offset) {
1372   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1373   int stackElementSize = Interpreter::stackElementSize;
1374   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1375 #ifdef ASSERT
1376   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1377   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1378 #endif
1379   if (arg_slot.is_constant()) {
1380     return Address(esp, arg_slot.as_constant() * stackElementSize
1381                    + offset);
1382   } else {
1383     add(rscratch1, esp, arg_slot.as_register(),
1384         ext::uxtx, exact_log2(stackElementSize));
1385     return Address(rscratch1, offset);
1386   }
1387 }
1388 
1389 void MacroAssembler::call_VM_leaf_base(address entry_point,
1390                                        int number_of_arguments,
1391                                        Label *retaddr) {
1392   call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
1393 }
1394 
1395 void MacroAssembler::call_VM_leaf_base1(address entry_point,
1396                                         int number_of_gp_arguments,
1397                                         int number_of_fp_arguments,
1398                                         ret_type type,
1399                                         Label *retaddr) {
1400   Label E, L;
1401 
1402   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1403 
1404   // We add 1 to number_of_gp_arguments because the thread in arg0 is
1405   // not counted
1406   mov(rscratch1, entry_point);
1407   blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1408   if (retaddr)
1409     bind(*retaddr);
1410 
1411   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1412   maybe_isb();
1413 }
1414 
1415 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1416   call_VM_leaf_base(entry_point, number_of_arguments);
1417 }
1418 
1419 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1420   pass_arg0(this, arg_0);
1421   call_VM_leaf_base(entry_point, 1);
1422 }
1423 
1424 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1425   pass_arg0(this, arg_0);
1426   pass_arg1(this, arg_1);
1427   call_VM_leaf_base(entry_point, 2);
1428 }
1429 
1430 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1431                                   Register arg_1, Register arg_2) {
1432   pass_arg0(this, arg_0);
1433   pass_arg1(this, arg_1);
1434   pass_arg2(this, arg_2);
1435   call_VM_leaf_base(entry_point, 3);
1436 }
1437 
1438 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1439   pass_arg0(this, arg_0);
1440   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1441 }
1442 
1443 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1444 
1445   assert(arg_0 != c_rarg1, "smashed arg");
1446   pass_arg1(this, arg_1);
1447   pass_arg0(this, arg_0);
1448   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1449 }
1450 
1451 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1452   assert(arg_0 != c_rarg2, "smashed arg");
1453   assert(arg_1 != c_rarg2, "smashed arg");
1454   pass_arg2(this, arg_2);
1455   assert(arg_0 != c_rarg1, "smashed arg");
1456   pass_arg1(this, arg_1);
1457   pass_arg0(this, arg_0);
1458   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1459 }
1460 
1461 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1462   assert(arg_0 != c_rarg3, "smashed arg");
1463   assert(arg_1 != c_rarg3, "smashed arg");
1464   assert(arg_2 != c_rarg3, "smashed arg");
1465   pass_arg3(this, arg_3);
1466   assert(arg_0 != c_rarg2, "smashed arg");
1467   assert(arg_1 != c_rarg2, "smashed arg");
1468   pass_arg2(this, arg_2);
1469   assert(arg_0 != c_rarg1, "smashed arg");
1470   pass_arg1(this, arg_1);
1471   pass_arg0(this, arg_0);
1472   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1473 }
1474 
1475 void MacroAssembler::null_check(Register reg, int offset) {
1476   if (needs_explicit_null_check(offset)) {
1477     // provoke OS NULL exception if reg == NULL by
1478     // accessing M[reg] w/o changing any registers
1479     // NOTE: this is plenty to provoke a segv
1480     ldr(zr, Address(reg));
1481   } else {
1482     // nothing to do, (later) access of M[reg + offset]
1483     // will provoke OS NULL exception if reg == NULL
1484   }
1485 }
1486 
1487 // MacroAssembler protected routines needed to implement
1488 // public methods
1489 
1490 void MacroAssembler::mov(Register r, Address dest) {
1491   code_section()->relocate(pc(), dest.rspec());
1492   u_int64_t imm64 = (u_int64_t)dest.target();
1493   movptr(r, imm64);
1494 }
1495 
1496 // Move a constant pointer into r.  In AArch64 mode the virtual
1497 // address space is 48 bits in size, so we only need three
1498 // instructions to create a patchable instruction sequence that can
1499 // reach anywhere.
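     // For example (illustrative only), movptr(r, 0x123456789abc) emits:
     //   movz r, #0x9abc             // bits 15:0
     //   movk r, #0x5678, lsl #16    // bits 31:16
     //   movk r, #0x1234, lsl #32    // bits 47:32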
1500 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1501 #ifndef PRODUCT
1502   {
1503     char buffer[64];
1504     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1505     block_comment(buffer);
1506   }
1507 #endif
1508   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1509   movz(r, imm64 & 0xffff);
1510   imm64 >>= 16;
1511   movk(r, imm64 & 0xffff, 16);
1512   imm64 >>= 16;
1513   movk(r, imm64 & 0xffff, 32);
1514 }
1515 
1516 // Macro to mov replicated immediate to vector register.
1517 //  Vd will get the following values for different arrangements in T
1518 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1519 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1520 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1521 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1522 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1523 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1524 //   T1D/T2D: invalid
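     // As an illustration: for T4S and imm32 == 0xff00ffff, the complement
     // 0x00ff0000 has only one non-zero byte, so the code below emits a
     // single mvni(Vd, T4S, 0xff, 16) rather than a movi/orri sequence.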
1525 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1526   assert(T != T1D && T != T2D, "invalid arrangement");
1527   if (T == T8B || T == T16B) {
1528     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1529     movi(Vd, T, imm32 & 0xff, 0);
1530     return;
1531   }
1532   u_int32_t nimm32 = ~imm32;
1533   if (T == T4H || T == T8H) {
1534     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1535     imm32 &= 0xffff;
1536     nimm32 &= 0xffff;
1537   }
1538   u_int32_t x = imm32;
1539   int movi_cnt = 0;
1540   int movn_cnt = 0;
1541   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1542   x = nimm32;
1543   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1544   if (movn_cnt < movi_cnt) imm32 = nimm32;
1545   unsigned lsl = 0;
1546   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1547   if (movn_cnt < movi_cnt)
1548     mvni(Vd, T, imm32 & 0xff, lsl);
1549   else
1550     movi(Vd, T, imm32 & 0xff, lsl);
1551   imm32 >>= 8; lsl += 8;
1552   while (imm32) {
1553     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1554     if (movn_cnt < movi_cnt)
1555       bici(Vd, T, imm32 & 0xff, lsl);
1556     else
1557       orri(Vd, T, imm32 & 0xff, lsl);
1558     lsl += 8; imm32 >>= 8;
1559   }
1560 }
1561 
1562 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1563 {
1564 #ifndef PRODUCT
1565   {
1566     char buffer[64];
1567     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1568     block_comment(buffer);
1569   }
1570 #endif
1571   if (operand_valid_for_logical_immediate(false, imm64)) {
1572     orr(dst, zr, imm64);
1573   } else {
1574     // we can use a combination of MOVZ or MOVN with
1575     // MOVK to build up the constant
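         // e.g. (illustrative) 0x00000000dead0000 has three zero halfwords,
         // so a single movz(dst, 0xdead, 16) suffices; 0xffffffffdeadffff has
         // three 0xffff halfwords, so movn(dst, 0xdead ^ 0xffff, 16) suffices.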
1576     u_int64_t imm_h[4];
1577     int zero_count = 0;
1578     int neg_count = 0;
1579     int i;
1580     for (i = 0; i < 4; i++) {
1581       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1582       if (imm_h[i] == 0) {
1583         zero_count++;
1584       } else if (imm_h[i] == 0xffffL) {
1585         neg_count++;
1586       }
1587     }
1588     if (zero_count == 4) {
1589       // one MOVZ will do
1590       movz(dst, 0);
1591     } else if (neg_count == 4) {
1592       // one MOVN will do
1593       movn(dst, 0);
1594     } else if (zero_count == 3) {
1595       for (i = 0; i < 4; i++) {
1596         if (imm_h[i] != 0L) {
1597           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1598           break;
1599         }
1600       }
1601     } else if (neg_count == 3) {
1602       // one MOVN will do
1603       for (int i = 0; i < 4; i++) {
1604         if (imm_h[i] != 0xffffL) {
1605           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1606           break;
1607         }
1608       }
1609     } else if (zero_count == 2) {
1610       // one MOVZ and one MOVK will do
1611       for (i = 0; i < 3; i++) {
1612         if (imm_h[i] != 0L) {
1613           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1614           i++;
1615           break;
1616         }
1617       }
1618       for (;i < 4; i++) {
1619         if (imm_h[i] != 0L) {
1620           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1621         }
1622       }
1623     } else if (neg_count == 2) {
1624       // one MOVN and one MOVK will do
1625       for (i = 0; i < 4; i++) {
1626         if (imm_h[i] != 0xffffL) {
1627           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1628           i++;
1629           break;
1630         }
1631       }
1632       for (;i < 4; i++) {
1633         if (imm_h[i] != 0xffffL) {
1634           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1635         }
1636       }
1637     } else if (zero_count == 1) {
1638       // one MOVZ and two MOVKs will do
1639       for (i = 0; i < 4; i++) {
1640         if (imm_h[i] != 0L) {
1641           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1642           i++;
1643           break;
1644         }
1645       }
1646       for (;i < 4; i++) {
1647         if (imm_h[i] != 0x0L) {
1648           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1649         }
1650       }
1651     } else if (neg_count == 1) {
1652       // one MOVN and two MOVKs will do
1653       for (i = 0; i < 4; i++) {
1654         if (imm_h[i] != 0xffffL) {
1655           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1656           i++;
1657           break;
1658         }
1659       }
1660       for (;i < 4; i++) {
1661         if (imm_h[i] != 0xffffL) {
1662           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1663         }
1664       }
1665     } else {
1666       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1667       movz(dst, (u_int32_t)imm_h[0], 0);
1668       for (i = 1; i < 4; i++) {
1669         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1670       }
1671     }
1672   }
1673 }
1674 
1675 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1676 {
1677 #ifndef PRODUCT
1678     {
1679       char buffer[64];
1680       snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1681       block_comment(buffer);
1682     }
1683 #endif
1684   if (operand_valid_for_logical_immediate(true, imm32)) {
1685     orrw(dst, zr, imm32);
1686   } else {
1687     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1688     // constant
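         // e.g. (illustrative) 0x0000beef needs only movzw(dst, 0xbeef, 0),
         // while 0xffffbeef needs only movnw(dst, 0xbeef ^ 0xffff, 0).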
1689     u_int32_t imm_h[2];
1690     imm_h[0] = imm32 & 0xffff;
1691     imm_h[1] = ((imm32 >> 16) & 0xffff);
1692     if (imm_h[0] == 0) {
1693       movzw(dst, imm_h[1], 16);
1694     } else if (imm_h[0] == 0xffff) {
1695       movnw(dst, imm_h[1] ^ 0xffff, 16);
1696     } else if (imm_h[1] == 0) {
1697       movzw(dst, imm_h[0], 0);
1698     } else if (imm_h[1] == 0xffff) {
1699       movnw(dst, imm_h[0] ^ 0xffff, 0);
1700     } else {
1701       // use a MOVZ and MOVK (makes it easier to debug)
1702       movzw(dst, imm_h[0], 0);
1703       movkw(dst, imm_h[1], 16);
1704     }
1705   }
1706 }
1707 
1708 // Form an address from base + offset in Rd.  Rd may or may
1709 // not actually be used: you must use the Address that is returned.
1710 // It is up to you to ensure that the shift provided matches the size
1711 // of your data.
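     // For example (illustrative), form_address(Rd, base, 0x41000, 3) cannot
     // use a scaled 12-bit immediate, so it emits add(Rd, base, 0x40000) and
     // returns Address(Rd, 0x1000).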
1712 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1713   if (Address::offset_ok_for_immed(byte_offset, shift))
1714     // It fits; no need for any heroics
1715     return Address(base, byte_offset);
1716 
1717   // Don't do anything clever with negative or misaligned offsets
1718   unsigned mask = (1 << shift) - 1;
1719   if (byte_offset < 0 || byte_offset & mask) {
1720     mov(Rd, byte_offset);
1721     add(Rd, base, Rd);
1722     return Address(Rd);
1723   }
1724 
1725   // See if we can do this with two 12-bit offsets
1726   {
1727     unsigned long word_offset = byte_offset >> shift;
1728     unsigned long masked_offset = word_offset & 0xfff000;
1729     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1730         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1731       add(Rd, base, masked_offset << shift);
1732       word_offset -= masked_offset;
1733       return Address(Rd, word_offset << shift);
1734     }
1735   }
1736 
1737   // Do it the hard way
1738   mov(Rd, byte_offset);
1739   add(Rd, base, Rd);
1740   return Address(Rd);
1741 }
1742 
1743 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1744   if (UseLSE) {
1745     mov(tmp, 1);
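         // With LSE, a single ldadd performs the whole read-modify-write
         // atomically; the prior value is discarded by writing it to zr.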
1746     ldadd(Assembler::word, tmp, zr, counter_addr);
1747     return;
1748   }
1749   Label retry_load;
1750   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1751     prfm(Address(counter_addr), PSTL1STRM);
1752   bind(retry_load);
1753   // flush and load exclusive from the memory location
1754   ldxrw(tmp, counter_addr);
1755   addw(tmp, tmp, 1);
1756   // if we store+flush with no intervening write, tmp2 will be zero
1757   stxrw(tmp2, tmp, counter_addr);
1758   cbnzw(tmp2, retry_load);
1759 }
1760 
1761 
1762 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1763                                     bool want_remainder, Register scratch)
1764 {
1765   // Full implementation of Java idiv and irem.  The function
1766   // returns the (pc) offset of the div instruction - may be needed
1767   // for implicit exceptions.
1768   //
1769   // constraint : ra/rb =/= scratch
1770   //         normal case
1771   //
1772   // input : ra: dividend
1773   //         rb: divisor
1774   //
1775   // result: either
1776   //         quotient  (= ra idiv rb)
1777   //         remainder (= ra irem rb)
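       //
       // The remainder path below computes ra - (ra / rb) * rb with msubw,
       // matching Java irem semantics (the remainder takes the sign of the
       // dividend).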
1778 
1779   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1780 
1781   int idivl_offset = offset();
1782   if (! want_remainder) {
1783     sdivw(result, ra, rb);
1784   } else {
1785     sdivw(scratch, ra, rb);
1786     Assembler::msubw(result, scratch, rb, ra);
1787   }
1788 
1789   return idivl_offset;
1790 }
1791 
1792 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1793                                     bool want_remainder, Register scratch)
1794 {
1795   // Full implementation of Java ldiv and lrem.  The function
1796   // returns the (pc) offset of the div instruction - may be needed
1797   // for implicit exceptions.
1798   //
1799   // constraint : ra/rb =/= scratch
1800   //         normal case
1801   //
1802   // input : ra: dividend
1803   //         rb: divisor
1804   //
1805   // result: either
1806   //         quotient  (= ra idiv rb)
1807   //         remainder (= ra irem rb)
1808 
1809   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1810 
1811   int idivq_offset = offset();
1812   if (! want_remainder) {
1813     sdiv(result, ra, rb);
1814   } else {
1815     sdiv(scratch, ra, rb);
1816     Assembler::msub(result, scratch, rb, ra);
1817   }
1818 
1819   return idivq_offset;
1820 }
1821 
1822 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1823   address prev = pc() - NativeMembar::instruction_size;
1824   address last = code()->last_insn();
1825   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1826     NativeMembar *bar = NativeMembar_at(prev);
1827     // We are merging two memory barrier instructions.  On AArch64 we
1828     // can do this simply by ORing them together.
1829     bar->set_kind(bar->get_kind() | order_constraint);
1830     BLOCK_COMMENT("merged membar");
1831   } else {
1832     code()->set_last_insn(pc());
1833     dmb(Assembler::barrier(order_constraint));
1834   }
1835 }
1836 
1837 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1838   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1839     merge_ldst(rt, adr, size_in_bytes, is_store);
1840     code()->clear_last_insn();
1841     return true;
1842   } else {
1843     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8 bytes or 4 bytes load/store is supported.");
1844     const unsigned mask = size_in_bytes - 1;
1845     if (adr.getMode() == Address::base_plus_offset &&
1846         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1847       code()->set_last_insn(pc());
1848     }
1849     return false;
1850   }
1851 }
1852 
1853 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1854   // We always try to merge two adjacent loads into one ldp.
1855   if (!try_merge_ldst(Rx, adr, 8, false)) {
1856     Assembler::ldr(Rx, adr);
1857   }
1858 }
1859 
1860 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1861   // We always try to merge two adjacent loads into one ldp.
1862   if (!try_merge_ldst(Rw, adr, 4, false)) {
1863     Assembler::ldrw(Rw, adr);
1864   }
1865 }
1866 
1867 void MacroAssembler::str(Register Rx, const Address &adr) {
1868   // We always try to merge two adjacent stores into one stp.
1869   if (!try_merge_ldst(Rx, adr, 8, true)) {
1870     Assembler::str(Rx, adr);
1871   }
1872 }
1873 
1874 void MacroAssembler::strw(Register Rw, const Address &adr) {
1875   // We always try to merge two adjacent stores into one stp.
1876   if (!try_merge_ldst(Rw, adr, 4, true)) {
1877     Assembler::strw(Rw, adr);
1878   }
1879 }
1880 
1881 // MacroAssembler routines found actually to be needed
1882 
1883 void MacroAssembler::push(Register src)
1884 {
1885   str(src, Address(pre(esp, -1 * wordSize)));
1886 }
1887 
1888 void MacroAssembler::pop(Register dst)
1889 {
1890   ldr(dst, Address(post(esp, 1 * wordSize)));
1891 }
1892 
1893 // Note: load_unsigned_short used to be called load_unsigned_word.
1894 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1895   int off = offset();
1896   ldrh(dst, src);
1897   return off;
1898 }
1899 
1900 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1901   int off = offset();
1902   ldrb(dst, src);
1903   return off;
1904 }
1905 
1906 int MacroAssembler::load_signed_short(Register dst, Address src) {
1907   int off = offset();
1908   ldrsh(dst, src);
1909   return off;
1910 }
1911 
1912 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1913   int off = offset();
1914   ldrsb(dst, src);
1915   return off;
1916 }
1917 
1918 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1919   int off = offset();
1920   ldrshw(dst, src);
1921   return off;
1922 }
1923 
1924 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1925   int off = offset();
1926   ldrsbw(dst, src);
1927   return off;
1928 }
1929 
1930 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1931   switch (size_in_bytes) {
1932   case  8:  ldr(dst, src); break;
1933   case  4:  ldrw(dst, src); break;
1934   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1935   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1936   default:  ShouldNotReachHere();
1937   }
1938 }
1939 
1940 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1941   switch (size_in_bytes) {
1942   case  8:  str(src, dst); break;
1943   case  4:  strw(src, dst); break;
1944   case  2:  strh(src, dst); break;
1945   case  1:  strb(src, dst); break;
1946   default:  ShouldNotReachHere();
1947   }
1948 }
1949 
1950 void MacroAssembler::decrementw(Register reg, int value)
1951 {
1952   if (value < 0)  { incrementw(reg, -value);      return; }
1953   if (value == 0) {                               return; }
1954   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1955   /* else */ {
1956     guarantee(reg != rscratch2, "invalid dst for register decrement");
1957     movw(rscratch2, (unsigned)value);
1958     subw(reg, reg, rscratch2);
1959   }
1960 }
1961 
1962 void MacroAssembler::decrement(Register reg, int value)
1963 {
1964   if (value < 0)  { increment(reg, -value);      return; }
1965   if (value == 0) {                              return; }
1966   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1967   /* else */ {
1968     assert(reg != rscratch2, "invalid dst for register decrement");
1969     mov(rscratch2, (unsigned long)value);
1970     sub(reg, reg, rscratch2);
1971   }
1972 }
1973 
1974 void MacroAssembler::decrementw(Address dst, int value)
1975 {
1976   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1977   if (dst.getMode() == Address::literal) {
1978     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1979     lea(rscratch2, dst);
1980     dst = Address(rscratch2);
1981   }
1982   ldrw(rscratch1, dst);
1983   decrementw(rscratch1, value);
1984   strw(rscratch1, dst);
1985 }
1986 
1987 void MacroAssembler::decrement(Address dst, int value)
1988 {
1989   assert(!dst.uses(rscratch1), "invalid address for decrement");
1990   if (dst.getMode() == Address::literal) {
1991     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1992     lea(rscratch2, dst);
1993     dst = Address(rscratch2);
1994   }
1995   ldr(rscratch1, dst);
1996   decrement(rscratch1, value);
1997   str(rscratch1, dst);
1998 }
1999 
2000 void MacroAssembler::incrementw(Register reg, int value)
2001 {
2002   if (value < 0)  { decrementw(reg, -value);      return; }
2003   if (value == 0) {                               return; }
2004   if (value < (1 << 12)) { addw(reg, reg, value); return; }
2005   /* else */ {
2006     assert(reg != rscratch2, "invalid dst for register increment");
2007     movw(rscratch2, (unsigned)value);
2008     addw(reg, reg, rscratch2);
2009   }
2010 }
2011 
2012 void MacroAssembler::increment(Register reg, int value)
2013 {
2014   if (value < 0)  { decrement(reg, -value);      return; }
2015   if (value == 0) {                              return; }
2016   if (value < (1 << 12)) { add(reg, reg, value); return; }
2017   /* else */ {
2018     assert(reg != rscratch2, "invalid dst for register increment");
2019     movw(rscratch2, (unsigned)value);
2020     add(reg, reg, rscratch2);
2021   }
2022 }
2023 
2024 void MacroAssembler::incrementw(Address dst, int value)
2025 {
2026   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2027   if (dst.getMode() == Address::literal) {
2028     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2029     lea(rscratch2, dst);
2030     dst = Address(rscratch2);
2031   }
2032   ldrw(rscratch1, dst);
2033   incrementw(rscratch1, value);
2034   strw(rscratch1, dst);
2035 }
2036 
2037 void MacroAssembler::increment(Address dst, int value)
2038 {
2039   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2040   if (dst.getMode() == Address::literal) {
2041     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2042     lea(rscratch2, dst);
2043     dst = Address(rscratch2);
2044   }
2045   ldr(rscratch1, dst);
2046   increment(rscratch1, value);
2047   str(rscratch1, dst);
2048 }
2049 
2050 
2051 void MacroAssembler::pusha() {
2052   push(0x7fffffff, sp);
2053 }
2054 
2055 void MacroAssembler::popa() {
2056   pop(0x7fffffff, sp);
2057 }
2058 
2059 // Push lots of registers in the bit set supplied.  Don't push sp.
2060 // Return the number of words pushed
2061 int MacroAssembler::push(unsigned int bitset, Register stack) {
2062   int words_pushed = 0;
2063 
2064   // Scan bitset to accumulate register pairs
2065   unsigned char regs[32];
2066   int count = 0;
2067   for (int reg = 0; reg <= 30; reg++) {
2068     if (1 & bitset)
2069       regs[count++] = reg;
2070     bitset >>= 1;
2071   }
2072   regs[count++] = zr->encoding_nocheck();
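       // zr is appended as a filler so stp always sees a pair: if the real
       // count is odd the filler is pushed, otherwise the masking below
       // drops it.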
2073   count &= ~1;  // Only push an even number of regs
2074 
2075   if (count) {
2076     stp(as_Register(regs[0]), as_Register(regs[1]),
2077        Address(pre(stack, -count * wordSize)));
2078     words_pushed += 2;
2079   }
2080   for (int i = 2; i < count; i += 2) {
2081     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2082        Address(stack, i * wordSize));
2083     words_pushed += 2;
2084   }
2085 
2086   assert(words_pushed == count, "oops, pushed != count");
2087 
2088   return count;
2089 }
2090 
2091 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2092   int words_pushed = 0;
2093 
2094   // Scan bitset to accumulate register pairs
2095   unsigned char regs[32];
2096   int count = 0;
2097   for (int reg = 0; reg <= 30; reg++) {
2098     if (1 & bitset)
2099       regs[count++] = reg;
2100     bitset >>= 1;
2101   }
2102   regs[count++] = zr->encoding_nocheck();
2103   count &= ~1;
2104 
2105   for (int i = 2; i < count; i += 2) {
2106     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2107        Address(stack, i * wordSize));
2108     words_pushed += 2;
2109   }
2110   if (count) {
2111     ldp(as_Register(regs[0]), as_Register(regs[1]),
2112        Address(post(stack, count * wordSize)));
2113     words_pushed += 2;
2114   }
2115 
2116   assert(words_pushed == count, "oops, pushed != count");
2117 
2118   return count;
2119 }
2120 #ifdef ASSERT
2121 void MacroAssembler::verify_heapbase(const char* msg) {
2122 #if 0
2123   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2124   assert (Universe::heap() != NULL, "java heap should be initialized");
2125   if (CheckCompressedOops) {
2126     Label ok;
2127     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2128     cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2129     br(Assembler::EQ, ok);
2130     stop(msg);
2131     bind(ok);
2132     pop(1 << rscratch1->encoding(), sp);
2133   }
2134 #endif
2135 }
2136 #endif
2137 
2138 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2139   Label done, not_weak;
2140   cbz(value, done);           // Use NULL as-is.
2141 
2142   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
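       // jweak handles are tagged in their low bit, so testing bit 0
       // distinguishes weak from strong (untagged) handles.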
2143   tbz(value, 0, not_weak);    // Test for jweak tag.
2144 
2145   // Resolve jweak.
2146   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2147                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2148   verify_oop(value);
2149   b(done);
2150 
2151   bind(not_weak);
2152   // Resolve (untagged) jobject.
2153   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2154   verify_oop(value);
2155   bind(done);
2156 }
2157 
2158 void MacroAssembler::stop(const char* msg) {
2159   address ip = pc();
2160   pusha();
2161   mov(c_rarg0, (address)msg);
2162   mov(c_rarg1, (address)ip);
2163   mov(c_rarg2, sp);
2164   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2165   // call(c_rarg3);
2166   blrt(c_rarg3, 3, 0, 1);
2167   hlt(0);
2168 }
2169 
2170 void MacroAssembler::unimplemented(const char* what) {
2171   const char* buf = NULL;
2172   {
2173     ResourceMark rm;
2174     stringStream ss;
2175     ss.print("unimplemented: %s", what);
2176     buf = code_string(ss.as_string());
2177   }
2178   stop(buf);
2179 }
2180 
2181 // If a constant does not fit in an immediate field, generate some
2182 // number of MOV instructions and then perform the operation.
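     // For a worked (illustrative) example, an immediate of 0x123456 is too
     // wide for one add/sub immediate but splits into two legal 12-bit
     // chunks: insn1(Rd, Rn, 0x123000) followed by insn1(Rd, Rd, 0x456).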
2183 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2184                                            add_sub_imm_insn insn1,
2185                                            add_sub_reg_insn insn2) {
2186   assert(Rd != zr, "Rd = zr and not setting flags?");
2187   if (operand_valid_for_add_sub_immediate((int)imm)) {
2188     (this->*insn1)(Rd, Rn, imm);
2189   } else {
2190     if (uabs(imm) < (1 << 24)) {
2191        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2192        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2193     } else {
2194        assert_different_registers(Rd, Rn);
2195        mov(Rd, (uint64_t)imm);
2196        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2197     }
2198   }
2199 }
2200 
2201 // Separate version which sets the flags. Optimisations are more restricted
2202 // because we must set the flags correctly.
2203 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2204                                            add_sub_imm_insn insn1,
2205                                            add_sub_reg_insn insn2) {
2206   if (operand_valid_for_add_sub_immediate((int)imm)) {
2207     (this->*insn1)(Rd, Rn, imm);
2208   } else {
2209     assert_different_registers(Rd, Rn);
2210     assert(Rd != zr, "overflow in immediate operand");
2211     mov(Rd, (uint64_t)imm);
2212     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2213   }
2214 }
2215 
2216 
2217 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2218   if (increment.is_register()) {
2219     add(Rd, Rn, increment.as_register());
2220   } else {
2221     add(Rd, Rn, increment.as_constant());
2222   }
2223 }
2224 
2225 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2226   if (increment.is_register()) {
2227     addw(Rd, Rn, increment.as_register());
2228   } else {
2229     addw(Rd, Rn, increment.as_constant());
2230   }
2231 }
2232 
2233 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2234   if (decrement.is_register()) {
2235     sub(Rd, Rn, decrement.as_register());
2236   } else {
2237     sub(Rd, Rn, decrement.as_constant());
2238   }
2239 }
2240 
2241 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2242   if (decrement.is_register()) {
2243     subw(Rd, Rn, decrement.as_register());
2244   } else {
2245     subw(Rd, Rn, decrement.as_constant());
2246   }
2247 }
2248 
2249 void MacroAssembler::reinit_heapbase()
2250 {
2251   if (UseCompressedOops) {
2252     if (Universe::is_fully_initialized()) {
2253       mov(rheapbase, Universe::narrow_ptrs_base());
2254     } else {
2255       lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2256       ldr(rheapbase, Address(rheapbase));
2257     }
2258   }
2259 }
2260 
2261 // this simulates the behaviour of the x86 cmpxchg instruction using a
2262 // load linked/store conditional pair. we use the acquire/release
2263 // versions of these instructions so that we flush pending writes as
2264 // per Java semantics.
2265 
2266 // n.b. the x86 version assumes the old value to be compared against is
2267 // in rax and updates rax with the value located in memory if the
2268 // cmpxchg fails. we supply a register for the old value explicitly
2269 
2270 // the aarch64 load linked/store conditional instructions do not
2271 // accept an offset. so, unlike x86, we must provide a plain register
2272 // to identify the memory word to be compared/exchanged rather than a
2273 // register+offset Address.
2274 
2275 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2276                                 Label &succeed, Label *fail) {
2277   // oldv holds comparison value
2278   // newv holds value to write in exchange
2279   // addr identifies memory word to compare against/update
2280   if (UseLSE) {
2281     mov(tmp, oldv);
2282     casal(Assembler::xword, oldv, newv, addr);
2283     cmp(tmp, oldv);
2284     br(Assembler::EQ, succeed);
2285     membar(AnyAny);
2286   } else {
2287     Label retry_load, nope;
2288     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2289       prfm(Address(addr), PSTL1STRM);
2290     bind(retry_load);
2291     // flush and load exclusive from the memory location
2292     // and fail if it is not what we expect
2293     ldaxr(tmp, addr);
2294     cmp(tmp, oldv);
2295     br(Assembler::NE, nope);
2296     // if we store+flush with no intervening write, tmp will be zero
2297     stlxr(tmp, newv, addr);
2298     cbzw(tmp, succeed);
2299     // retry so we only ever return after a load fails to compare
2300     // ensures we don't return a stale value after a failed write.
2301     b(retry_load);
2302     // if the memory word differs we return it in oldv and signal a fail
2303     bind(nope);
2304     membar(AnyAny);
2305     mov(oldv, tmp);
2306   }
2307   if (fail)
2308     b(*fail);
2309 }
2310 
2311 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2312                                         Label &succeed, Label *fail) {
2313   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2314   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2315 }
2316 
2317 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2318                                 Label &succeed, Label *fail) {
2319   // oldv holds comparison value
2320   // newv holds value to write in exchange
2321   // addr identifies memory word to compare against/update
2322   // tmp returns 0/1 for success/failure
2323   if (UseLSE) {
2324     mov(tmp, oldv);
2325     casal(Assembler::word, oldv, newv, addr);
2326     cmp(tmp, oldv);
2327     br(Assembler::EQ, succeed);
2328     membar(AnyAny);
2329   } else {
2330     Label retry_load, nope;
2331     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2332       prfm(Address(addr), PSTL1STRM);
2333     bind(retry_load);
2334     // flush and load exclusive from the memory location
2335     // and fail if it is not what we expect
2336     ldaxrw(tmp, addr);
2337     cmp(tmp, oldv);
2338     br(Assembler::NE, nope);
2339     // if we store+flush with no intervening write, tmp will be zero
2340     stlxrw(tmp, newv, addr);
2341     cbzw(tmp, succeed);
2342     // retry so we only ever return after a load fails to compare
2343     // ensures we don't return a stale value after a failed write.
2344     b(retry_load);
2345     // if the memory word differs we return it in oldv and signal a fail
2346     bind(nope);
2347     membar(AnyAny);
2348     mov(oldv, tmp);
2349   }
2350   if (fail)
2351     b(*fail);
2352 }
2353 
2354 // A generic CAS; success or failure is in the EQ flag.  A weak CAS
2355 // doesn't retry and may fail spuriously.  If the oldval is wanted,
2356 // pass a register for the result, otherwise pass noreg.
2357 
2358 // Clobbers rscratch1
2359 void MacroAssembler::cmpxchg(Register addr, Register expected,
2360                              Register new_val,
2361                              enum operand_size size,
2362                              bool acquire, bool release,
2363                              bool weak,
2364                              Register result) {
2365   if (result == noreg)  result = rscratch1;
2366   BLOCK_COMMENT("cmpxchg {");
2367   if (UseLSE) {
2368     mov(result, expected);
2369     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2370     compare_eq(result, expected, size);
2371   } else {
2372     Label retry_load, done;
2373     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2374       prfm(Address(addr), PSTL1STRM);
2375     bind(retry_load);
2376     load_exclusive(result, addr, size, acquire);
2377     compare_eq(result, expected, size);
2378     br(Assembler::NE, done);
2379     store_exclusive(rscratch1, new_val, addr, size, release);
2380     if (weak) {
2381       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2382     } else {
2383       cbnzw(rscratch1, retry_load);
2384     }
2385     bind(done);
2386   }
2387   BLOCK_COMMENT("} cmpxchg");
2388 }
2389 
2390 // A generic comparison. Only compares for equality, clobbers rscratch1.
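     // AArch64 has no sub-word cmp, so halfword and byte equality is tested
     // below by XORing the operands and AND-ing the result against the lane
     // mask (0xffff or 0xff).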
2391 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2392   if (size == xword) {
2393     cmp(rm, rn);
2394   } else if (size == word) {
2395     cmpw(rm, rn);
2396   } else if (size == halfword) {
2397     eorw(rscratch1, rm, rn);
2398     ands(zr, rscratch1, 0xffff);
2399   } else if (size == byte) {
2400     eorw(rscratch1, rm, rn);
2401     ands(zr, rscratch1, 0xff);
2402   } else {
2403     ShouldNotReachHere();
2404   }
2405 }
2406 
2407 
2408 static bool different(Register a, RegisterOrConstant b, Register c) {
2409   if (b.is_constant())
2410     return a != c;
2411   else
2412     return a != b.as_register() && a != c && b.as_register() != c;
2413 }
2414 
2415 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2416 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2417   if (UseLSE) {                                                         \
2418     prev = prev->is_valid() ? prev : zr;                                \
2419     if (incr.is_register()) {                                           \
2420       AOP(sz, incr.as_register(), prev, addr);                          \
2421     } else {                                                            \
2422       mov(rscratch2, incr.as_constant());                               \
2423       AOP(sz, rscratch2, prev, addr);                                   \
2424     }                                                                   \
2425     return;                                                             \
2426   }                                                                     \
2427   Register result = rscratch2;                                          \
2428   if (prev->is_valid())                                                 \
2429     result = different(prev, incr, addr) ? prev : rscratch2;            \
2430                                                                         \
2431   Label retry_load;                                                     \
2432   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2433     prfm(Address(addr), PSTL1STRM);                                     \
2434   bind(retry_load);                                                     \
2435   LDXR(result, addr);                                                   \
2436   OP(rscratch1, result, incr);                                          \
2437   STXR(rscratch2, rscratch1, addr);                                     \
2438   cbnzw(rscratch2, retry_load);                                         \
2439   if (prev->is_valid() && prev != result) {                             \
2440     IOP(prev, rscratch1, incr);                                         \
2441   }                                                                     \
2442 }
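     // If prev would alias incr or addr, the loop above uses rscratch2 as the
     // result register and reconstructs the previous value afterwards by
     // applying the inverse operation (IOP) to the freshly stored value.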
2443 
2444 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2445 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2446 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2447 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2448 
2449 #undef ATOMIC_OP
2450 
2451 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2452 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2453   if (UseLSE) {                                                         \
2454     prev = prev->is_valid() ? prev : zr;                                \
2455     AOP(sz, newv, prev, addr);                                          \
2456     return;                                                             \
2457   }                                                                     \
2458   Register result = rscratch2;                                          \
2459   if (prev->is_valid())                                                 \
2460     result = different(prev, newv, addr) ? prev : rscratch2;            \
2461                                                                         \
2462   Label retry_load;                                                     \
2463   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2464     prfm(Address(addr), PSTL1STRM);                                     \
2465   bind(retry_load);                                                     \
2466   LDXR(result, addr);                                                   \
2467   STXR(rscratch1, newv, addr);                                          \
2468   cbnzw(rscratch1, retry_load);                                         \
2469   if (prev->is_valid() && prev != result)                               \
2470     mov(prev, result);                                                  \
2471 }
2472 
2473 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2474 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2475 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2476 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2477 
2478 #undef ATOMIC_XCHG
2479 
2480 #ifndef PRODUCT
2481 extern "C" void findpc(intptr_t x);
2482 #endif
2483 
2484 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2485 {
2486   // In order to get locks to work, we need to fake an in_VM state
2487   if (ShowMessageBoxOnError ) {
2488     JavaThread* thread = JavaThread::current();
2489     JavaThreadState saved_state = thread->thread_state();
2490     thread->set_thread_state(_thread_in_vm);
2491 #ifndef PRODUCT
2492     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2493       ttyLocker ttyl;
2494       BytecodeCounter::print();
2495     }
2496 #endif
2497     if (os::message_box(msg, "Execution stopped, print registers?")) {
2498       ttyLocker ttyl;
2499       tty->print_cr(" pc = 0x%016lx", pc);
2500 #ifndef PRODUCT
2501       tty->cr();
2502       findpc(pc);
2503       tty->cr();
2504 #endif
2505       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2506       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2507       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2508       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2509       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2510       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2511       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2512       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2513       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2514       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2515       tty->print_cr("r10 = 0x%016lx", regs[10]);
2516       tty->print_cr("r11 = 0x%016lx", regs[11]);
2517       tty->print_cr("r12 = 0x%016lx", regs[12]);
2518       tty->print_cr("r13 = 0x%016lx", regs[13]);
2519       tty->print_cr("r14 = 0x%016lx", regs[14]);
2520       tty->print_cr("r15 = 0x%016lx", regs[15]);
2521       tty->print_cr("r16 = 0x%016lx", regs[16]);
2522       tty->print_cr("r17 = 0x%016lx", regs[17]);
2523       tty->print_cr("r18 = 0x%016lx", regs[18]);
2524       tty->print_cr("r19 = 0x%016lx", regs[19]);
2525       tty->print_cr("r20 = 0x%016lx", regs[20]);
2526       tty->print_cr("r21 = 0x%016lx", regs[21]);
2527       tty->print_cr("r22 = 0x%016lx", regs[22]);
2528       tty->print_cr("r23 = 0x%016lx", regs[23]);
2529       tty->print_cr("r24 = 0x%016lx", regs[24]);
2530       tty->print_cr("r25 = 0x%016lx", regs[25]);
2531       tty->print_cr("r26 = 0x%016lx", regs[26]);
2532       tty->print_cr("r27 = 0x%016lx", regs[27]);
2533       tty->print_cr("r28 = 0x%016lx", regs[28]);
2534       tty->print_cr("r30 = 0x%016lx", regs[30]);
2535       tty->print_cr("r31 = 0x%016lx", regs[31]);
2536       BREAKPOINT;
2537     }
2538     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2539   } else {
2540     ttyLocker ttyl;
2541     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2542                     msg);
2543     assert(false, "DEBUG MESSAGE: %s", msg);
2544   }
2545 }
2546 
2547 #ifdef BUILTIN_SIM
2548 // routine to generate an x86 prolog for a stub function which
2549 // bootstraps into the generated ARM code which directly follows the
2550 // stub
2551 //
2552 // the argument encodes the number of general and fp registers
2553 // passed by the caller and the calling convention (currently just
2554 // the number of general registers and assumes C argument passing)
2555 
2556 extern "C" {
2557 int aarch64_stub_prolog_size();
2558 void aarch64_stub_prolog();
2559 void aarch64_prolog();
2560 }
2561 
2562 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
2563                                    address *prolog_ptr)
2564 {
2565   int calltype = (((ret_type & 0x3) << 8) |
2566                   ((fp_arg_count & 0xf) << 4) |
2567                   (gp_arg_count & 0xf));
2568 
2569   // the addresses for the x86 to ARM entry code we need to use
2570   address start = pc();
2571   // printf("start = %lx\n", start);
2572   int byteCount =  aarch64_stub_prolog_size();
2573   // printf("byteCount = %x\n", byteCount);
2574   int instructionCount = (byteCount + 3)/ 4;
2575   // printf("instructionCount = %x\n", instructionCount);
2576   for (int i = 0; i < instructionCount; i++) {
2577     nop();
2578   }
2579 
2580   memcpy(start, (void*)aarch64_stub_prolog, byteCount);
2581 
2582   // write the address of the setup routine and the call format at the
2583   // end of the copied code
2584   u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
2585   if (prolog_ptr)
2586     patch_end[-2] = (u_int64_t)prolog_ptr;
2587   patch_end[-1] = calltype;
2588 }
2589 #endif
2590 
2591 void MacroAssembler::push_call_clobbered_fp_registers() {
2592   int step = 4 * wordSize;
2593   sub(sp, sp, step);
2594   mov(rscratch1, -step);
2595   // Push v0-v7, v16-v31.
2596   for (int i = 31; i >= 4; i -= 4) {
2597     if (i <= v7->encoding() || i >= v16->encoding())
2598       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2599           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2600   }
2601   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2602       as_FloatRegister(3), T1D, Address(sp));
2603 }
2604 
2605 void MacroAssembler::pop_call_clobbered_fp_registers() {
2606   for (int i = 0; i < 32; i += 4) {
2607     if (i <= v7->encoding() || i >= v16->encoding())
2608       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2609           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2610   }
2611 }
2612 
2613 void MacroAssembler::push_call_clobbered_registers() {
2614   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2615   push_call_clobbered_fp_registers();
2616 }
2617 
2618 void MacroAssembler::pop_call_clobbered_registers() {
2619   pop_call_clobbered_fp_registers();
2620   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2621 }
2622 
2623 void MacroAssembler::push_CPU_state(bool save_vectors) {
2624   int step = (save_vectors ? 8 : 4) * wordSize;
2625   push(0x3fffffff, sp);         // integer registers except lr & sp
2626   mov(rscratch1, -step);
2627   sub(sp, sp, step);
2628   for (int i = 28; i >= 4; i -= 4) {
2629     st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2630         as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2631   }
2632   st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2633 }
2634 
2635 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2636   int step = (restore_vectors ? 8 : 4) * wordSize;
2637   for (int i = 0; i <= 28; i += 4)
2638     ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2639         as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2640   pop(0x3fffffff, sp);         // integer registers except lr & sp
2641 }
2642 
2643 /**
2644  * Helpers for multiply_to_len().
2645  */
2646 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2647                                      Register src1, Register src2) {
2648   adds(dest_lo, dest_lo, src1);
2649   adc(dest_hi, dest_hi, zr);
2650   adds(dest_lo, dest_lo, src2);
2651   adc(final_dest_hi, dest_hi, zr);
2652 }
2653 
2654 // Generate an address from (r + r1 extend offset).  "size" is the
2655 // size of the operand.  The result may be in rscratch2.
2656 Address MacroAssembler::offsetted_address(Register r, Register r1,
2657                                           Address::extend ext, int offset, int size) {
2658   if (offset || (ext.shift() % size != 0)) {
2659     lea(rscratch2, Address(r, r1, ext));
2660     return Address(rscratch2, offset);
2661   } else {
2662     return Address(r, r1, ext);
2663   }
2664 }
2665 
2666 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2667 {
2668   assert(offset >= 0, "spill to negative address?");
2669   // Is the offset reachable?
2670   //   Not aligned - 9-bit signed offset
2671   //   Aligned - 12-bit unsigned offset, shifted
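       // e.g. (illustrative) an 8-byte spill at sp + 0x9008 becomes
       // add(tmp, sp, 0x9000) followed by Address(tmp, 8).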
2672   Register base = sp;
2673   if ((offset & (size-1)) && offset >= (1<<8)) {
2674     add(tmp, base, offset & ((1<<12)-1));
2675     base = tmp;
2676     offset &= -1<<12;
2677   }
2678 
2679   if (offset >= (1<<12) * size) {
2680     add(tmp, base, offset & (((1<<12)-1)<<12));
2681     base = tmp;
2682     offset &= ~(((1<<12)-1)<<12);
2683   }
2684 
2685   return Address(base, offset);
2686 }
2687 
2688 // Checks whether offset is aligned.
2689 // Returns true if it is, else false.
2690 bool MacroAssembler::merge_alignment_check(Register base,
2691                                            size_t size,
2692                                            long cur_offset,
2693                                            long prev_offset) const {
2694   if (AvoidUnalignedAccesses) {
2695     if (base == sp) {
2696       // Checks whether the low offset is aligned to a pair of registers.
2697       long pair_mask = size * 2 - 1;
2698       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2699       return (offset & pair_mask) == 0;
2700     } else { // If base is not sp, we can't guarantee the access is aligned.
2701       return false;
2702     }
2703   } else {
2704     long mask = size - 1;
2705     // Load/store pair instructions only support element-size-aligned offsets.
2706     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2707   }
2708 }
2709 
2710 // Checks whether current and previous loads/stores can be merged.
2711 // Returns true if it can be merged, else false.
2712 bool MacroAssembler::ldst_can_merge(Register rt,
2713                                     const Address &adr,
2714                                     size_t cur_size_in_bytes,
2715                                     bool is_store) const {
2716   address prev = pc() - NativeInstruction::instruction_size;
2717   address last = code()->last_insn();
2718 
2719   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2720     return false;
2721   }
2722 
2723   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2724     return false;
2725   }
2726 
2727   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2728   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2729 
2730   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2731   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2732 
2733   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2734     return false;
2735   }
2736 
2737   long max_offset = 63 * prev_size_in_bytes;
2738   long min_offset = -64 * prev_size_in_bytes;
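       // ldp/stp encode a signed 7-bit immediate scaled by the element size,
       // hence the [-64, 63] element window checked below.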
2739 
2740   assert(prev_ldst->is_not_pre_post_index(), "pre-index or post-index loads/stores cannot be merged.");
2741 
2742   // Only same base can be merged.
2743   if (adr.base() != prev_ldst->base()) {
2744     return false;
2745   }
2746 
2747   long cur_offset = adr.offset();
2748   long prev_offset = prev_ldst->offset();
2749   size_t diff = abs(cur_offset - prev_offset);
2750   if (diff != prev_size_in_bytes) {
2751     return false;
2752   }
2753 
2754   // The following cases cannot be merged:
2755   // ldr x2, [x2, #8]
2756   // ldr x3, [x2, #16]
2757   // or:
2758   // ldr x2, [x3, #8]
2759   // ldr x2, [x3, #16]
2760   // If t1 and t2 is the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2761   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2762     return false;
2763   }
2764 
2765   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2766   // The low offset must be within the ldp/stp immediate offset range.
2767   if (low_offset > max_offset || low_offset < min_offset) {
2768     return false;
2769   }
2770 
2771   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2772     return true;
2773   }
2774 
2775   return false;
2776 }
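     // A mergeable sequence looks like this (illustrative):
     //   str x1, [sp, #16]
     //   str x2, [sp, #24]   // adjacent offset, same size, same base
     // which merge_ldst() below rewrites into
     //   stp x1, x2, [sp, #16]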
2777 
2778 // Merge current load/store with previous load/store into ldp/stp.
2779 void MacroAssembler::merge_ldst(Register rt,
2780                                 const Address &adr,
2781                                 size_t cur_size_in_bytes,
2782                                 bool is_store) {
2783 
2784   assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "cur and prev must be mergeable.");
2785 
2786   Register rt_low, rt_high;
2787   address prev = pc() - NativeInstruction::instruction_size;
2788   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2789 
2790   long offset;
2791 
2792   if (adr.offset() < prev_ldst->offset()) {
2793     offset = adr.offset();
2794     rt_low = rt;
2795     rt_high = prev_ldst->target();
2796   } else {
2797     offset = prev_ldst->offset();
2798     rt_low = prev_ldst->target();
2799     rt_high = rt;
2800   }
2801 
2802   Address adr_p = Address(prev_ldst->base(), offset);
2803   // Overwrite the previously generated instruction.
2804   code_section()->set_end(prev);
2805 
2806   const int sz = prev_ldst->size_in_bytes();
2807   assert(sz == 8 || sz == 4, "only 32/64-bit merging is supported.");
2808   if (!is_store) {
2809     BLOCK_COMMENT("merged ldr pair");
2810     if (sz == 8) {
2811       ldp(rt_low, rt_high, adr_p);
2812     } else {
2813       ldpw(rt_low, rt_high, adr_p);
2814     }
2815   } else {
2816     BLOCK_COMMENT("merged str pair");
2817     if (sz == 8) {
2818       stp(rt_low, rt_high, adr_p);
2819     } else {
2820       stpw(rt_low, rt_high, adr_p);
2821     }
2822   }
2823 }
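     // Note that the access with the lower offset always supplies the first
     // operand; e.g. (illustrative) a previous "ldr x3, [x5, #8]" followed by
     // a current "ldr x4, [x5]" is rewritten as "ldp x4, x3, [x5]".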
2824 
2825 /**
2826  * Multiply 64-bit by 64-bit: first loop.
2827  */
2828 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2829                                            Register y, Register y_idx, Register z,
2830                                            Register carry, Register product,
2831                                            Register idx, Register kdx) {
2832   //
2833   //  jlong carry, x[], y[], z[];
2834   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2835   //    huge_128 product = y[idx] * x[xstart] + carry;
2836   //    z[kdx] = (jlong)product;
2837   //    carry  = (jlong)(product >>> 64);
2838   //  }
2839   //  z[xstart] = carry;
2840   //
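       // The magnitude arrays store 32-bit words most-significant first, so an
       // 8-byte little-endian load of two adjacent ints yields the two halves
       // swapped, and a single "ror #32" repairs this.  E.g. (illustrative) the
       // ints {0x00000001, 0x00000002} load as 0x0000000200000001, and rotating
       // by 32 gives the intended jlong 0x0000000100000002.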
2841 
2842   Label L_first_loop, L_first_loop_exit;
2843   Label L_one_x, L_one_y, L_multiply;
2844 
2845   subsw(xstart, xstart, 1);
2846   br(Assembler::MI, L_one_x);
2847 
2848   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2849   ldr(x_xstart, Address(rscratch1));
2850   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2851 
2852   bind(L_first_loop);
2853   subsw(idx, idx, 1);
2854   br(Assembler::MI, L_first_loop_exit);
2855   subsw(idx, idx, 1);
2856   br(Assembler::MI, L_one_y);
2857   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2858   ldr(y_idx, Address(rscratch1));
2859   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2860   bind(L_multiply);
2861 
2862   // AArch64 has a multiply-accumulate instruction that we can't use
2863   // here because it has no way to process carries, so we have to use
2864   // separate add and adc instructions.  Bah.
2865   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2866   mul(product, x_xstart, y_idx);
2867   adds(product, product, carry);
2868   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2869 
2870   subw(kdx, kdx, 2);
2871   ror(product, product, 32); // back to big-endian
2872   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2873 
2874   b(L_first_loop);
2875 
2876   bind(L_one_y);
2877   ldrw(y_idx, Address(y,  0));
2878   b(L_multiply);
2879 
2880   bind(L_one_x);
2881   ldrw(x_xstart, Address(x,  0));
2882   b(L_first_loop);
2883 
2884   bind(L_first_loop_exit);
2885 }
2886 
2887 /**
2888  * Multiply 128-bit by 128-bit, with an unrolled inner loop.
2889  *
2890  */
2891 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2892                                              Register carry, Register carry2,
2893                                              Register idx, Register jdx,
2894                                              Register yz_idx1, Register yz_idx2,
2895                                              Register tmp, Register tmp3, Register tmp4,
2896                                              Register tmp6, Register product_hi) {
2897 
2898   //   jlong carry, x[], y[], z[];
2899   //   int kdx = ystart+1;
2900   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2901   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2902   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2903   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2904   //     carry  = (jlong)(tmp4 >>> 64);
2905   //     z[kdx+idx+1] = (jlong)tmp3;
2906   //     z[kdx+idx] = (jlong)tmp4;
2907   //   }
2908   //   idx += 2;
2909   //   if (idx > 0) {
2910   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2911   //     z[kdx+idx] = (jlong)yz_idx1;
2912   //     carry  = (jlong)(yz_idx1 >>> 64);
2913   //   }
2914   //
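       // The 64x64->128 partial products below are accumulated with explicit
       // carry chains, since there is no carrying multiply-accumulate.  E.g.
       // (illustrative) adding a 64-bit value v into a 128-bit pair hi:lo is
       //   adds(lo, lo, v);
       //   adc(hi, hi, zr);
       // and the adds/adcs/adc sequence below extends this to a third word.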
2915 
2916   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2917 
2918   lsrw(jdx, idx, 2);
2919 
2920   bind(L_third_loop);
2921 
2922   subsw(jdx, jdx, 1);
2923   br(Assembler::MI, L_third_loop_exit);
2924   subw(idx, idx, 4);
2925 
2926   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2927 
2928   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2929 
2930   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2931 
2932   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2933   ror(yz_idx2, yz_idx2, 32);
2934 
2935   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2936 
2937   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2938   umulh(tmp4, product_hi, yz_idx1);
2939 
2940   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2941   ror(rscratch2, rscratch2, 32);
2942 
2943   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2944   umulh(carry2, product_hi, yz_idx2);
2945 
2946   // propagate sum of both multiplications into carry:tmp4:tmp3
2947   adds(tmp3, tmp3, carry);
2948   adc(tmp4, tmp4, zr);
2949   adds(tmp3, tmp3, rscratch1);
2950   adcs(tmp4, tmp4, tmp);
2951   adc(carry, carry2, zr);
2952   adds(tmp4, tmp4, rscratch2);
2953   adc(carry, carry, zr);
2954 
2955   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2956   ror(tmp4, tmp4, 32);
2957   stp(tmp4, tmp3, Address(tmp6, 0));
2958 
2959   b(L_third_loop);
2960   bind (L_third_loop_exit);
2961 
2962   andw (idx, idx, 0x3);
2963   cbz(idx, L_post_third_loop_done);
2964 
2965   Label L_check_1;
2966   subsw(idx, idx, 2);
2967   br(Assembler::MI, L_check_1);
2968 
2969   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2970   ldr(yz_idx1, Address(rscratch1, 0));
2971   ror(yz_idx1, yz_idx1, 32);
2972   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2973   umulh(tmp4, product_hi, yz_idx1);
2974   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2975   ldr(yz_idx2, Address(rscratch1, 0));
2976   ror(yz_idx2, yz_idx2, 32);
2977 
2978   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2979 
2980   ror(tmp3, tmp3, 32);
2981   str(tmp3, Address(rscratch1, 0));
2982 
2983   bind (L_check_1);
2984 
2985   andw (idx, idx, 0x1);
2986   subsw(idx, idx, 1);
2987   br(Assembler::MI, L_post_third_loop_done);
2988   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2989   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
2990   umulh(carry2, tmp4, product_hi);
2991   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2992 
2993   add2_with_carry(carry2, tmp3, tmp4, carry);
2994 
2995   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2996   extr(carry, carry2, tmp3, 32);
2997 
2998   bind(L_post_third_loop_done);
2999 }
3000 
3001 /**
3002  * Code for BigInteger::multiplyToLen() intrinsic.
3003  *
3004  * r0: x
3005  * r1: xlen
3006  * r2: y
3007  * r3: ylen
3008  * r4: z
3009  * r5: zlen
3010  * r10: tmp1
3011  * r11: tmp2
3012  * r12: tmp3
3013  * r13: tmp4
3014  * r14: tmp5
3015  * r15: tmp6
3016  * r16: tmp7
3017  *
3018  */
3019 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3020                                      Register z, Register zlen,
3021                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3022                                      Register tmp5, Register tmp6, Register product_hi) {
3023 
3024   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3025 
3026   const Register idx = tmp1;
3027   const Register kdx = tmp2;
3028   const Register xstart = tmp3;
3029 
3030   const Register y_idx = tmp4;
3031   const Register carry = tmp5;
3032   const Register product  = xlen;
3033   const Register x_xstart = zlen;  // reuse register
3034 
3035   // First Loop.
3036   //
3037   //  final static long LONG_MASK = 0xffffffffL;
3038   //  int xstart = xlen - 1;
3039   //  int ystart = ylen - 1;
3040   //  long carry = 0;
3041   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3042   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3043   //    z[kdx] = (int)product;
3044   //    carry = product >>> 32;
3045   //  }
3046   //  z[xstart] = (int)carry;
3047   //
3048 
3049   movw(idx, ylen);      // idx = ylen;
3050   movw(kdx, zlen);      // kdx = xlen+ylen;
3051   mov(carry, zr);       // carry = 0;
3052 
3053   Label L_done;
3054 
3055   movw(xstart, xlen);
3056   subsw(xstart, xstart, 1);
3057   br(Assembler::MI, L_done);
3058 
3059   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3060 
3061   Label L_second_loop;
3062   cbzw(kdx, L_second_loop);
3063 
3064   Label L_carry;
3065   subw(kdx, kdx, 1);
3066   cbzw(kdx, L_carry);
3067 
3068   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3069   lsr(carry, carry, 32);
3070   subw(kdx, kdx, 1);
3071 
3072   bind(L_carry);
3073   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3074 
3075   // Second and third (nested) loops.
3076   //
3077   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3078   //   carry = 0;
3079   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3080   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3081   //                    (z[k] & LONG_MASK) + carry;
3082   //     z[k] = (int)product;
3083   //     carry = product >>> 32;
3084   //   }
3085   //   z[i] = (int)carry;
3086   // }
3087   //
3088   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3089 
3090   const Register jdx = tmp1;
3091 
3092   bind(L_second_loop);
3093   mov(carry, zr);                // carry = 0;
3094   movw(jdx, ylen);               // j = ystart+1
3095 
3096   subsw(xstart, xstart, 1);      // i = xstart-1;
3097   br(Assembler::MI, L_done);
3098 
3099   str(z, Address(pre(sp, -4 * wordSize)));
3100 
3101   Label L_last_x;
3102   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3103   subsw(xstart, xstart, 1);       // i = xstart-1;
3104   br(Assembler::MI, L_last_x);
3105 
3106   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3107   ldr(product_hi, Address(rscratch1));
3108   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3109 
3110   Label L_third_loop_prologue;
3111   bind(L_third_loop_prologue);
3112 
3113   str(ylen, Address(sp, wordSize));
3114   stp(x, xstart, Address(sp, 2 * wordSize));
3115   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3116                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3117   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3118   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3119 
3120   addw(tmp3, xlen, 1);
3121   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3122   subsw(tmp3, tmp3, 1);
3123   br(Assembler::MI, L_done);
3124 
3125   lsr(carry, carry, 32);
3126   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3127   b(L_second_loop);
3128 
3129   // The following infrequently executed code is placed outside the loops.
3130   bind(L_last_x);
3131   ldrw(product_hi, Address(x,  0));
3132   b(L_third_loop_prologue);
3133 
3134   bind(L_done);
3135 }
3136 
3137 // Code for BigInteger::mulAdd intrinsic
3138 // out     = r0
3139 // in      = r1
3140 // offset  = r2  (already out.length-offset)
3141 // len     = r3
3142 // k       = r4
3143 //
3144 // pseudo code from java implementation:
3145 // carry = 0;
3146 // offset = out.length-offset - 1;
3147 // for (int j=len-1; j >= 0; j--) {
3148 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3149 //     out[offset--] = (int)product;
3150 //     carry = product >>> 32;
3151 // }
3152 // return (int)carry;
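     // A worked example (illustrative): with in[j] = 0xFFFFFFFF, k = 2,
     // out[offset] = 0xFFFFFFFF and carry = 0:
     //   product     = 0xFFFFFFFF * 2 + 0xFFFFFFFF = 0x2FFFFFFFD
     //   out[offset] = 0xFFFFFFFD, carry = 2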
3153 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3154       Register len, Register k) {
3155     Label LOOP, END;
3156     // pre-loop
3157     cmp(len, zr); // cmp, not cbz/cbnz: the condition is used twice => fewer branches
3158     csel(out, zr, out, Assembler::EQ);
3159     br(Assembler::EQ, END);
3160     add(in, in, len, LSL, 2); // in[j+1] address
3161     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3162     mov(out, zr); // used to keep carry now
3163     BIND(LOOP);
3164     ldrw(rscratch1, Address(pre(in, -4)));
3165     madd(rscratch1, rscratch1, k, out);
3166     ldrw(rscratch2, Address(pre(offset, -4)));
3167     add(rscratch1, rscratch1, rscratch2);
3168     strw(rscratch1, Address(offset));
3169     lsr(out, rscratch1, 32);
3170     subs(len, len, 1);
3171     br(Assembler::NE, LOOP);
3172     BIND(END);
3173 }
3174 
3175 /**
3176  * Emits code to update CRC-32 with a byte value according to the constant table
3177  *
3178  * @param [in,out]crc   Register containing the crc.
3179  * @param [in]val       Register containing the byte to fold into the CRC.
3180  * @param [in]table     Register containing the table of crc constants.
3181  *
3182  * uint32_t crc;
3183  * val = crc_table[(val ^ crc) & 0xFF];
3184  * crc = val ^ (crc >> 8);
3185  *
3186  */
3187 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3188   eor(val, val, crc);
3189   andr(val, val, 0xff);
3190   ldrw(val, Address(table, val, Address::lsl(2)));
3191   eor(crc, val, crc, Assembler::LSR, 8);
3192 }
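     // For instance (illustrative), with crc = 0xFFFFFFFF and val = 0x00 the
     // code above computes:
     //   index = (0x00 ^ 0xFFFFFFFF) & 0xff = 0xff
     //   crc   = crc_table[0xff] ^ (0xFFFFFFFF >> 8)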
3193 
3194 /**
3195  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3196  *
3197  * @param [in,out]crc   Register containing the crc.
3198  * @param [in]v         Register containing the 32-bit value to fold into the CRC.
3199  * @param [in]table0    Register containing table 0 of crc constants.
3200  * @param [in]table1    Register containing table 1 of crc constants.
3201  * @param [in]table2    Register containing table 2 of crc constants.
3202  * @param [in]table3    Register containing table 3 of crc constants.
3203  *
3204  * uint32_t crc;
3205  *   v = crc ^ v
3206  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3207  *
3208  */
3209 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3210         Register table0, Register table1, Register table2, Register table3,
3211         bool upper) {
3212   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3213   uxtb(tmp, v);
3214   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3215   ubfx(tmp, v, 8, 8);
3216   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3217   eor(crc, crc, tmp);
3218   ubfx(tmp, v, 16, 8);
3219   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3220   eor(crc, crc, tmp);
3221   ubfx(tmp, v, 24, 8);
3222   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3223   eor(crc, crc, tmp);
3224 }
3225 
3226 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3227         Register len, Register tmp0, Register tmp1, Register tmp2,
3228         Register tmp3) {
3229     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3230     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3231 
3232     mvnw(crc, crc);
3233 
3234     subs(len, len, 128);
3235     br(Assembler::GE, CRC_by64_pre);
3236   BIND(CRC_less64);
3237     adds(len, len, 128-32);
3238     br(Assembler::GE, CRC_by32_loop);
3239   BIND(CRC_less32);
3240     adds(len, len, 32-4);
3241     br(Assembler::GE, CRC_by4_loop);
3242     adds(len, len, 4);
3243     br(Assembler::GT, CRC_by1_loop);
3244     b(L_exit);
3245 
3246   BIND(CRC_by32_loop);
3247     ldp(tmp0, tmp1, Address(post(buf, 16)));
3248     subs(len, len, 32);
3249     crc32x(crc, crc, tmp0);
3250     ldr(tmp2, Address(post(buf, 8)));
3251     crc32x(crc, crc, tmp1);
3252     ldr(tmp3, Address(post(buf, 8)));
3253     crc32x(crc, crc, tmp2);
3254     crc32x(crc, crc, tmp3);
3255     br(Assembler::GE, CRC_by32_loop);
3256     cmn(len, 32);
3257     br(Assembler::NE, CRC_less32);
3258     b(L_exit);
3259 
3260   BIND(CRC_by4_loop);
3261     ldrw(tmp0, Address(post(buf, 4)));
3262     subs(len, len, 4);
3263     crc32w(crc, crc, tmp0);
3264     br(Assembler::GE, CRC_by4_loop);
3265     adds(len, len, 4);
3266     br(Assembler::LE, L_exit);
3267   BIND(CRC_by1_loop);
3268     ldrb(tmp0, Address(post(buf, 1)));
3269     subs(len, len, 1);
3270     crc32b(crc, crc, tmp0);
3271     br(Assembler::GT, CRC_by1_loop);
3272     b(L_exit);
3273 
3274   BIND(CRC_by64_pre);
3275     sub(buf, buf, 8);
3276     ldp(tmp0, tmp1, Address(buf, 8));
3277     crc32x(crc, crc, tmp0);
3278     ldr(tmp2, Address(buf, 24));
3279     crc32x(crc, crc, tmp1);
3280     ldr(tmp3, Address(buf, 32));
3281     crc32x(crc, crc, tmp2);
3282     ldr(tmp0, Address(buf, 40));
3283     crc32x(crc, crc, tmp3);
3284     ldr(tmp1, Address(buf, 48));
3285     crc32x(crc, crc, tmp0);
3286     ldr(tmp2, Address(buf, 56));
3287     crc32x(crc, crc, tmp1);
3288     ldr(tmp3, Address(pre(buf, 64)));
3289 
3290     b(CRC_by64_loop);
3291 
3292     align(CodeEntryAlignment);
3293   BIND(CRC_by64_loop);
3294     subs(len, len, 64);
3295     crc32x(crc, crc, tmp2);
3296     ldr(tmp0, Address(buf, 8));
3297     crc32x(crc, crc, tmp3);
3298     ldr(tmp1, Address(buf, 16));
3299     crc32x(crc, crc, tmp0);
3300     ldr(tmp2, Address(buf, 24));
3301     crc32x(crc, crc, tmp1);
3302     ldr(tmp3, Address(buf, 32));
3303     crc32x(crc, crc, tmp2);
3304     ldr(tmp0, Address(buf, 40));
3305     crc32x(crc, crc, tmp3);
3306     ldr(tmp1, Address(buf, 48));
3307     crc32x(crc, crc, tmp0);
3308     ldr(tmp2, Address(buf, 56));
3309     crc32x(crc, crc, tmp1);
3310     ldr(tmp3, Address(pre(buf, 64)));
3311     br(Assembler::GE, CRC_by64_loop);
3312 
3313     // post-loop
3314     crc32x(crc, crc, tmp2);
3315     crc32x(crc, crc, tmp3);
3316 
3317     sub(len, len, 64);
3318     add(buf, buf, 8);
3319     cmn(len, 128);
3320     br(Assembler::NE, CRC_less64);
3321   BIND(L_exit);
3322     mvnw(crc, crc);
3323 }
3324 
3325 /**
3326  * @param crc   register containing existing CRC (32-bit)
3327  * @param buf   register pointing to input byte buffer (byte*)
3328  * @param len   register containing number of bytes
3329  * @param table0..table3  registers that will contain the addresses of the CRC tables
3330  * @param tmp, tmp2, tmp3  scratch registers
3331  */
3332 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3333         Register table0, Register table1, Register table2, Register table3,
3334         Register tmp, Register tmp2, Register tmp3) {
3335   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3336   unsigned long offset;
3337 
3338   if (UseCRC32) {
3339       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3340       return;
3341   }
3342 
3343     mvnw(crc, crc);
3344 
3345     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3346     if (offset) add(table0, table0, offset);
3347     add(table1, table0, 1*256*sizeof(juint));
3348     add(table2, table0, 2*256*sizeof(juint));
3349     add(table3, table0, 3*256*sizeof(juint));
3350 
3351   if (UseNeon) {
3352       cmp(len, (u1)64);
3353       br(Assembler::LT, L_by16);
3354       eor(v16, T16B, v16, v16);
3355 
3356     Label L_fold;
3357 
3358       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3359 
3360       ld1(v0, v1, T2D, post(buf, 32));
3361       ld1r(v4, T2D, post(tmp, 8));
3362       ld1r(v5, T2D, post(tmp, 8));
3363       ld1r(v6, T2D, post(tmp, 8));
3364       ld1r(v7, T2D, post(tmp, 8));
3365       mov(v16, T4S, 0, crc);
3366 
3367       eor(v0, T16B, v0, v16);
3368       sub(len, len, 64);
3369 
3370     BIND(L_fold);
3371       pmull(v22, T8H, v0, v5, T8B);
3372       pmull(v20, T8H, v0, v7, T8B);
3373       pmull(v23, T8H, v0, v4, T8B);
3374       pmull(v21, T8H, v0, v6, T8B);
3375 
3376       pmull2(v18, T8H, v0, v5, T16B);
3377       pmull2(v16, T8H, v0, v7, T16B);
3378       pmull2(v19, T8H, v0, v4, T16B);
3379       pmull2(v17, T8H, v0, v6, T16B);
3380 
3381       uzp1(v24, T8H, v20, v22);
3382       uzp2(v25, T8H, v20, v22);
3383       eor(v20, T16B, v24, v25);
3384 
3385       uzp1(v26, T8H, v16, v18);
3386       uzp2(v27, T8H, v16, v18);
3387       eor(v16, T16B, v26, v27);
3388 
3389       ushll2(v22, T4S, v20, T8H, 8);
3390       ushll(v20, T4S, v20, T4H, 8);
3391 
3392       ushll2(v18, T4S, v16, T8H, 8);
3393       ushll(v16, T4S, v16, T4H, 8);
3394 
3395       eor(v22, T16B, v23, v22);
3396       eor(v18, T16B, v19, v18);
3397       eor(v20, T16B, v21, v20);
3398       eor(v16, T16B, v17, v16);
3399 
3400       uzp1(v17, T2D, v16, v20);
3401       uzp2(v21, T2D, v16, v20);
3402       eor(v17, T16B, v17, v21);
3403 
3404       ushll2(v20, T2D, v17, T4S, 16);
3405       ushll(v16, T2D, v17, T2S, 16);
3406 
3407       eor(v20, T16B, v20, v22);
3408       eor(v16, T16B, v16, v18);
3409 
3410       uzp1(v17, T2D, v20, v16);
3411       uzp2(v21, T2D, v20, v16);
3412       eor(v28, T16B, v17, v21);
3413 
3414       pmull(v22, T8H, v1, v5, T8B);
3415       pmull(v20, T8H, v1, v7, T8B);
3416       pmull(v23, T8H, v1, v4, T8B);
3417       pmull(v21, T8H, v1, v6, T8B);
3418 
3419       pmull2(v18, T8H, v1, v5, T16B);
3420       pmull2(v16, T8H, v1, v7, T16B);
3421       pmull2(v19, T8H, v1, v4, T16B);
3422       pmull2(v17, T8H, v1, v6, T16B);
3423 
3424       ld1(v0, v1, T2D, post(buf, 32));
3425 
3426       uzp1(v24, T8H, v20, v22);
3427       uzp2(v25, T8H, v20, v22);
3428       eor(v20, T16B, v24, v25);
3429 
3430       uzp1(v26, T8H, v16, v18);
3431       uzp2(v27, T8H, v16, v18);
3432       eor(v16, T16B, v26, v27);
3433 
3434       ushll2(v22, T4S, v20, T8H, 8);
3435       ushll(v20, T4S, v20, T4H, 8);
3436 
3437       ushll2(v18, T4S, v16, T8H, 8);
3438       ushll(v16, T4S, v16, T4H, 8);
3439 
3440       eor(v22, T16B, v23, v22);
3441       eor(v18, T16B, v19, v18);
3442       eor(v20, T16B, v21, v20);
3443       eor(v16, T16B, v17, v16);
3444 
3445       uzp1(v17, T2D, v16, v20);
3446       uzp2(v21, T2D, v16, v20);
3447       eor(v16, T16B, v17, v21);
3448 
3449       ushll2(v20, T2D, v16, T4S, 16);
3450       ushll(v16, T2D, v16, T2S, 16);
3451 
3452       eor(v20, T16B, v22, v20);
3453       eor(v16, T16B, v16, v18);
3454 
3455       uzp1(v17, T2D, v20, v16);
3456       uzp2(v21, T2D, v20, v16);
3457       eor(v20, T16B, v17, v21);
3458 
3459       shl(v16, T2D, v28, 1);
3460       shl(v17, T2D, v20, 1);
3461 
3462       eor(v0, T16B, v0, v16);
3463       eor(v1, T16B, v1, v17);
3464 
3465       subs(len, len, 32);
3466       br(Assembler::GE, L_fold);
3467 
3468       mov(crc, 0);
3469       mov(tmp, v0, T1D, 0);
3470       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3471       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3472       mov(tmp, v0, T1D, 1);
3473       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3474       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3475       mov(tmp, v1, T1D, 0);
3476       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3477       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3478       mov(tmp, v1, T1D, 1);
3479       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3480       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3481 
3482       add(len, len, 32);
3483   }
3484 
3485   BIND(L_by16);
3486     subs(len, len, 16);
3487     br(Assembler::GE, L_by16_loop);
3488     adds(len, len, 16-4);
3489     br(Assembler::GE, L_by4_loop);
3490     adds(len, len, 4);
3491     br(Assembler::GT, L_by1_loop);
3492     b(L_exit);
3493 
3494   BIND(L_by4_loop);
3495     ldrw(tmp, Address(post(buf, 4)));
3496     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3497     subs(len, len, 4);
3498     br(Assembler::GE, L_by4_loop);
3499     adds(len, len, 4);
3500     br(Assembler::LE, L_exit);
3501   BIND(L_by1_loop);
3502     subs(len, len, 1);
3503     ldrb(tmp, Address(post(buf, 1)));
3504     update_byte_crc32(crc, tmp, table0);
3505     br(Assembler::GT, L_by1_loop);
3506     b(L_exit);
3507 
3508     align(CodeEntryAlignment);
3509   BIND(L_by16_loop);
3510     subs(len, len, 16);
3511     ldp(tmp, tmp3, Address(post(buf, 16)));
3512     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3513     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3514     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3515     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3516     br(Assembler::GE, L_by16_loop);
3517     adds(len, len, 16-4);
3518     br(Assembler::GE, L_by4_loop);
3519     adds(len, len, 4);
3520     br(Assembler::GT, L_by1_loop);
3521   BIND(L_exit);
3522     mvnw(crc, crc);
3523 }
3524 
3525 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3526         Register len, Register tmp0, Register tmp1, Register tmp2,
3527         Register tmp3) {
3528     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3529     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3530 
3531     subs(len, len, 128);
3532     br(Assembler::GE, CRC_by64_pre);
3533   BIND(CRC_less64);
3534     adds(len, len, 128-32);
3535     br(Assembler::GE, CRC_by32_loop);
3536   BIND(CRC_less32);
3537     adds(len, len, 32-4);
3538     br(Assembler::GE, CRC_by4_loop);
3539     adds(len, len, 4);
3540     br(Assembler::GT, CRC_by1_loop);
3541     b(L_exit);
3542 
3543   BIND(CRC_by32_loop);
3544     ldp(tmp0, tmp1, Address(post(buf, 16)));
3545     subs(len, len, 32);
3546     crc32cx(crc, crc, tmp0);
3547     ldr(tmp2, Address(post(buf, 8)));
3548     crc32cx(crc, crc, tmp1);
3549     ldr(tmp3, Address(post(buf, 8)));
3550     crc32cx(crc, crc, tmp2);
3551     crc32cx(crc, crc, tmp3);
3552     br(Assembler::GE, CRC_by32_loop);
3553     cmn(len, 32);
3554     br(Assembler::NE, CRC_less32);
3555     b(L_exit);
3556 
3557   BIND(CRC_by4_loop);
3558     ldrw(tmp0, Address(post(buf, 4)));
3559     subs(len, len, 4);
3560     crc32cw(crc, crc, tmp0);
3561     br(Assembler::GE, CRC_by4_loop);
3562     adds(len, len, 4);
3563     br(Assembler::LE, L_exit);
3564   BIND(CRC_by1_loop);
3565     ldrb(tmp0, Address(post(buf, 1)));
3566     subs(len, len, 1);
3567     crc32cb(crc, crc, tmp0);
3568     br(Assembler::GT, CRC_by1_loop);
3569     b(L_exit);
3570 
3571   BIND(CRC_by64_pre);
3572     sub(buf, buf, 8);
3573     ldp(tmp0, tmp1, Address(buf, 8));
3574     crc32cx(crc, crc, tmp0);
3575     ldr(tmp2, Address(buf, 24));
3576     crc32cx(crc, crc, tmp1);
3577     ldr(tmp3, Address(buf, 32));
3578     crc32cx(crc, crc, tmp2);
3579     ldr(tmp0, Address(buf, 40));
3580     crc32cx(crc, crc, tmp3);
3581     ldr(tmp1, Address(buf, 48));
3582     crc32cx(crc, crc, tmp0);
3583     ldr(tmp2, Address(buf, 56));
3584     crc32cx(crc, crc, tmp1);
3585     ldr(tmp3, Address(pre(buf, 64)));
3586 
3587     b(CRC_by64_loop);
3588 
3589     align(CodeEntryAlignment);
3590   BIND(CRC_by64_loop);
3591     subs(len, len, 64);
3592     crc32cx(crc, crc, tmp2);
3593     ldr(tmp0, Address(buf, 8));
3594     crc32cx(crc, crc, tmp3);
3595     ldr(tmp1, Address(buf, 16));
3596     crc32cx(crc, crc, tmp0);
3597     ldr(tmp2, Address(buf, 24));
3598     crc32cx(crc, crc, tmp1);
3599     ldr(tmp3, Address(buf, 32));
3600     crc32cx(crc, crc, tmp2);
3601     ldr(tmp0, Address(buf, 40));
3602     crc32cx(crc, crc, tmp3);
3603     ldr(tmp1, Address(buf, 48));
3604     crc32cx(crc, crc, tmp0);
3605     ldr(tmp2, Address(buf, 56));
3606     crc32cx(crc, crc, tmp1);
3607     ldr(tmp3, Address(pre(buf, 64)));
3608     br(Assembler::GE, CRC_by64_loop);
3609 
3610     // post-loop
3611     crc32cx(crc, crc, tmp2);
3612     crc32cx(crc, crc, tmp3);
3613 
3614     sub(len, len, 64);
3615     add(buf, buf, 8);
3616     cmn(len, 128);
3617     br(Assembler::NE, CRC_less64);
3618   BIND(L_exit);
3619 }
3620 
3621 /**
3622  * @param crc   register containing existing CRC (32-bit)
3623  * @param buf   register pointing to input byte buffer (byte*)
3624  * @param len   register containing number of bytes
3625  * @param table0..table3  scratch registers
3626  * @param tmp, tmp2, tmp3  scratch registers
3627  */
3628 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3629         Register table0, Register table1, Register table2, Register table3,
3630         Register tmp, Register tmp2, Register tmp3) {
3631   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3632 }
3633 
3634 
3635 SkipIfEqual::SkipIfEqual(
3636     MacroAssembler* masm, const bool* flag_addr, bool value) {
3637   _masm = masm;
3638   unsigned long offset;
3639   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3640   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3641   _masm->cbzw(rscratch1, _label);
3642 }
3643 
3644 SkipIfEqual::~SkipIfEqual() {
3645   _masm->bind(_label);
3646 }
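     // Typical (illustrative) use, where SomeDevelopFlag stands in for a
     // hypothetical bool VM flag:
     //   {
     //     SkipIfEqual skip(masm, &SomeDevelopFlag, false);
     //     // ... code emitted here runs only when SomeDevelopFlag is true ...
     //   } // the destructor binds the skip-target label here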
3647 
3648 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3649   Address adr;
3650   switch(dst.getMode()) {
3651   case Address::base_plus_offset:
3652     // This is the expected mode, although we allow all the other
3653     // forms below.
3654     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3655     break;
3656   default:
3657     lea(rscratch2, dst);
3658     adr = Address(rscratch2);
3659     break;
3660   }
3661   ldr(rscratch1, adr);
3662   add(rscratch1, rscratch1, src);
3663   str(rscratch1, adr);
3664 }
3665 
3666 void MacroAssembler::cmpptr(Register src1, Address src2) {
3667   unsigned long offset;
3668   adrp(rscratch1, src2, offset);
3669   ldr(rscratch1, Address(rscratch1, offset));
3670   cmp(src1, rscratch1);
3671 }
3672 
3673 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3674   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3675   bs->obj_equals(this, obj1, obj2);
3676 }
3677 
3678 void MacroAssembler::load_klass(Register dst, Register src) {
3679   if (UseCompressedClassPointers) {
3680     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3681     decode_klass_not_null(dst);
3682   } else {
3683     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3684   }
3685 }
3686 
3687 // ((OopHandle)result).resolve();
3688 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3689   // OopHandle::resolve is an indirection.
3690   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3691 }
3692 
3693 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3694   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3695   ldr(dst, Address(rmethod, Method::const_offset()));
3696   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3697   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3698   ldr(dst, Address(dst, mirror_offset));
3699   resolve_oop_handle(dst, tmp);
3700 }
3701 
3702 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3703   if (UseCompressedClassPointers) {
3704     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3705     if (Universe::narrow_klass_base() == NULL) {
3706       cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
3707       return;
3708     } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3709                && Universe::narrow_klass_shift() == 0) {
3710       // Only the bottom 32 bits matter
3711       cmpw(trial_klass, tmp);
3712       return;
3713     }
3714     decode_klass_not_null(tmp);
3715   } else {
3716     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3717   }
3718   cmp(trial_klass, tmp);
3719 }
3720 
3721 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3722   load_klass(dst, src);
3723   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3724 }
3725 
3726 void MacroAssembler::store_klass(Register dst, Register src) {
3727   // FIXME: Should this be a store release?  Concurrent GCs assume the
3728   // klass length is valid if the klass field is not null.
3729   if (UseCompressedClassPointers) {
3730     encode_klass_not_null(src);
3731     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3732   } else {
3733     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3734   }
3735 }
3736 
3737 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3738   if (UseCompressedClassPointers) {
3739     // Store to klass gap in destination
3740     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3741   }
3742 }
3743 
3744 // Algorithm must match CompressedOops::encode.
3745 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3746 #ifdef ASSERT
3747   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3748 #endif
3749   verify_oop(s, "broken oop in encode_heap_oop");
3750   if (Universe::narrow_oop_base() == NULL) {
3751     if (Universe::narrow_oop_shift() != 0) {
3752       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3753       lsr(d, s, LogMinObjAlignmentInBytes);
3754     } else {
3755       mov(d, s);
3756     }
3757   } else {
3758     subs(d, s, rheapbase);
3759     csel(d, d, zr, Assembler::HS);
3760     lsr(d, d, LogMinObjAlignmentInBytes);
3761 
3762     /*  Old algorithm: is this any worse?
3763     Label nonnull;
3764     cbnz(r, nonnull);
3765     sub(r, r, rheapbase);
3766     bind(nonnull);
3767     lsr(r, r, LogMinObjAlignmentInBytes);
3768     */
3769   }
3770 }
3771 
3772 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3773 #ifdef ASSERT
3774   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3775   if (CheckCompressedOops) {
3776     Label ok;
3777     cbnz(r, ok);
3778     stop("null oop passed to encode_heap_oop_not_null");
3779     bind(ok);
3780   }
3781 #endif
3782   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3783   if (Universe::narrow_oop_base() != NULL) {
3784     sub(r, r, rheapbase);
3785   }
3786   if (Universe::narrow_oop_shift() != 0) {
3787     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3788     lsr(r, r, LogMinObjAlignmentInBytes);
3789   }
3790 }
3791 
3792 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3793 #ifdef ASSERT
3794   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3795   if (CheckCompressedOops) {
3796     Label ok;
3797     cbnz(src, ok);
3798     stop("null oop passed to encode_heap_oop_not_null2");
3799     bind(ok);
3800   }
3801 #endif
3802   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3803 
3804   Register data = src;
3805   if (Universe::narrow_oop_base() != NULL) {
3806     sub(dst, src, rheapbase);
3807     data = dst;
3808   }
3809   if (Universe::narrow_oop_shift() != 0) {
3810     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3811     lsr(dst, data, LogMinObjAlignmentInBytes);
3812     data = dst;
3813   }
3814   if (data == src)
3815     mov(dst, src);
3816 }
3817 
3818 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3819 #ifdef ASSERT
3820   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3821 #endif
3822   if (Universe::narrow_oop_base() == NULL) {
3823     if (Universe::narrow_oop_shift() != 0 || d != s) {
3824       lsl(d, s, Universe::narrow_oop_shift());
3825     }
3826   } else {
3827     Label done;
3828     if (d != s)
3829       mov(d, s);
3830     cbz(s, done);
3831     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3832     bind(done);
3833   }
3834   verify_oop(d, "broken oop in decode_heap_oop");
3835 }
3836 
3837 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3838   assert (UseCompressedOops, "should only be used for compressed headers");
3839   assert (Universe::heap() != NULL, "java heap should be initialized");
3840   // Cannot assert, unverified entry point counts instructions (see .ad file)
3841   // vtableStubs also counts instructions in pd_code_size_limit.
3842   // Also do not verify_oop as this is called by verify_oop.
3843   if (Universe::narrow_oop_shift() != 0) {
3844     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3845     if (Universe::narrow_oop_base() != NULL) {
3846       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3847     } else {
3848       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3849     }
3850   } else {
3851     assert (Universe::narrow_oop_base() == NULL, "sanity");
3852   }
3853 }
3854 
3855 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3856   assert (UseCompressedOops, "should only be used for compressed headers");
3857   assert (Universe::heap() != NULL, "java heap should be initialized");
3858   // Cannot assert, unverified entry point counts instructions (see .ad file)
3859   // vtableStubs also counts instructions in pd_code_size_limit.
3860   // Also do not verify_oop as this is called by verify_oop.
3861   if (Universe::narrow_oop_shift() != 0) {
3862     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3863     if (Universe::narrow_oop_base() != NULL) {
3864       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3865     } else {
3866       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3867     }
3868   } else {
3869     assert (Universe::narrow_oop_base() == NULL, "sanity");
3870     if (dst != src) {
3871       mov(dst, src);
3872     }
3873   }
3874 }
3875 
3876 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3877   if (Universe::narrow_klass_base() == NULL) {
3878     if (Universe::narrow_klass_shift() != 0) {
3879       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3880       lsr(dst, src, LogKlassAlignmentInBytes);
3881     } else {
3882       if (dst != src) mov(dst, src);
3883     }
3884     return;
3885   }
3886 
3887   if (use_XOR_for_compressed_class_base) {
3888     if (Universe::narrow_klass_shift() != 0) {
3889       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3890       lsr(dst, dst, LogKlassAlignmentInBytes);
3891     } else {
3892       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3893     }
3894     return;
3895   }
3896 
3897   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3898       && Universe::narrow_klass_shift() == 0) {
3899     movw(dst, src);
3900     return;
3901   }
3902 
3903 #ifdef ASSERT
3904   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3905 #endif
3906 
3907   Register rbase = dst;
3908   if (dst == src) rbase = rheapbase;
3909   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3910   sub(dst, src, rbase);
3911   if (Universe::narrow_klass_shift() != 0) {
3912     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3913     lsr(dst, dst, LogKlassAlignmentInBytes);
3914   }
3915   if (dst == src) reinit_heapbase();
3916 }
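     // The eor path above relies on the narrow-klass base sharing no bits with
     // any shifted klass offset (e.g., illustratively, a power-of-two base of
     // 0x800000000 with all offsets below 2^35): then base + x == base ^ x, so
     // the same eor both applies and removes the base.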
3917 
3918 void MacroAssembler::encode_klass_not_null(Register r) {
3919   encode_klass_not_null(r, r);
3920 }
3921 
3922 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3923   Register rbase = dst;
3924   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3925 
3926   if (Universe::narrow_klass_base() == NULL) {
3927     if (Universe::narrow_klass_shift() != 0) {
3928       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3929       lsl(dst, src, LogKlassAlignmentInBytes);
3930     } else {
3931       if (dst != src) mov(dst, src);
3932     }
3933     return;
3934   }
3935 
3936   if (use_XOR_for_compressed_class_base) {
3937     if (Universe::narrow_klass_shift() != 0) {
3938       lsl(dst, src, LogKlassAlignmentInBytes);
3939       eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
3940     } else {
3941       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3942     }
3943     return;
3944   }
3945 
3946   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3947       && Universe::narrow_klass_shift() == 0) {
3948     if (dst != src)
3949       movw(dst, src);
3950     movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
3951     return;
3952   }
3953 
3954   // Cannot assert, unverified entry point counts instructions (see .ad file)
3955   // vtableStubs also counts instructions in pd_code_size_limit.
3956   // Also do not verify_oop as this is called by verify_oop.
3957   if (dst == src) rbase = rheapbase;
3958   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3959   if (Universe::narrow_klass_shift() != 0) {
3960     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3961     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
3962   } else {
3963     add(dst, rbase, src);
3964   }
3965   if (dst == src) reinit_heapbase();
3966 }
3967 
3968 void  MacroAssembler::decode_klass_not_null(Register r) {
3969   decode_klass_not_null(r, r);
3970 }
3971 
3972 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3973 #ifdef ASSERT
3974   {
3975     ThreadInVMfromUnknown tiv;
3976     assert (UseCompressedOops, "should only be used for compressed oops");
3977     assert (Universe::heap() != NULL, "java heap should be initialized");
3978     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3979     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3980   }
3981 #endif
3982   int oop_index = oop_recorder()->find_index(obj);
3983   InstructionMark im(this);
3984   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3985   code_section()->relocate(inst_mark(), rspec);
3986   movz(dst, 0xDEAD, 16);
3987   movk(dst, 0xBEEF);
3988 }
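     // The 0xDEAD/0xBEEF immediate above is only a recognizable placeholder:
     // the oop relocation recorded against this instruction pair allows the
     // real narrow oop bits to be patched into the movz/movk later.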
3989 
3990 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3991   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3992   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3993   int index = oop_recorder()->find_index(k);
3994   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
3995 
3996   InstructionMark im(this);
3997   RelocationHolder rspec = metadata_Relocation::spec(index);
3998   code_section()->relocate(inst_mark(), rspec);
3999   narrowKlass nk = Klass::encode_klass(k);
4000   movz(dst, (nk >> 16), 16);
4001   movk(dst, nk & 0xffff);
4002 }
4003 
4004 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4005                                     Register dst, Address src,
4006                                     Register tmp1, Register thread_tmp) {
4007   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4008   decorators = AccessInternal::decorator_fixup(decorators);
4009   bool as_raw = (decorators & AS_RAW) != 0;
4010   if (as_raw) {
4011     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4012   } else {
4013     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4014   }
4015 }
4016 
4017 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4018                                      Address dst, Register src,
4019                                      Register tmp1, Register thread_tmp) {
4020   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4021   decorators = AccessInternal::decorator_fixup(decorators);
4022   bool as_raw = (decorators & AS_RAW) != 0;
4023   if (as_raw) {
4024     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4025   } else {
4026     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4027   }
4028 }
4029 
4030 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
4031   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
4032   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
4033     decorators |= ACCESS_READ | ACCESS_WRITE;
4034   }
4035   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4036   return bs->resolve(this, decorators, obj);
4037 }
4038 
4039 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4040                                    Register thread_tmp, DecoratorSet decorators) {
4041   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4042 }
4043 
4044 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4045                                             Register thread_tmp, DecoratorSet decorators) {
4046   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4047 }
4048 
4049 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4050                                     Register thread_tmp, DecoratorSet decorators) {
4051   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4052 }
4053 
4054 // Used for storing NULLs.
4055 void MacroAssembler::store_heap_oop_null(Address dst) {
4056   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4057 }
4058 
4059 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4060   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4061   int index = oop_recorder()->allocate_metadata_index(obj);
4062   RelocationHolder rspec = metadata_Relocation::spec(index);
4063   return Address((address)obj, rspec);
4064 }
4065 
4066 // Move an oop into a register.  immediate is true if we want
4067 // immediate instructions, i.e. we are not going to patch this
4068 // instruction while the code is being executed by another thread.  In
4069 // that case we can use move immediates rather than the constant pool.
4070 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4071   int oop_index;
4072   if (obj == NULL) {
4073     oop_index = oop_recorder()->allocate_oop_index(obj);
4074   } else {
4075 #ifdef ASSERT
4076     {
4077       ThreadInVMfromUnknown tiv;
4078       assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
4079     }
4080 #endif
4081     oop_index = oop_recorder()->find_index(obj);
4082   }
4083   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4084   if (! immediate) {
4085     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4086     ldr_constant(dst, Address(dummy, rspec));
4087   } else
4088     mov(dst, Address((address)obj, rspec));
4089 }
4090 
4091 // Move a metadata address into a register.
4092 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4093   int oop_index;
4094   if (obj == NULL) {
4095     oop_index = oop_recorder()->allocate_metadata_index(obj);
4096   } else {
4097     oop_index = oop_recorder()->find_index(obj);
4098   }
4099   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4100   mov(dst, Address((address)obj, rspec));
4101 }
4102 
4103 Address MacroAssembler::constant_oop_address(jobject obj) {
4104 #ifdef ASSERT
4105   {
4106     ThreadInVMfromUnknown tiv;
4107     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4108     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
4109   }
4110 #endif
4111   int oop_index = oop_recorder()->find_index(obj);
4112   return Address((address)obj, oop_Relocation::spec(oop_index));
4113 }
4114 
4115 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4116 void MacroAssembler::tlab_allocate(Register obj,
4117                                    Register var_size_in_bytes,
4118                                    int con_size_in_bytes,
4119                                    Register t1,
4120                                    Register t2,
4121                                    Label& slow_case) {
4122   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4123   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4124 }
4125 
4126 // Defines obj, preserves var_size_in_bytes
4127 void MacroAssembler::eden_allocate(Register obj,
4128                                    Register var_size_in_bytes,
4129                                    int con_size_in_bytes,
4130                                    Register t1,
4131                                    Label& slow_case) {
4132   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4133   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4134 }
4135 
4136 // Zero words; len is in bytes
4137 // Destroys all registers except addr
4138 // len must be a nonzero multiple of wordSize
4139 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4140   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4141 
4142 #ifdef ASSERT
4143   { Label L;
4144     tst(len, BytesPerWord - 1);
4145     br(Assembler::EQ, L);
4146     stop("len is not a multiple of BytesPerWord");
4147     bind(L);
4148   }
4149 #endif
4150 
4151 #ifndef PRODUCT
4152   block_comment("zero memory");
4153 #endif
4154 
4155   Label loop;
4156   Label entry;
4157 
4158 //  Algorithm:
4159 //
4160 //    scratch1 = cnt & 7;
4161 //    cnt -= scratch1;
4162 //    p += scratch1;
4163 //    switch (scratch1) {
4164 //      do {
4165 //        cnt -= 8;
4166 //          p[-8] = 0;
4167 //        case 7:
4168 //          p[-7] = 0;
4169 //        case 6:
4170 //          p[-6] = 0;
4171 //          // ...
4172 //        case 1:
4173 //          p[-1] = 0;
4174 //        case 0:
4175 //          p += 8;
4176 //      } while (cnt);
4177 //    }
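     //  The computed branch below works like a Duff's device: each str is 4
     //  bytes, so branching (cnt % 8) * 4 bytes back from `entry' executes
     //  just the last (cnt % 8) stores of the unrolled block, zeroing the
     //  first (cnt % 8) words of the region on the first pass.  E.g.
     //  (illustrative) for cnt % 8 == 3 we enter three instructions before
     //  `entry' and zero three words before continuing with full passes.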
4178 
4179   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4180 
4181   lsr(len, len, LogBytesPerWord);
4182   andr(rscratch1, len, unroll - 1);  // rscratch1 = cnt % unroll
4183   sub(len, len, rscratch1);          // cnt -= cnt % unroll
4184   // t1 always points to the end of the region we're about to zero
4185   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
4186   adr(rscratch2, entry);
4187   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4188   br(rscratch2);
4189   bind(loop);
4190   sub(len, len, unroll);
4191   for (int i = -unroll; i < 0; i++)
4192     Assembler::str(zr, Address(t1, i * wordSize));
4193   bind(entry);
4194   add(t1, t1, unroll * wordSize);
4195   cbnz(len, loop);
4196 }
4197 
4198 void MacroAssembler::verify_tlab() {
4199 #ifdef ASSERT
4200   if (UseTLAB && VerifyOops) {
4201     Label next, ok;
4202 
4203     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4204 
4205     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4206     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4207     cmp(rscratch2, rscratch1);
4208     br(Assembler::HS, next);
4209     STOP("assert(top >= start)");
4210     should_not_reach_here();
4211 
4212     bind(next);
4213     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4214     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4215     cmp(rscratch2, rscratch1);
4216     br(Assembler::HS, ok);
4217     STOP("assert(top <= end)");
4218     should_not_reach_here();
4219 
4220     bind(ok);
4221     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4222   }
4223 #endif
4224 }
4225 
4226 // Writes to successive stack pages until the given offset is reached, to
4227 // check for stack overflow and the shadow pages.  This clobbers tmp.
4228 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4229   assert_different_registers(tmp, size, rscratch1);
4230   mov(tmp, sp);
4231   // Bang stack for total size given plus shadow page size.
4232   // Bang one page at a time because large size can bang beyond yellow and
4233   // red zones.
4234   Label loop;
4235   mov(rscratch1, os::vm_page_size());
4236   bind(loop);
4237   lea(tmp, Address(tmp, -os::vm_page_size()));
4238   subsw(size, size, rscratch1);
4239   str(size, Address(tmp));
4240   br(Assembler::GT, loop);
4241 
4242   // Bang down shadow pages too.
4243   // At this point, (tmp-0) is the last address touched, so don't
4244   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4245   // was post-decremented.)  Skip this address by starting at i=1, and
4246   // touch a few more pages below.  N.B.  It is important to touch all
4247   // the way down to and including i=StackShadowPages.
4248   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
4249     // This could be any sized move, but since it can serve as a
4250     // debugging crumb, the bigger the better.
4251     lea(tmp, Address(tmp, -os::vm_page_size()));
4252     str(size, Address(tmp));
4253   }
4254 }
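     // E.g. (illustrative) with a 4K page size and size = 16K, the loop above
     // touches sp-4K, sp-8K, sp-12K and sp-16K; with an 8-page shadow zone the
     // second loop then touches seven further pages below the last address.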
4255 
4256 
4257 // Move the address of the polling page into dest.
4258 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4259   if (SafepointMechanism::uses_thread_local_poll()) {
4260     ldr(dest, Address(rthread, Thread::polling_page_offset()));
4261   } else {
4262     unsigned long off;
4263     adrp(dest, Address(page, rtype), off);
4264     assert(off == 0, "polling page must be page aligned");
4265   }
4266 }
4267 
4268 // Move the address of the polling page into r, then read the polling
4269 // page.
4270 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4271   get_polling_page(r, page, rtype);
4272   return read_polling_page(r, rtype);
4273 }
4274 
4275 // Read the polling page.  The address of the polling page must
4276 // already be in r.
4277 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4278   InstructionMark im(this);
4279   code_section()->relocate(inst_mark(), rtype);
4280   ldrw(zr, Address(r, 0));
4281   return inst_mark();
4282 }
4283 
4284 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4285   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4286   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4287   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4288   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4289   long offset_low = dest_page - low_page;
4290   long offset_high = dest_page - high_page;
4291 
4292   assert(is_valid_AArch64_address(dest.target()), "bad address");
4293   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4294 
4295   InstructionMark im(this);
4296   code_section()->relocate(inst_mark(), dest.rspec());
4297   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4298   // the code cache, so that if it is relocated we know it will still reach.
4299   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4300     _adrp(reg1, dest.target());
4301   } else {
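         // The target is beyond ADRP's +/-4GB range.  Fabricate an
         // address ADRP can reach by combining the low 32 bits of the
         // target with bits 32..47 of the current pc, then overwrite
         // bits 32..47 with the target's using MOVK.  The low 12 bits
         // are returned in byte_offset for the caller to add.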
4302     unsigned long target = (unsigned long)dest.target();
4303     unsigned long adrp_target
4304       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4305 
4306     _adrp(reg1, (address)adrp_target);
4307     movk(reg1, target >> 32, 32);
4308   }
4309   byte_offset = (unsigned long)dest.target() & 0xfff;
4310 }
4311 
4312 void MacroAssembler::load_byte_map_base(Register reg) {
4313   jbyte *byte_map_base =
4314     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4315 
4316   if (is_valid_AArch64_address((address)byte_map_base)) {
4317     // Strictly speaking the byte_map_base isn't an address at all,
4318     // and it might even be negative.
4319     unsigned long offset;
4320     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4321     // We expect offset to be zero with most collectors.
4322     if (offset != 0) {
4323       add(reg, reg, offset);
4324     }
4325   } else {
4326     mov(reg, (uint64_t)byte_map_base);
4327   }
4328 }
4329 
4330 void MacroAssembler::build_frame(int framesize) {
4331   assert(framesize > 0, "framesize must be > 0");
4332   if (framesize < ((1 << 9) + 2 * wordSize)) {
4333     sub(sp, sp, framesize);
4334     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4335     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4336   } else {
4337     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4338     if (PreserveFramePointer) mov(rfp, sp);
4339     if (framesize < ((1 << 12) + 2 * wordSize))
4340       sub(sp, sp, framesize - 2 * wordSize);
4341     else {
4342       mov(rscratch1, framesize - 2 * wordSize);
4343       sub(sp, sp, rscratch1);
4344     }
4345   }
4346 }
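     // An illustrative sketch of the resulting frame layout (byte
     // offsets relative to the new sp; not authoritative):
     //
     //    sp + framesize - 8  : saved lr
     //    sp + framesize - 16 : saved rfp  <- rfp when PreserveFramePointer
     //    sp .. sp + framesize - 16 : spill/locals area
     //
     // remove_frame() below undoes this layout in reverse.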
4347 
4348 void MacroAssembler::remove_frame(int framesize) {
4349   assert(framesize > 0, "framesize must be > 0");
4350   if (framesize < ((1 << 9) + 2 * wordSize)) {
4351     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4352     add(sp, sp, framesize);
4353   } else {
4354     if (framesize < ((1 << 12) + 2 * wordSize))
4355       add(sp, sp, framesize - 2 * wordSize);
4356     else {
4357       mov(rscratch1, framesize - 2 * wordSize);
4358       add(sp, sp, rscratch1);
4359     }
4360     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4361   }
4362 }
4363 
4364 #ifdef COMPILER2
4365 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4366 
4367 // Search for str1 in str2 and return index or -1
4368 void MacroAssembler::string_indexof(Register str2, Register str1,
4369                                     Register cnt2, Register cnt1,
4370                                     Register tmp1, Register tmp2,
4371                                     Register tmp3, Register tmp4,
4372                                     Register tmp5, Register tmp6,
4373                                     int icnt1, Register result, int ae) {
4374   // NOTE: tmp5, tmp6 can be zr depending on specific method version
4375   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4376 
4377   Register ch1 = rscratch1;
4378   Register ch2 = rscratch2;
4379   Register cnt1tmp = tmp1;
4380   Register cnt2tmp = tmp2;
4381   Register cnt1_neg = cnt1;
4382   Register cnt2_neg = cnt2;
4383   Register result_tmp = tmp4;
4384 
4385   bool isL = ae == StrIntrinsicNode::LL;
4386 
4387   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4388   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4389   int str1_chr_shift = str1_isL ? 0:1;
4390   int str2_chr_shift = str2_isL ? 0:1;
4391   int str1_chr_size = str1_isL ? 1:2;
4392   int str2_chr_size = str2_isL ? 1:2;
4393   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4394                                       (chr_insn)&MacroAssembler::ldrh;
4395   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4396                                       (chr_insn)&MacroAssembler::ldrh;
4397   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4398   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4399 
4400   // Note, inline_string_indexOf() generates checks:
4401   // if (substr.count > string.count) return -1;
4402   // if (substr.count == 0) return 0;
4403 
4404   // We have two strings, a source string in str2, cnt2 and a pattern string
4405   // in str1, cnt1. Find the 1st occurrence of the pattern in the source or return -1.
4406 
4407   // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
4408   // With a small pattern and source we use a linear scan.
4409 
4410   if (icnt1 == -1) {
4411     sub(result_tmp, cnt2, cnt1);
4412     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4413     br(LT, LINEARSEARCH);
4414     dup(v0, T16B, cnt1); // broadcast pattern length for the bc[] init below; separate FPU pipeline, almost no penalty
4415     subs(zr, cnt1, 256);
4416     lsr(tmp1, cnt2, 2);
4417     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
4418     br(GE, LINEARSTUB);
4419   }
4420 
4421 // The Boyer-Moore algorithm is based on the description here:
4422 //
4423 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4424 //
4425 // This describes an algorithm with 2 shift rules: the 'Bad Character' rule
4426 // and the 'Good Suffix' rule.
4427 //
4428 // These rules are essentially heuristics for how far we can shift the
4429 // pattern along the search string.
4430 //
4431 // The implementation here uses the 'Bad Character' rule only because of the
4432 // complexity of initialisation for the 'Good Suffix' rule.
4433 //
4434 // This is also known as the Boyer-Moore-Horspool algorithm:
4435 //
4436 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4437 //
4438 // This particular implementation has a few Java-specific optimizations.
4439 //
4440 // #define ASIZE 256
4441 //
4442 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4443 //       int i, j;
4444 //       unsigned c;
4445 //       unsigned char bc[ASIZE];
4446 //
4447 //       /* Preprocessing */
4448 //       for (i = 0; i < ASIZE; ++i)
4449 //          bc[i] = m;
4450 //       for (i = 0; i < m - 1; ) {
4451 //          c = x[i];
4452 //          ++i;
4453 //          // c < 256 for Latin1 string, so, no need for branch
4454 //          #ifdef PATTERN_STRING_IS_LATIN1
4455 //          bc[c] = m - i;
4456 //          #else
4457 //          if (c < ASIZE) bc[c] = m - i;
4458 //          #endif
4459 //       }
4460 //
4461 //       /* Searching */
4462 //       j = 0;
4463 //       while (j <= n - m) {
4464 //          c = y[j+m-1];
4465 //          if (x[m-1] == c)
4466 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
4467 //          if (i < 0) return j;
4468 //          // c < 256 for Latin1 string, so, no need for branch
4469 //          #ifdef SOURCE_STRING_IS_LATIN1
4470 //          // LL case: (c < 256) always true. Remove branch
4471 //          j += bc[y[j+m-1]];
4472 //          #endif
4473 //          #ifndef PATTERN_STRING_IS_UTF
4474 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
4475 //          if (c < ASIZE)
4476 //            j += bc[y[j+m-1]];
4477 //          else
4478 //            j += 1;
4479 //          #endif
4480 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
4481 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
4482 //          if (c < ASIZE)
4483 //            j += bc[y[j+m-1]];
4484 //          else
4485 //            j += m;
4486 //          #endif
4487 //       }
4488 //    }
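     // A worked example of the bad-character table (illustrative): for
     // the pattern "abcab" (m = 5) the preprocessing loop above yields
     // bc['a'] = 1, bc['b'] = 3, bc['c'] = 2, and bc[c] = 5 for every
     // other byte c, so a mismatch against a character that does not
     // occur in the pattern shifts the whole pattern by 5 positions.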
4489 
4490   if (icnt1 == -1) {
4491     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4492         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4493     Register cnt1end = tmp2;
4494     Register str2end = cnt2;
4495     Register skipch = tmp2;
4496 
4497     // str1 length is >= 8, so we can read at least 1 register for the cases
4498     // when UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and
4499     // half a register for the UL case. We'll re-read the last character in the
4500     // inner pre-loop code to have a single outer pre-loop load
4501     const int firstStep = isL ? 7 : 3;
4502 
4503     const int ASIZE = 256;
4504     const int STORED_BYTES = 32; // number of bytes stored per instruction
4505     sub(sp, sp, ASIZE);
4506     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4507     mov(ch1, sp);
4508     BIND(BM_INIT_LOOP);
4509       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4510       subs(tmp5, tmp5, 1);
4511       br(GT, BM_INIT_LOOP);
4512 
4513       sub(cnt1tmp, cnt1, 1);
4514       mov(tmp5, str2);
4515       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4516       sub(ch2, cnt1, 1);
4517       mov(tmp3, str1);
4518     BIND(BCLOOP);
4519       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4520       if (!str1_isL) {
4521         subs(zr, ch1, ASIZE);
4522         br(HS, BCSKIP);
4523       }
4524       strb(ch2, Address(sp, ch1));
4525     BIND(BCSKIP);
4526       subs(ch2, ch2, 1);
4527       br(GT, BCLOOP);
4528 
4529       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4530       if (str1_isL == str2_isL) {
4531         // load last 8 bytes (8LL/4UU symbols)
4532         ldr(tmp6, Address(tmp6, -wordSize));
4533       } else {
4534         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
4535         // convert Latin1 to UTF. We'll have to wait until the load completes,
4536         // but it's still faster than per-character loads + checks
4537         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4538         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4539         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4540         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4541         orr(ch2, ch1, ch2, LSL, 16);
4542         orr(tmp6, tmp6, tmp3, LSL, 48);
4543         orr(tmp6, tmp6, ch2, LSL, 16);
4544       }
4545     BIND(BMLOOPSTR2);
4546       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4547       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4548       if (str1_isL == str2_isL) {
4549         // re-init tmp3. It's free because it's executed in parallel with the
4550         // load above. The alternative is to initialize it before the loop, but
4551         // that would affect performance on in-order systems with 2 or more ld/st pipelines
4552         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4553       }
4554       if (!isL) { // UU/UL case
4555         lsl(ch2, cnt1tmp, 1); // offset in bytes
4556       }
4557       cmp(tmp3, skipch);
4558       br(NE, BMSKIP);
4559       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4560       mov(ch1, tmp6);
4561       if (isL) {
4562         b(BMLOOPSTR1_AFTER_LOAD);
4563       } else {
4564         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4565         b(BMLOOPSTR1_CMP);
4566       }
4567     BIND(BMLOOPSTR1);
4568       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4569       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4570     BIND(BMLOOPSTR1_AFTER_LOAD);
4571       subs(cnt1tmp, cnt1tmp, 1);
4572       br(LT, BMLOOPSTR1_LASTCMP);
4573     BIND(BMLOOPSTR1_CMP);
4574       cmp(ch1, ch2);
4575       br(EQ, BMLOOPSTR1);
4576     BIND(BMSKIP);
4577       if (!isL) {
4578         // if we've met a UTF symbol while searching for a Latin1 pattern, then
4579         // we can skip cnt1 symbols
4580         if (str1_isL != str2_isL) {
4581           mov(result_tmp, cnt1);
4582         } else {
4583           mov(result_tmp, 1);
4584         }
4585         subs(zr, skipch, ASIZE);
4586         br(HS, BMADV);
4587       }
4588       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4589     BIND(BMADV);
4590       sub(cnt1tmp, cnt1, 1);
4591       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4592       cmp(str2, str2end);
4593       br(LE, BMLOOPSTR2);
4594       add(sp, sp, ASIZE);
4595       b(NOMATCH);
4596     BIND(BMLOOPSTR1_LASTCMP);
4597       cmp(ch1, ch2);
4598       br(NE, BMSKIP);
4599     BIND(BMMATCH);
4600       sub(result, str2, tmp5);
4601       if (!str2_isL) lsr(result, result, 1);
4602       add(sp, sp, ASIZE);
4603       b(DONE);
4604 
4605     BIND(LINEARSTUB);
4606     cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
4607     br(LT, LINEAR_MEDIUM);
4608     mov(result, zr);
4609     RuntimeAddress stub = NULL;
4610     if (isL) {
4611       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4612       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4613     } else if (str1_isL) {
4614       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
4615       assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4616     } else {
4617       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4618       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4619     }
4620     trampoline_call(stub);
4621     b(DONE);
4622   }
4623 
4624   BIND(LINEARSEARCH);
4625   {
4626     Label DO1, DO2, DO3;
4627 
4628     Register str2tmp = tmp2;
4629     Register first = tmp3;
4630 
4631     if (icnt1 == -1)
4632     {
4633         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4634 
4635         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
4636         br(LT, DOSHORT);
4637       BIND(LINEAR_MEDIUM);
4638         (this->*str1_load_1chr)(first, Address(str1));
4639         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4640         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4641         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4642         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4643 
4644       BIND(FIRST_LOOP);
4645         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4646         cmp(first, ch2);
4647         br(EQ, STR1_LOOP);
4648       BIND(STR2_NEXT);
4649         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4650         br(LE, FIRST_LOOP);
4651         b(NOMATCH);
4652 
4653       BIND(STR1_LOOP);
4654         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4655         add(cnt2tmp, cnt2_neg, str2_chr_size);
4656         br(GE, MATCH);
4657 
4658       BIND(STR1_NEXT);
4659         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4660         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4661         cmp(ch1, ch2);
4662         br(NE, STR2_NEXT);
4663         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4664         add(cnt2tmp, cnt2tmp, str2_chr_size);
4665         br(LT, STR1_NEXT);
4666         b(MATCH);
4667 
4668       BIND(DOSHORT);
4669       if (str1_isL == str2_isL) {
4670         cmp(cnt1, (u1)2);
4671         br(LT, DO1);
4672         br(GT, DO3);
4673       }
4674     }
4675 
4676     if (icnt1 == 4) {
4677       Label CH1_LOOP;
4678 
4679         (this->*load_4chr)(ch1, str1);
4680         sub(result_tmp, cnt2, 4);
4681         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4682         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4683 
4684       BIND(CH1_LOOP);
4685         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4686         cmp(ch1, ch2);
4687         br(EQ, MATCH);
4688         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4689         br(LE, CH1_LOOP);
4690         b(NOMATCH);
4691     }
4692 
4693     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4694       Label CH1_LOOP;
4695 
4696       BIND(DO2);
4697         (this->*load_2chr)(ch1, str1);
4698         if (icnt1 == 2) {
4699           sub(result_tmp, cnt2, 2);
4700         }
4701         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4702         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4703       BIND(CH1_LOOP);
4704         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4705         cmp(ch1, ch2);
4706         br(EQ, MATCH);
4707         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4708         br(LE, CH1_LOOP);
4709         b(NOMATCH);
4710     }
4711 
4712     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4713       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4714 
4715       BIND(DO3);
4716         (this->*load_2chr)(first, str1);
4717         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4718         if (icnt1 == 3) {
4719           sub(result_tmp, cnt2, 3);
4720         }
4721         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4722         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4723       BIND(FIRST_LOOP);
4724         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4725         cmpw(first, ch2);
4726         br(EQ, STR1_LOOP);
4727       BIND(STR2_NEXT);
4728         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4729         br(LE, FIRST_LOOP);
4730         b(NOMATCH);
4731 
4732       BIND(STR1_LOOP);
4733         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4734         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4735         cmp(ch1, ch2);
4736         br(NE, STR2_NEXT);
4737         b(MATCH);
4738     }
4739 
4740     if (icnt1 == -1 || icnt1 == 1) {
4741       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4742 
4743       BIND(DO1);
4744         (this->*str1_load_1chr)(ch1, str1);
4745         cmp(cnt2, (u1)8);
4746         br(LT, DO1_SHORT);
4747 
4748         sub(result_tmp, cnt2, 8/str2_chr_size);
4749         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4750         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4751         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4752 
4753         if (str2_isL) {
4754           orr(ch1, ch1, ch1, LSL, 8);
4755         }
4756         orr(ch1, ch1, ch1, LSL, 16);
4757         orr(ch1, ch1, ch1, LSL, 32);
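             // SWAR search: ch1 now holds the pattern character
             // broadcast across the whole register.  XOR-ing it with 8
             // bytes (or 4 chars) of the source turns each matching
             // lane into zero; the classic haszero trick,
             // (v - 0x01..01) & ~v & 0x80..80, then flags the zero
             // lanes (the orr/bics pair below computes the
             // "& ~v & 0x80..80" part).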
4758       BIND(CH1_LOOP);
4759         ldr(ch2, Address(str2, cnt2_neg));
4760         eor(ch2, ch1, ch2);
4761         sub(tmp1, ch2, tmp3);
4762         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4763         bics(tmp1, tmp1, tmp2);
4764         br(NE, HAS_ZERO);
4765         adds(cnt2_neg, cnt2_neg, 8);
4766         br(LT, CH1_LOOP);
4767 
4768         cmp(cnt2_neg, (u1)8);
4769         mov(cnt2_neg, 0);
4770         br(LT, CH1_LOOP);
4771         b(NOMATCH);
4772 
4773       BIND(HAS_ZERO);
4774         rev(tmp1, tmp1);
4775         clz(tmp1, tmp1);
4776         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4777         b(MATCH);
4778 
4779       BIND(DO1_SHORT);
4780         mov(result_tmp, cnt2);
4781         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4782         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4783       BIND(DO1_LOOP);
4784         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4785         cmpw(ch1, ch2);
4786         br(EQ, MATCH);
4787         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4788         br(LT, DO1_LOOP);
4789     }
4790   }
4791   BIND(NOMATCH);
4792     mov(result, -1);
4793     b(DONE);
4794   BIND(MATCH);
4795     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4796   BIND(DONE);
4797 }
4798 
4799 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4800 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4801 
4802 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4803                                          Register ch, Register result,
4804                                          Register tmp1, Register tmp2, Register tmp3)
4805 {
4806   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4807   Register cnt1_neg = cnt1;
4808   Register ch1 = rscratch1;
4809   Register result_tmp = rscratch2;
4810 
4811   cmp(cnt1, (u1)4);
4812   br(LT, DO1_SHORT);
4813 
4814   orr(ch, ch, ch, LSL, 16);
4815   orr(ch, ch, ch, LSL, 32);
4816 
4817   sub(cnt1, cnt1, 4);
4818   mov(result_tmp, cnt1);
4819   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4820   sub(cnt1_neg, zr, cnt1, LSL, 1);
4821 
4822   mov(tmp3, 0x0001000100010001);
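       // Same SWAR haszero trick as in string_indexof above, here with
       // 16-bit lanes.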
4823 
4824   BIND(CH1_LOOP);
4825     ldr(ch1, Address(str1, cnt1_neg));
4826     eor(ch1, ch, ch1);
4827     sub(tmp1, ch1, tmp3);
4828     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4829     bics(tmp1, tmp1, tmp2);
4830     br(NE, HAS_ZERO);
4831     adds(cnt1_neg, cnt1_neg, 8);
4832     br(LT, CH1_LOOP);
4833 
4834     cmp(cnt1_neg, (u1)8);
4835     mov(cnt1_neg, 0);
4836     br(LT, CH1_LOOP);
4837     b(NOMATCH);
4838 
4839   BIND(HAS_ZERO);
4840     rev(tmp1, tmp1);
4841     clz(tmp1, tmp1);
4842     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4843     b(MATCH);
4844 
4845   BIND(DO1_SHORT);
4846     mov(result_tmp, cnt1);
4847     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4848     sub(cnt1_neg, zr, cnt1, LSL, 1);
4849   BIND(DO1_LOOP);
4850     ldrh(ch1, Address(str1, cnt1_neg));
4851     cmpw(ch, ch1);
4852     br(EQ, MATCH);
4853     adds(cnt1_neg, cnt1_neg, 2);
4854     br(LT, DO1_LOOP);
4855   BIND(NOMATCH);
4856     mov(result, -1);
4857     b(DONE);
4858   BIND(MATCH);
4859     add(result, result_tmp, cnt1_neg, ASR, 1);
4860   BIND(DONE);
4861 }
4862 
4863 // Compare strings.
4864 void MacroAssembler::string_compare(Register str1, Register str2,
4865     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4866     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4867   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4868       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4869       SHORT_LOOP_START, TAIL_CHECK;
4870 
4871   const u1 STUB_THRESHOLD = 64 + 8;
4872   bool isLL = ae == StrIntrinsicNode::LL;
4873   bool isLU = ae == StrIntrinsicNode::LU;
4874   bool isUL = ae == StrIntrinsicNode::UL;
4875 
4876   bool str1_isL = isLL || isLU;
4877   bool str2_isL = isLL || isUL;
4878 
4879   int str1_chr_shift = str1_isL ? 0 : 1;
4880   int str2_chr_shift = str2_isL ? 0 : 1;
4881   int str1_chr_size = str1_isL ? 1 : 2;
4882   int str2_chr_size = str2_isL ? 1 : 2;
4883   int minCharsInWord = isLL ? wordSize : wordSize/2;
4884 
4885   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4886   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4887                                       (chr_insn)&MacroAssembler::ldrh;
4888   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4889                                       (chr_insn)&MacroAssembler::ldrh;
4890   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4891                             (uxt_insn)&MacroAssembler::uxthw;
4892 
4893   BLOCK_COMMENT("string_compare {");
4894 
4895   // Bizarrely, the counts are passed in bytes, regardless of whether they
4896   // are L or U strings; however, the result is always in characters.
4897   if (!str1_isL) asrw(cnt1, cnt1, 1);
4898   if (!str2_isL) asrw(cnt2, cnt2, 1);
4899 
4900   // Compute the minimum of the string lengths and save the difference.
4901   subsw(result, cnt1, cnt2);
4902   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4903 
4904   // A very short string
4905   cmpw(cnt2, minCharsInWord);
4906   br(Assembler::LT, SHORT_STRING);
4907 
4908   // Compare longwords
4909   // load first parts of strings and finish initialization while loading
4910   {
4911     if (str1_isL == str2_isL) { // LL or UU
4912       ldr(tmp1, Address(str1));
4913       cmp(str1, str2);
4914       br(Assembler::EQ, DONE);
4915       ldr(tmp2, Address(str2));
4916       cmp(cnt2, STUB_THRESHOLD);
4917       br(GE, STUB);
4918       subsw(cnt2, cnt2, minCharsInWord);
4919       br(EQ, TAIL_CHECK);
4920       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4921       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4922       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4923     } else if (isLU) {
4924       ldrs(vtmp, Address(str1));
4925       cmp(str1, str2);
4926       br(Assembler::EQ, DONE);
4927       ldr(tmp2, Address(str2));
4928       cmp(cnt2, STUB_THRESHOLD);
4929       br(GE, STUB);
4930       subsw(cnt2, cnt2, 4);
4931       br(EQ, TAIL_CHECK);
4932       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4933       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4934       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4935       zip1(vtmp, T8B, vtmp, vtmpZ);
4936       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4937       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4938       add(cnt1, cnt1, 4);
4939       fmovd(tmp1, vtmp);
4940     } else { // UL case
4941       ldr(tmp1, Address(str1));
4942       cmp(str1, str2);
4943       br(Assembler::EQ, DONE);
4944       ldrs(vtmp, Address(str2));
4945       cmp(cnt2, STUB_THRESHOLD);
4946       br(GE, STUB);
4947       subsw(cnt2, cnt2, 4);
4948       br(EQ, TAIL_CHECK);
4949       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4950       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4951       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4952       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4953       zip1(vtmp, T8B, vtmp, vtmpZ);
4954       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4955       add(cnt1, cnt1, 8);
4956       fmovd(tmp2, vtmp);
4957     }
4958     adds(cnt2, cnt2, isUL ? 4 : 8);
4959     br(GE, TAIL);
4960     eor(rscratch2, tmp1, tmp2);
4961     cbnz(rscratch2, DIFFERENCE);
4962     // main loop
4963     bind(NEXT_WORD);
4964     if (str1_isL == str2_isL) {
4965       ldr(tmp1, Address(str1, cnt2));
4966       ldr(tmp2, Address(str2, cnt2));
4967       adds(cnt2, cnt2, 8);
4968     } else if (isLU) {
4969       ldrs(vtmp, Address(str1, cnt1));
4970       ldr(tmp2, Address(str2, cnt2));
4971       add(cnt1, cnt1, 4);
4972       zip1(vtmp, T8B, vtmp, vtmpZ);
4973       fmovd(tmp1, vtmp);
4974       adds(cnt2, cnt2, 8);
4975     } else { // UL
4976       ldrs(vtmp, Address(str2, cnt2));
4977       ldr(tmp1, Address(str1, cnt1));
4978       zip1(vtmp, T8B, vtmp, vtmpZ);
4979       add(cnt1, cnt1, 8);
4980       fmovd(tmp2, vtmp);
4981       adds(cnt2, cnt2, 4);
4982     }
4983     br(GE, TAIL);
4984 
4985     eor(rscratch2, tmp1, tmp2);
4986     cbz(rscratch2, NEXT_WORD);
4987     b(DIFFERENCE);
4988     bind(TAIL);
4989     eor(rscratch2, tmp1, tmp2);
4990     cbnz(rscratch2, DIFFERENCE);
4991     // Last longword.  In the case where length == 4 we compare the
4992     // same longword twice, but that's still faster than another
4993     // conditional branch.
4994     if (str1_isL == str2_isL) {
4995       ldr(tmp1, Address(str1));
4996       ldr(tmp2, Address(str2));
4997     } else if (isLU) {
4998       ldrs(vtmp, Address(str1));
4999       ldr(tmp2, Address(str2));
5000       zip1(vtmp, T8B, vtmp, vtmpZ);
5001       fmovd(tmp1, vtmp);
5002     } else { // UL
5003       ldrs(vtmp, Address(str2));
5004       ldr(tmp1, Address(str1));
5005       zip1(vtmp, T8B, vtmp, vtmpZ);
5006       fmovd(tmp2, vtmp);
5007     }
5008     bind(TAIL_CHECK);
5009     eor(rscratch2, tmp1, tmp2);
5010     cbz(rscratch2, DONE);
5011 
5012     // Find the first different characters in the longwords and
5013     // compute their difference.
5014     bind(DIFFERENCE);
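         // rscratch2 = tmp1 ^ tmp2 is non-zero here.  rev + clz yield
         // the bit index of the first differing byte in memory order;
         // rounding down to a character boundary (-8 for Latin1, -16
         // for UTF-16) and shifting both words right by that amount
         // brings the first differing character to bit 0 on each side.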
5015     rev(rscratch2, rscratch2);
5016     clz(rscratch2, rscratch2);
5017     andr(rscratch2, rscratch2, isLL ? -8 : -16);
5018     lsrv(tmp1, tmp1, rscratch2);
5019     (this->*ext_chr)(tmp1, tmp1);
5020     lsrv(tmp2, tmp2, rscratch2);
5021     (this->*ext_chr)(tmp2, tmp2);
5022     subw(result, tmp1, tmp2);
5023     b(DONE);
5024   }
5025 
5026   bind(STUB);
5027     RuntimeAddress stub = NULL;
5028     switch(ae) {
5029       case StrIntrinsicNode::LL:
5030         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
5031         break;
5032       case StrIntrinsicNode::UU:
5033         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
5034         break;
5035       case StrIntrinsicNode::LU:
5036         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
5037         break;
5038       case StrIntrinsicNode::UL:
5039         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
5040         break;
5041       default:
5042         ShouldNotReachHere();
5043      }
5044     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
5045     trampoline_call(stub);
5046     b(DONE);
5047 
5048   bind(SHORT_STRING);
5049   // Is the minimum length zero?
5050   cbz(cnt2, DONE);
5051   // arrange the code to do most branches while loading, and to load the
5052   // next characters while comparing the previous ones
5053   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5054   subs(cnt2, cnt2, 1);
5055   br(EQ, SHORT_LAST_INIT);
5056   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5057   b(SHORT_LOOP_START);
5058   bind(SHORT_LOOP);
5059   subs(cnt2, cnt2, 1);
5060   br(EQ, SHORT_LAST);
5061   bind(SHORT_LOOP_START);
5062   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
5063   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
5064   cmp(tmp1, cnt1);
5065   br(NE, SHORT_LOOP_TAIL);
5066   subs(cnt2, cnt2, 1);
5067   br(EQ, SHORT_LAST2);
5068   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5069   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5070   cmp(tmp2, rscratch1);
5071   br(EQ, SHORT_LOOP);
5072   sub(result, tmp2, rscratch1);
5073   b(DONE);
5074   bind(SHORT_LOOP_TAIL);
5075   sub(result, tmp1, cnt1);
5076   b(DONE);
5077   bind(SHORT_LAST2);
5078   cmp(tmp2, rscratch1);
5079   br(EQ, DONE);
5080   sub(result, tmp2, rscratch1);
5081 
5082   b(DONE);
5083   bind(SHORT_LAST_INIT);
5084   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5085   bind(SHORT_LAST);
5086   cmp(tmp1, cnt1);
5087   br(EQ, DONE);
5088   sub(result, tmp1, cnt1);
5089 
5090   bind(DONE);
5091 
5092   BLOCK_COMMENT("} string_compare");
5093 }
5094 #endif // COMPILER2
5095 
5096 // This method checks whether the provided byte array contains a byte with the highest bit set.
5097 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
5098     // The simple and most common case, a small aligned array that is not at
5099     // the end of a memory page, is handled here. All other cases go to the stub.
5100     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
5101     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5102     assert_different_registers(ary1, len, result);
5103 
5104     cmpw(len, 0);
5105     br(LE, SET_RESULT);
5106     cmpw(len, 4 * wordSize);
5107     br(GE, STUB_LONG); // if size > 32, go to the stub
5108 
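         // Decide whether a 32-byte read from ary1 stays within one
         // page: shift the address left so the in-page offset occupies
         // the top bits, then add the shifted read size; a carry out
         // means the read would cross the page boundary.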
5109     int shift = 64 - exact_log2(os::vm_page_size());
5110     lsl(rscratch1, ary1, shift);
5111     mov(rscratch2, (size_t)(4 * wordSize) << shift);
5112     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
5113     br(CS, STUB); // if at the end of the page, go to the stub
5114     subs(len, len, wordSize);
5115     br(LT, END);
5116 
5117   BIND(LOOP);
5118     ldr(rscratch1, Address(post(ary1, wordSize)));
5119     tst(rscratch1, UPPER_BIT_MASK);
5120     br(NE, SET_RESULT);
5121     subs(len, len, wordSize);
5122     br(GE, LOOP);
5123     cmpw(len, -wordSize);
5124     br(EQ, SET_RESULT);
5125 
5126   BIND(END);
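         // Fewer than 8 bytes remain.  Load a full word anyway (the
         // page check above guarantees the extra bytes are readable),
         // then shift out the bytes beyond the array so only valid
         // bytes are tested against the mask.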
5127     ldr(result, Address(ary1));
5128     sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5129     lslv(result, result, len);
5130     tst(result, UPPER_BIT_MASK);
5131     b(SET_RESULT);
5132 
5133   BIND(STUB);
5134     RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives());
5135     assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5136     trampoline_call(has_neg);
5137     b(DONE);
5138 
5139   BIND(STUB_LONG);
5140     RuntimeAddress has_neg_long = RuntimeAddress(
5141             StubRoutines::aarch64::has_negatives_long());
5142     assert(has_neg_long.target() != NULL, "has_negatives_long stub has not been generated");
5143     trampoline_call(has_neg_long);
5144     b(DONE);
5145 
5146   BIND(SET_RESULT);
5147     cset(result, NE); // set true or false
5148 
5149   BIND(DONE);
5150 }
5151 
5152 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5153                                    Register tmp4, Register tmp5, Register result,
5154                                    Register cnt1, int elem_size) {
5155   Label DONE, SAME;
5156   Register tmp1 = rscratch1;
5157   Register tmp2 = rscratch2;
5158   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5159   int elem_per_word = wordSize/elem_size;
5160   int log_elem_size = exact_log2(elem_size);
5161   int length_offset = arrayOopDesc::length_offset_in_bytes();
5162   int base_offset
5163     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5164   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5165 
5166   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5167   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5168 
5169 #ifndef PRODUCT
5170   {
5171     const char kind = (elem_size == 2) ? 'U' : 'L';
5172     char comment[64];
5173     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5174     BLOCK_COMMENT(comment);
5175   }
5176 #endif
5177 
5178   // if (a1 == a2)
5179   //     return true;
5180   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5181   br(EQ, SAME);
5182 
5183   if (UseSimpleArrayEquals) {
5184     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5185     // if (a1 == null || a2 == null)
5186     //     return false;
5187     // a1 & a2 == 0 means (some pointer is null) or
5188     // (very-rare-or-even-probably-impossible pointer values),
5189     // so we can save one branch in most cases
5190     tst(a1, a2);
5191     mov(result, false);
5192     br(EQ, A_MIGHT_BE_NULL);
5193     // if (a1.length != a2.length)
5194     //      return false;
5195     bind(A_IS_NOT_NULL);
5196     ldrw(cnt1, Address(a1, length_offset));
5197     ldrw(cnt2, Address(a2, length_offset));
5198     eorw(tmp5, cnt1, cnt2);
5199     cbnzw(tmp5, DONE);
5200     lea(a1, Address(a1, base_offset));
5201     lea(a2, Address(a2, base_offset));
5202     // Check for short strings, i.e. smaller than wordSize.
5203     subs(cnt1, cnt1, elem_per_word);
5204     br(Assembler::LT, SHORT);
5205     // Main 8 byte comparison loop.
5206     bind(NEXT_WORD); {
5207       ldr(tmp1, Address(post(a1, wordSize)));
5208       ldr(tmp2, Address(post(a2, wordSize)));
5209       subs(cnt1, cnt1, elem_per_word);
5210       eor(tmp5, tmp1, tmp2);
5211       cbnz(tmp5, DONE);
5212     } br(GT, NEXT_WORD);
5213     // Last longword.  In the case where length == 4 we compare the
5214     // same longword twice, but that's still faster than another
5215     // conditional branch.
5216     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5217     // length == 4.
5218     if (log_elem_size > 0)
5219       lsl(cnt1, cnt1, log_elem_size);
5220     ldr(tmp3, Address(a1, cnt1));
5221     ldr(tmp4, Address(a2, cnt1));
5222     eor(tmp5, tmp3, tmp4);
5223     cbnz(tmp5, DONE);
5224     b(SAME);
5225     bind(A_MIGHT_BE_NULL);
5226     // in case both a1 and a2 are not-null, proceed with loads
5227     cbz(a1, DONE);
5228     cbz(a2, DONE);
5229     b(A_IS_NOT_NULL);
5230     bind(SHORT);
5231 
5232     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5233     {
5234       ldrw(tmp1, Address(post(a1, 4)));
5235       ldrw(tmp2, Address(post(a2, 4)));
5236       eorw(tmp5, tmp1, tmp2);
5237       cbnzw(tmp5, DONE);
5238     }
5239     bind(TAIL03);
5240     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5241     {
5242       ldrh(tmp3, Address(post(a1, 2)));
5243       ldrh(tmp4, Address(post(a2, 2)));
5244       eorw(tmp5, tmp3, tmp4);
5245       cbnzw(tmp5, DONE);
5246     }
5247     bind(TAIL01);
5248     if (elem_size == 1) { // Only needed when comparing byte arrays.
5249       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5250       {
5251         ldrb(tmp1, a1);
5252         ldrb(tmp2, a2);
5253         eorw(tmp5, tmp1, tmp2);
5254         cbnzw(tmp5, DONE);
5255       }
5256     }
5257   } else {
5258     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
5259         CSET_EQ, LAST_CHECK;
5260     mov(result, false);
5261     cbz(a1, DONE);
5262     ldrw(cnt1, Address(a1, length_offset));
5263     cbz(a2, DONE);
5264     ldrw(cnt2, Address(a2, length_offset));
5265     // on most CPUs a2 is still "locked" (surprisingly) by the ldrw, so it's
5266     // faster to perform another branch before comparing a1 and a2
5267     cmp(cnt1, (u1)elem_per_word);
5268     br(LE, SHORT); // short or same
5269     ldr(tmp3, Address(pre(a1, base_offset)));
5270     subs(zr, cnt1, stubBytesThreshold);
5271     br(GE, STUB);
5272     ldr(tmp4, Address(pre(a2, base_offset)));
5273     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
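         // tmp5 = -(number of remaining payload bits); used later as a
         // left-shift count (taken mod 64) to discard the bytes of the
         // final longword that lie beyond the array tail.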
5274     cmp(cnt2, cnt1);
5275     br(NE, DONE);
5276 
5277     // Main 16 byte comparison loop with 2 exits
5278     bind(NEXT_DWORD); {
5279       ldr(tmp1, Address(pre(a1, wordSize)));
5280       ldr(tmp2, Address(pre(a2, wordSize)));
5281       subs(cnt1, cnt1, 2 * elem_per_word);
5282       br(LE, TAIL);
5283       eor(tmp4, tmp3, tmp4);
5284       cbnz(tmp4, DONE);
5285       ldr(tmp3, Address(pre(a1, wordSize)));
5286       ldr(tmp4, Address(pre(a2, wordSize)));
5287       cmp(cnt1, (u1)elem_per_word);
5288       br(LE, TAIL2);
5289       cmp(tmp1, tmp2);
5290     } br(EQ, NEXT_DWORD);
5291     b(DONE);
5292 
5293     bind(TAIL);
5294     eor(tmp4, tmp3, tmp4);
5295     eor(tmp2, tmp1, tmp2);
5296     lslv(tmp2, tmp2, tmp5);
5297     orr(tmp5, tmp4, tmp2);
5298     cmp(tmp5, zr);
5299     b(CSET_EQ);
5300 
5301     bind(TAIL2);
5302     eor(tmp2, tmp1, tmp2);
5303     cbnz(tmp2, DONE);
5304     b(LAST_CHECK);
5305 
5306     bind(STUB);
5307     ldr(tmp4, Address(pre(a2, base_offset)));
5308     cmp(cnt2, cnt1);
5309     br(NE, DONE);
5310     if (elem_size == 2) { // convert to byte counter
5311       lsl(cnt1, cnt1, 1);
5312     }
5313     eor(tmp5, tmp3, tmp4);
5314     cbnz(tmp5, DONE);
5315     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5316     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5317     trampoline_call(stub);
5318     b(DONE);
5319 
5320     bind(EARLY_OUT);
5321     // (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2)
5322     // so, if a2 == null => return false (0), else return true; so we can just return a2
5323     mov(result, a2);
5324     b(DONE);
5325     bind(SHORT);
5326     cmp(cnt2, cnt1);
5327     br(NE, DONE);
5328     cbz(cnt1, SAME);
5329     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5330     ldr(tmp3, Address(a1, base_offset));
5331     ldr(tmp4, Address(a2, base_offset));
5332     bind(LAST_CHECK);
5333     eor(tmp4, tmp3, tmp4);
5334     lslv(tmp5, tmp4, tmp5);
5335     cmp(tmp5, zr);
5336     bind(CSET_EQ);
5337     cset(result, EQ);
5338     b(DONE);
5339   }
5340 
5341   bind(SAME);
5342   mov(result, true);
5343   // That's it.
5344   bind(DONE);
5345 
5346   BLOCK_COMMENT("} array_equals");
5347 }
5348 
5349 // Compare Strings
5350 
5351 // For Strings we're passed the address of the first characters in a1
5352 // and a2 and the length in cnt1.
5353 // elem_size is the element size in bytes: either 1 or 2.
5354 // There are two implementations.  For arrays >= 8 bytes, all
5355 // comparisons (including the final one, which may overlap) are
5356 // performed 8 bytes at a time.  For strings < 8 bytes, we compare a
5357 // word, then a halfword, and then a byte.
5358 
5359 void MacroAssembler::string_equals(Register a1, Register a2,
5360                                    Register result, Register cnt1, int elem_size)
5361 {
5362   Label SAME, DONE, SHORT, NEXT_WORD;
5363   Register tmp1 = rscratch1;
5364   Register tmp2 = rscratch2;
5365   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5366 
5367   assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
5368   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5369 
5370 #ifndef PRODUCT
5371   {
5372     const char kind = (elem_size == 2) ? 'U' : 'L';
5373     char comment[64];
5374     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5375     BLOCK_COMMENT(comment);
5376   }
5377 #endif
5378 
5379   mov(result, false);
5380 
5381   // Check for short strings, i.e. smaller than wordSize.
5382   subs(cnt1, cnt1, wordSize);
5383   br(Assembler::LT, SHORT);
5384   // Main 8 byte comparison loop.
5385   bind(NEXT_WORD); {
5386     ldr(tmp1, Address(post(a1, wordSize)));
5387     ldr(tmp2, Address(post(a2, wordSize)));
5388     subs(cnt1, cnt1, wordSize);
5389     eor(tmp1, tmp1, tmp2);
5390     cbnz(tmp1, DONE);
5391   } br(GT, NEXT_WORD);
5392   // Last longword.  In the case where length == 4 we compare the
5393   // same longword twice, but that's still faster than another
5394   // conditional branch.
5395   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5396   // length == 4.
5397   ldr(tmp1, Address(a1, cnt1));
5398   ldr(tmp2, Address(a2, cnt1));
5399   eor(tmp2, tmp1, tmp2);
5400   cbnz(tmp2, DONE);
5401   b(SAME);
5402 
5403   bind(SHORT);
5404   Label TAIL03, TAIL01;
5405 
5406   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5407   {
5408     ldrw(tmp1, Address(post(a1, 4)));
5409     ldrw(tmp2, Address(post(a2, 4)));
5410     eorw(tmp1, tmp1, tmp2);
5411     cbnzw(tmp1, DONE);
5412   }
5413   bind(TAIL03);
5414   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5415   {
5416     ldrh(tmp1, Address(post(a1, 2)));
5417     ldrh(tmp2, Address(post(a2, 2)));
5418     eorw(tmp1, tmp1, tmp2);
5419     cbnzw(tmp1, DONE);
5420   }
5421   bind(TAIL01);
5422   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5423     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5424     {
5425       ldrb(tmp1, a1);
5426       ldrb(tmp2, a2);
5427       eorw(tmp1, tmp1, tmp2);
5428       cbnzw(tmp1, DONE);
5429     }
5430   }
5431   // Arrays are equal.
5432   bind(SAME);
5433   mov(result, true);
5434 
5435   // That's it.
5436   bind(DONE);
5437   BLOCK_COMMENT("} string_equals");
5438 }
5439 
5440 
5441 // The size of the blocks erased by the zero_blocks stub.  We must
5442 // handle anything smaller than this ourselves in zero_words().
5443 const int MacroAssembler::zero_words_block_size = 8;
5444 
5445 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5446 // possible, handling small word counts locally and delegating
5447 // anything larger to the zero_blocks stub.  It is expanded many times
5448 // in compiled code, so it is important to keep it short.
5449 
5450 // ptr:   Address of a buffer to be zeroed.
5451 // cnt:   Count in HeapWords.
5452 //
5453 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5454 void MacroAssembler::zero_words(Register ptr, Register cnt)
5455 {
5456   assert(is_power_of_2(zero_words_block_size), "adjust this");
5457   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5458 
5459   BLOCK_COMMENT("zero_words {");
5460   cmp(cnt, (u1)zero_words_block_size);
5461   Label around;
5462   br(LO, around);
5463   {
5464     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5465     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5466     if (StubRoutines::aarch64::complete()) {
5467       trampoline_call(zero_blocks);
5468     } else {
5469       bl(zero_blocks);
5470     }
5471   }
5472   bind(around);
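       // Zero the remaining 0..7 words by binary decomposition of cnt:
       // the tbz on bit 2 and then bit 1 selects blocks of 4 and 2
       // word stores, and the final tbz on bit 0 stores the odd word,
       // so exactly cnt & 7 words get zeroed.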
5473   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5474     Label l;
5475     tbz(cnt, exact_log2(i), l);
5476     for (int j = 0; j < i; j += 2) {
5477       stp(zr, zr, post(ptr, 16));
5478     }
5479     bind(l);
5480   }
5481   {
5482     Label l;
5483     tbz(cnt, 0, l);
5484     str(zr, Address(ptr));
5485     bind(l);
5486   }
5487   BLOCK_COMMENT("} zero_words");
5488 }
5489 
5490 // base:         Address of a buffer to be zeroed, 8-byte aligned.
5491 // cnt:          Immediate count in HeapWords.
5492 #define SmallArraySize (18 * BytesPerLong)
5493 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
5494 {
5495   BLOCK_COMMENT("zero_words {");
5496   int i = cnt & 1;  // store any odd word to start
5497   if (i) str(zr, Address(base));
5498 
5499   if (cnt <= SmallArraySize / BytesPerLong) {
5500     for (; i < (int)cnt; i += 2)
5501       stp(zr, zr, Address(base, i * wordSize));
5502   } else {
5503     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
5504     int remainder = cnt % (2 * unroll);
5505     for (; i < remainder; i += 2)
5506       stp(zr, zr, Address(base, i * wordSize));
5507 
5508     Label loop;
5509     Register cnt_reg = rscratch1;
5510     Register loop_base = rscratch2;
5511     cnt = cnt - remainder;
5512     mov(cnt_reg, cnt);
5513     // adjust base and prebias by -2 * wordSize so we can pre-increment
5514     add(loop_base, base, (remainder - 2) * wordSize);
5515     bind(loop);
5516     sub(cnt_reg, cnt_reg, 2 * unroll);
5517     for (i = 1; i < unroll; i++)
5518       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
5519     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
5520     cbnz(cnt_reg, loop);
5521   }
5522   BLOCK_COMMENT("} zero_words");
5523 }
5524 
5525 // Zero blocks of memory by using DC ZVA.
5526 //
5527 // Aligns the base address first sufficiently for DC ZVA, then uses
5528 // DC ZVA repeatedly for every full block.  cnt is the size to be
5529 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5530 // in cnt.
5531 //
5532 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5533 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
5534 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5535   Register tmp = rscratch1;
5536   Register tmp2 = rscratch2;
5537   int zva_length = VM_Version::zva_length();
5538   Label initial_table_end, loop_zva;
5539   Label fini;
5540 
5541   // Base must be 16-byte aligned. If not, just return and let the caller handle it.
5542   tst(base, 0x0f);
5543   br(Assembler::NE, fini);
5544   // Align base with ZVA length.
5545   neg(tmp, base);
5546   andr(tmp, tmp, zva_length - 1);
5547 
5548   // tmp: the number of bytes to be filled to align the base with ZVA length.
5549   add(base, base, tmp);
5550   sub(cnt, cnt, tmp, Assembler::ASR, 3);
5551   adr(tmp2, initial_table_end);
5552   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5553   br(tmp2);
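       // Computed branch into the stp table below: each stp zeroes 16
       // bytes in 4 bytes of code, so backing up from initial_table_end
       // by (tmp / 16) * 4 == tmp >> 2 bytes executes exactly the
       // stores needed to fill the tmp alignment bytes.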
5554 
5555   for (int i = -zva_length + 16; i < 0; i += 16)
5556     stp(zr, zr, Address(base, i));
5557   bind(initial_table_end);
5558 
5559   sub(cnt, cnt, zva_length >> 3);
5560   bind(loop_zva);
5561   dc(Assembler::ZVA, base);
5562   subs(cnt, cnt, zva_length >> 3);
5563   add(base, base, zva_length);
5564   br(Assembler::GE, loop_zva);
5565   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5566   bind(fini);
5567 }
5568 
5569 // base:   Address of a buffer to be filled, 8-byte aligned.
5570 // cnt:    Count in 8-byte units.
5571 // value:  Value to fill the buffer with.
5572 // base will point to the end of the buffer after filling.
5573 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5574 {
5575 //  Algorithm:
5576 //
5577 //    scratch1 = cnt & 7;
5578 //    cnt -= scratch1;
5579 //    p += scratch1;
5580 //    switch (scratch1) {
5581 //      do {
5582 //        cnt -= 8;
5583 //          p[-8] = v;
5584 //        case 7:
5585 //          p[-7] = v;
5586 //        case 6:
5587 //          p[-6] = v;
5588 //          // ...
5589 //        case 1:
5590 //          p[-1] = v;
5591 //        case 0:
5592 //          p += 8;
5593 //      } while (cnt);
5594 //    }
5595 
5596   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5597 
5598   Label fini, skip, entry, loop;
5599   const int unroll = 8; // Number of stp instructions we'll unroll
5600 
5601   cbz(cnt, fini);
5602   tbz(base, 3, skip);
5603   str(value, Address(post(base, 8)));
5604   sub(cnt, cnt, 1);
5605   bind(skip);
5606 
5607   andr(rscratch1, cnt, (unroll-1) * 2);
5608   sub(cnt, cnt, rscratch1);
5609   add(base, base, rscratch1, Assembler::LSL, 3);
5610   adr(rscratch2, entry);
5611   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5612   br(rscratch2);
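       // Computed branch into the stp table below (a Duff's-device
       // entry): each stp stores 2 words in 4 bytes of code, so backing
       // up from 'entry' by rscratch1 * 2 bytes executes exactly the
       // rscratch1 remainder words' worth of stores on the first pass.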
5613 
5614   bind(loop);
5615   add(base, base, unroll * 16);
5616   for (int i = -unroll; i < 0; i++)
5617     stp(value, value, Address(base, i * 16));
5618   bind(entry);
5619   subs(cnt, cnt, unroll * 2);
5620   br(Assembler::GE, loop);
5621 
5622   tbz(cnt, 0, fini);
5623   str(value, Address(post(base, 8)));
5624   bind(fini);
5625 }
5626 
5627 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5628 // java/lang/StringUTF16.compress.
5629 void MacroAssembler::encode_iso_array(Register src, Register dst,
5630                       Register len, Register result,
5631                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5632                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5633 {
5634     Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
5635         NEXT_32_START, NEXT_32_PRFM_START;
5636     Register tmp1 = rscratch1, tmp2 = rscratch2;
5637 
5638       mov(result, len); // Save initial len
5639 
5640 #ifndef BUILTIN_SIM
5641       cmp(len, (u1)8); // handle shortest strings first
5642       br(LT, LOOP_1);
5643       cmp(len, (u1)32);
5644       br(LT, NEXT_8);
5645       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
5646       // to convert chars to bytes
5647       if (SoftwarePrefetchHintDistance >= 0) {
5648         ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5649         subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5650         br(LE, NEXT_32_START);
5651         b(NEXT_32_PRFM_START);
5652         BIND(NEXT_32_PRFM);
5653           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5654         BIND(NEXT_32_PRFM_START);
5655           prfm(Address(src, SoftwarePrefetchHintDistance));
5656           orr(v4, T16B, Vtmp1, Vtmp2);
5657           orr(v5, T16B, Vtmp3, Vtmp4);
5658           uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
5659           uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
5660           stpq(Vtmp1, Vtmp3, dst);
5661           uzp2(v5, T16B, v4, v5); // high bytes
5662           umov(tmp2, v5, D, 1);
5663           fmovd(tmp1, v5);
5664           orr(tmp1, tmp1, tmp2);
5665           cbnz(tmp1, LOOP_8);
5666           sub(len, len, 32);
5667           add(dst, dst, 32);
5668           add(src, src, 64);
5669           subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5670           br(GE, NEXT_32_PRFM);
5671           cmp(len, (u1)32);
5672           br(LT, LOOP_8);
5673         BIND(NEXT_32);
5674           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5675         BIND(NEXT_32_START);
5676       } else {
5677         BIND(NEXT_32);
5678           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5679       }
5680       prfm(Address(src, SoftwarePrefetchHintDistance));
5681       uzp1(v4, T16B, Vtmp1, Vtmp2);
5682       uzp1(v5, T16B, Vtmp3, Vtmp4);
5683       stpq(v4, v5, dst);
5684       orr(Vtmp1, T16B, Vtmp1, Vtmp2);
5685       orr(Vtmp3, T16B, Vtmp3, Vtmp4);
5686       uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
5687       umov(tmp2, Vtmp1, D, 1);
5688       fmovd(tmp1, Vtmp1);
5689       orr(tmp1, tmp1, tmp2);
5690       cbnz(tmp1, LOOP_8);
5691       sub(len, len, 32);
5692       add(dst, dst, 32);
5693       add(src, src, 64);
5694       cmp(len, (u1)32);
5695       br(GE, NEXT_32);
5696       cbz(len, DONE);
5697 
5698     BIND(LOOP_8);
5699       cmp(len, (u1)8);
5700       br(LT, LOOP_1);
5701     BIND(NEXT_8);
5702       ld1(Vtmp1, T8H, src);
5703       uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
5704       uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
5705       strd(Vtmp2, dst);
5706       fmovd(tmp1, Vtmp3);
5707       cbnz(tmp1, NEXT_1);
5708 
5709       sub(len, len, 8);
5710       add(dst, dst, 8);
5711       add(src, src, 16);
5712       cmp(len, (u1)8);
5713       br(GE, NEXT_8);
5714 
5715     BIND(LOOP_1);
5716 #endif
5717     cbz(len, DONE);
5718     BIND(NEXT_1);
5719       ldrh(tmp1, Address(post(src, 2)));
5720       strb(tmp1, Address(post(dst, 1)));
5721       tst(tmp1, 0xff00);
5722       br(NE, SET_RESULT);
5723       subs(len, len, 1);
5724       br(GT, NEXT_1);
5725 
5726     BIND(SET_RESULT);
5727       sub(result, result, len); // Return index where we stopped
5728                                 // Return len == 0 if we processed all
5729                                 // characters
5730     BIND(DONE);
5731 }
5732 
5733 
5734 // Inflate byte[] array to char[].
5735 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5736                                         FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
5737                                         Register tmp4) {
5738   Label big, done, after_init, to_stub;
5739 
5740   assert_different_registers(src, dst, len, tmp4, rscratch1);
5741 
5742   fmovd(vtmp1, zr);
5743   lsrw(tmp4, len, 3);
5744   bind(after_init);
5745   cbnzw(tmp4, big);
5746   // Short string: less than 8 bytes.
5747   {
5748     Label loop, tiny;
5749 
5750     cmpw(len, 4);
5751     br(LT, tiny);
5752     // Use SIMD to do 4 bytes.
5753     ldrs(vtmp2, post(src, 4));
5754     zip1(vtmp3, T8B, vtmp2, vtmp1);
5755     subw(len, len, 4);
5756     strd(vtmp3, post(dst, 8));
5757 
5758     cbzw(len, done);
5759 
5760     // Do the remaining bytes by steam.
5761     bind(loop);
5762     ldrb(tmp4, post(src, 1));
5763     strh(tmp4, post(dst, 2));
5764     subw(len, len, 1);
5765 
5766     bind(tiny);
5767     cbnz(len, loop);
5768 
5769     b(done);
5770   }
5771 
5772   if (SoftwarePrefetchHintDistance >= 0) {
5773     bind(to_stub);
5774       RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5775       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5776       trampoline_call(stub);
5777       b(after_init);
5778   }
5779 
5780   // Unpack the bytes 8 at a time.
5781   bind(big);
5782   {
5783     Label loop, around, loop_last, loop_start;
5784 
5785     if (SoftwarePrefetchHintDistance >= 0) {
5786       const int large_loop_threshold = (64 + 16)/8;
5787       ldrd(vtmp2, post(src, 8));
5788       andw(len, len, 7);
5789       cmp(tmp4, (u1)large_loop_threshold);
5790       br(GE, to_stub);
5791       b(loop_start);
5792 
5793       bind(loop);
5794       ldrd(vtmp2, post(src, 8));
5795       bind(loop_start);
5796       subs(tmp4, tmp4, 1);
5797       br(EQ, loop_last);
5798       zip1(vtmp2, T16B, vtmp2, vtmp1);
5799       ldrd(vtmp3, post(src, 8));
5800       st1(vtmp2, T8H, post(dst, 16));
5801       subs(tmp4, tmp4, 1);
5802       zip1(vtmp3, T16B, vtmp3, vtmp1);
5803       st1(vtmp3, T8H, post(dst, 16));
5804       br(NE, loop);
5805       b(around);
5806       bind(loop_last);
5807       zip1(vtmp2, T16B, vtmp2, vtmp1);
5808       st1(vtmp2, T8H, post(dst, 16));
5809       bind(around);
5810       cbz(len, done);
5811     } else {
5812       andw(len, len, 7);
5813       bind(loop);
5814       ldrd(vtmp2, post(src, 8));
5815       sub(tmp4, tmp4, 1);
5816       zip1(vtmp3, T16B, vtmp2, vtmp1);
5817       st1(vtmp3, T8H, post(dst, 16));
5818       cbnz(tmp4, loop);
5819     }
5820   }
5821 
5822   // Do the tail of up to 8 bytes.
5823   add(src, src, len);
5824   ldrd(vtmp3, Address(src, -8));
5825   add(dst, dst, len, ext::uxtw, 1);
5826   zip1(vtmp3, T16B, vtmp3, vtmp1);
5827   strq(vtmp3, Address(dst, -16));
5828 
5829   bind(done);
5830 }
5831 
5832 // Compress char[] array to byte[].
5833 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5834                                          FloatRegister tmp1Reg, FloatRegister tmp2Reg,
5835                                          FloatRegister tmp3Reg, FloatRegister tmp4Reg,
5836                                          Register result) {
5837   encode_iso_array(src, dst, len, result,
5838                    tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
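       // encode_iso_array() leaves the number of unprocessed characters
       // in len and the stop index in result; a non-zero len means a
       // char above 0xFF was hit, in which case compress must return 0.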
5839   cmp(len, zr);
5840   csel(result, result, zr, EQ);
5841 }
5842 
5843 // get_thread() can be called anywhere inside generated code, so we
5844 // need to save whatever non-callee save context might get clobbered
5845 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5846 // the call setup code.
5847 //
5848 // aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5849 //
5850 void MacroAssembler::get_thread(Register dst) {
5851   RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
5852   push(saved_regs, sp);
5853 
5854   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5855   blrt(lr, 1, 0, 1);
5856   if (dst != c_rarg0) {
5857     mov(dst, c_rarg0);
5858   }
5859 
5860   pop(saved_regs, sp);
5861 }