/*
 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/thread.hpp"
#ifdef COMPILER1
#include "c1/c1_LIRAssembler.hpp"
#endif
#ifdef COMPILER2
#include "oops/oop.hpp"
#include "opto/compile.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/node.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target - branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
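      // Illustrative example (addresses hypothetical): with the adrp at
      // 0x40001000 and a type 1/2 target of 0x40003008, pc_page is
      // 0x40001, adr_page is 0x40003, so the adrp immediate is patched
      // with a page delta of +2 while the paired ldr/str or add carries
      // offset_lo == 0x008.
      //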
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                    21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                   Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
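    // Illustrative example: for a 48-bit target 0x7fff12345678 the patched
    // sequence becomes movz dst, #0x5678; movk dst, #0x1234, lsl #16;
    // movk dst, #0x7fff, lsl #32.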
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
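  // Illustrative example: a narrow OOP of 0x12345678 patches to
  // movz dst, #0x1234, lsl #16; movk dst, #0x5678, while a wide OOP
  // is spread over movz/movk/movk in 16-bit chunks, low bits first.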
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = CompressedOops::encode((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
  // We encode narrow ones by setting the upper 16 bits in the first
  // instruction.
  NativeInstruction *insn = nativeInstruction_at(insn_addr);
  assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
         nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
  Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
  return 2 * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
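      // Illustrative example (addresses hypothetical): an adrp at
      // 0x40001000 whose immediate decodes to +2 pages gives
      // target_page == 0x40003000; a paired "add Rx, Rx, #8" then
      // returns 0x40003008.
      //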
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
               Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  dsb(Assembler::SY);
}

void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    unsigned long offset;
    adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
    ldrw(rscratch1, Address(rscratch1, offset));
    assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
    cbnz(rscratch1, slow_path);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
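// Concretely (an illustrative interleaving): without the acquire, the
// later load of SafepointSynchronize::_state could be satisfied first
// and observe a safepoint that has since completed, while the poll
// word, read only after the VM thread's wake-up, already looks clear.
//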
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldar(rscratch1, rscratch1);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp and sp of the last Java frame have to be
// recorded in the (thread-local) JavaThread object. When leaving C land,
// the last Java fp has to be reset to 0. This is required to allow proper
// stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  if (last_java_pc != NULL) {
    adr(scratch, last_java_pc);
  } else {
    // FIXME: This is almost never correct.  We should delete all
    // cases of set_last_Java_frame with last_java_pc=NULL and use the
    // correct return address instead.
    adr(scratch, pc());
  }

  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb (see the assert above).
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb (see the assert above).
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

void MacroAssembler::reserved_stack_check() {
    // testing if reserved zone needs to be enabled
    Label no_reserved_zone_enabling;

    ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
    cmp(sp, rscratch1);
    br(Assembler::LO, no_reserved_zone_enabling);

    enter();   // LR and FP are live.
    lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
    mov(c_rarg0, rthread);
    blr(rscratch1);
    leave();

    // We have already removed our own frame.
    // throw_delayed_StackOverflowError will think that it's been
    // called by our caller.
    lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
    br(rscratch1);
    should_not_reach_here();

    bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result, "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline. If the code cache is small enough
// that all branches are in range, trampolines won't be emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // We need a trampoline if branches are far.
  if (far_branches()) {
    bool in_scratch_emit_size = false;
#ifdef COMPILER2
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    in_scratch_emit_size =
      (task != NULL && is_c2_compile(task->comp_level()) &&
       Compile::current()->in_scratch_emit_size());
#endif
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(offset(), entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)
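//
// Illustratively, the stub emitted below is laid out as:
//   ldr  rscratch1, .+8    // load <destination> from the literal that follows
//   br   rscratch1
//   .quad <destination>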

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  // Max stub size: alignment nop, TrampolineStub.
  address stub = start_a_stub(NativeInstruction::instruction_size
                   + NativeCallTrampolineStub::instruction_size);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub_start_addr;
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  //       since C-style booleans are stored in one byte
  //       only! (was bug)
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

address MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


void MacroAssembler::notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler::notify(type);
    // reset_last_Java_frame(true);
  } else {
    Assembler::notify(type);
  }
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  if (return_method) {
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    subs(zr, super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer-sized words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// scans count 4-byte words at [addr] for occurrence of value,
// generic
1208 void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
1209                                 Register scratch) {
1210   Label Lloop, Lexit;
1211   cbz(count, Lexit);
1212   bind(Lloop);
1213   ldrw(scratch, post(addr, wordSize));
1214   cmpw(value, scratch);
1215   br(EQ, Lexit);
1216   sub(count, count, 1);
1217   cbnz(count, Lloop);
1218   bind(Lexit);
1219 }
1220 
1221 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1222                                                    Register super_klass,
1223                                                    Register temp_reg,
1224                                                    Register temp2_reg,
1225                                                    Label* L_success,
1226                                                    Label* L_failure,
1227                                                    bool set_cond_codes) {
1228   assert_different_registers(sub_klass, super_klass, temp_reg);
1229   if (temp2_reg != noreg)
1230     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1231 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1232 
1233   Label L_fallthrough;
1234   int label_nulls = 0;
1235   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1236   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1237   assert(label_nulls <= 1, "at most one NULL in the batch");
1238 
1239   // a couple of useful fields in sub_klass:
1240   int ss_offset = in_bytes(Klass::secondary_supers_offset());
1241   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1242   Address secondary_supers_addr(sub_klass, ss_offset);
1243   Address super_cache_addr(     sub_klass, sc_offset);
1244 
1245   BLOCK_COMMENT("check_klass_subtype_slow_path");
1246 
1247   // Do a linear scan of the secondary super-klass chain.
1248   // This code is rarely used, so simplicity is a virtue here.
1249   // The repne_scan instruction uses fixed registers, which we must spill.
1250   // Don't worry too much about pre-existing connections with the input regs.
1251 
1252   assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by the array-length load into r2
1254 
1255   RegSet pushed_registers;
1256   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1257   if (!IS_A_TEMP(r5))    pushed_registers += r5;
1258 
1259   if (super_klass != r0 || UseCompressedOops) {
1260     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1261   }
1262 
1263   push(pushed_registers, sp);
1264 
1265   // Get super_klass value into r0 (even if it was in r5 or r2).
1266   if (super_klass != r0) {
1267     mov(r0, super_klass);
1268   }
1269 
1270 #ifndef PRODUCT
1271   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1272   Address pst_counter_addr(rscratch2);
1273   ldr(rscratch1, pst_counter_addr);
1274   add(rscratch1, rscratch1, 1);
1275   str(rscratch1, pst_counter_addr);
1276 #endif //PRODUCT
1277 
1278   // We will consult the secondary-super array.
1279   ldr(r5, secondary_supers_addr);
1280   // Load the array length.
1281   ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1282   // Skip to start of data.
1283   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1284 
1285   cmp(sp, zr); // Clear Z flag; SP is never zero
1286   // Scan R2 words at [R5] for an occurrence of R0.
1287   // Set NZ/Z based on last compare.
1288   repne_scan(r5, r0, r2, rscratch1);
1289 
1290   // Unspill the temp. registers:
1291   pop(pushed_registers, sp);
1292 
1293   br(Assembler::NE, *L_failure);
1294 
1295   // Success.  Cache the super we found and proceed in triumph.
1296   str(super_klass, super_cache_addr);
1297 
1298   if (L_success != &L_fallthrough) {
1299     b(*L_success);
1300   }
1301 
1302 #undef IS_A_TEMP
1303 
1304   bind(L_fallthrough);
1305 }
1306 
1307 
1308 void MacroAssembler::verify_oop(Register reg, const char* s) {
1309   if (!VerifyOops) return;
1310 
1311   // Pass register number to verify_oop_subroutine
1312   const char* b = NULL;
1313   {
1314     ResourceMark rm;
1315     stringStream ss;
1316     ss.print("verify_oop: %s: %s", reg->name(), s);
1317     b = code_string(ss.as_string());
1318   }
1319   BLOCK_COMMENT("verify_oop {");
1320 
1321   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1322   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1323 
1324   mov(r0, reg);
1325   mov(rscratch1, (address)b);
1326 
1327   // call indirectly to solve generation ordering problem
1328   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1329   ldr(rscratch2, Address(rscratch2));
1330   blr(rscratch2);
1331 
1332   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1333   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1334 
1335   BLOCK_COMMENT("} verify_oop");
1336 }
1337 
1338 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1339   if (!VerifyOops) return;
1340 
1341   const char* b = NULL;
1342   {
1343     ResourceMark rm;
1344     stringStream ss;
1345     ss.print("verify_oop_addr: %s", s);
1346     b = code_string(ss.as_string());
1347   }
1348   BLOCK_COMMENT("verify_oop_addr {");
1349 
1350   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1351   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1352 
1353   // addr may contain sp so we will have to adjust it based on the
1354   // pushes that we just did.
1355   if (addr.uses(sp)) {
1356     lea(r0, addr);
1357     ldr(r0, Address(r0, 4 * wordSize));
1358   } else {
1359     ldr(r0, addr);
1360   }
1361   mov(rscratch1, (address)b);
1362 
1363   // call indirectly to solve generation ordering problem
1364   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1365   ldr(rscratch2, Address(rscratch2));
1366   blr(rscratch2);
1367 
1368   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1369   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1370 
1371   BLOCK_COMMENT("} verify_oop_addr");
1372 }
1373 
1374 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1375                                          int extra_slot_offset) {
1376   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1377   int stackElementSize = Interpreter::stackElementSize;
1378   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1379 #ifdef ASSERT
1380   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1381   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1382 #endif
1383   if (arg_slot.is_constant()) {
1384     return Address(esp, arg_slot.as_constant() * stackElementSize
1385                    + offset);
1386   } else {
1387     add(rscratch1, esp, arg_slot.as_register(),
1388         ext::uxtx, exact_log2(stackElementSize));
1389     return Address(rscratch1, offset);
1390   }
1391 }
1392 
1393 void MacroAssembler::call_VM_leaf_base(address entry_point,
1394                                        int number_of_arguments,
1395                                        Label *retaddr) {
1396   call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
1397 }
1398 
1399 void MacroAssembler::call_VM_leaf_base1(address entry_point,
1400                                         int number_of_gp_arguments,
1401                                         int number_of_fp_arguments,
1402                                         ret_type type,
1403                                         Label *retaddr) {
1404   Label E, L;
1405 
1406   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1407 
  // We add 1 to number_of_gp_arguments because the thread in arg0 is
  // not counted
1410   mov(rscratch1, entry_point);
1411   blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1412   if (retaddr)
1413     bind(*retaddr);
1414 
1415   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1416   maybe_isb();
1417 }
1418 
1419 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1420   call_VM_leaf_base(entry_point, number_of_arguments);
1421 }
1422 
1423 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1424   pass_arg0(this, arg_0);
1425   call_VM_leaf_base(entry_point, 1);
1426 }
1427 
1428 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1429   pass_arg0(this, arg_0);
1430   pass_arg1(this, arg_1);
1431   call_VM_leaf_base(entry_point, 2);
1432 }
1433 
1434 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1435                                   Register arg_1, Register arg_2) {
1436   pass_arg0(this, arg_0);
1437   pass_arg1(this, arg_1);
1438   pass_arg2(this, arg_2);
1439   call_VM_leaf_base(entry_point, 3);
1440 }
1441 
1442 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1443   pass_arg0(this, arg_0);
1444   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1445 }
1446 
1447 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1448 
1449   assert(arg_0 != c_rarg1, "smashed arg");
1450   pass_arg1(this, arg_1);
1451   pass_arg0(this, arg_0);
1452   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1453 }
1454 
1455 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1456   assert(arg_0 != c_rarg2, "smashed arg");
1457   assert(arg_1 != c_rarg2, "smashed arg");
1458   pass_arg2(this, arg_2);
1459   assert(arg_0 != c_rarg1, "smashed arg");
1460   pass_arg1(this, arg_1);
1461   pass_arg0(this, arg_0);
1462   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1463 }
1464 
1465 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1466   assert(arg_0 != c_rarg3, "smashed arg");
1467   assert(arg_1 != c_rarg3, "smashed arg");
1468   assert(arg_2 != c_rarg3, "smashed arg");
1469   pass_arg3(this, arg_3);
1470   assert(arg_0 != c_rarg2, "smashed arg");
1471   assert(arg_1 != c_rarg2, "smashed arg");
1472   pass_arg2(this, arg_2);
1473   assert(arg_0 != c_rarg1, "smashed arg");
1474   pass_arg1(this, arg_1);
1475   pass_arg0(this, arg_0);
1476   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1477 }
1478 
1479 void MacroAssembler::null_check(Register reg, int offset) {
1480   if (needs_explicit_null_check(offset)) {
1481     // provoke OS NULL exception if reg = NULL by
1482     // accessing M[reg] w/o changing any registers
1483     // NOTE: this is plenty to provoke a segv
1484     ldr(zr, Address(reg));
1485   } else {
1486     // nothing to do, (later) access of M[reg + offset]
1487     // will provoke OS NULL exception if reg = NULL
1488   }
1489 }
1490 
1491 // MacroAssembler protected routines needed to implement
1492 // public methods
1493 
1494 void MacroAssembler::mov(Register r, Address dest) {
1495   code_section()->relocate(pc(), dest.rspec());
1496   u_int64_t imm64 = (u_int64_t)dest.target();
1497   movptr(r, imm64);
1498 }
1499 
1500 // Move a constant pointer into r.  In AArch64 mode the virtual
1501 // address space is 48 bits in size, so we only need three
1502 // instructions to create a patchable instruction sequence that can
1503 // reach anywhere.
1504 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1505 #ifndef PRODUCT
1506   {
1507     char buffer[64];
1508     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1509     block_comment(buffer);
1510   }
1511 #endif
1512   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1513   movz(r, imm64 & 0xffff);
1514   imm64 >>= 16;
1515   movk(r, imm64 & 0xffff, 16);
1516   imm64 >>= 16;
1517   movk(r, imm64 & 0xffff, 32);
1518 }
1519 
1520 // Macro to mov replicated immediate to vector register.
1521 //  Vd will get the following values for different arrangements in T
1522 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1523 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1524 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1525 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1526 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1527 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1528 //   T1D/T2D: invalid
1529 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1530   assert(T != T1D && T != T2D, "invalid arrangement");
1531   if (T == T8B || T == T16B) {
1532     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1533     movi(Vd, T, imm32 & 0xff, 0);
1534     return;
1535   }
1536   u_int32_t nimm32 = ~imm32;
1537   if (T == T4H || T == T8H) {
1538     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1539     imm32 &= 0xffff;
1540     nimm32 &= 0xffff;
1541   }
1542   u_int32_t x = imm32;
1543   int movi_cnt = 0;
1544   int movn_cnt = 0;
1545   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1546   x = nimm32;
1547   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1548   if (movn_cnt < movi_cnt) imm32 = nimm32;
1549   unsigned lsl = 0;
1550   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1551   if (movn_cnt < movi_cnt)
1552     mvni(Vd, T, imm32 & 0xff, lsl);
1553   else
1554     movi(Vd, T, imm32 & 0xff, lsl);
1555   imm32 >>= 8; lsl += 8;
1556   while (imm32) {
1557     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1558     if (movn_cnt < movi_cnt)
1559       bici(Vd, T, imm32 & 0xff, lsl);
1560     else
1561       orri(Vd, T, imm32 & 0xff, lsl);
1562     lsl += 8; imm32 >>= 8;
1563   }
1564 }
1565 
1566 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1567 {
1568 #ifndef PRODUCT
1569   {
1570     char buffer[64];
1571     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1572     block_comment(buffer);
1573   }
1574 #endif
1575   if (operand_valid_for_logical_immediate(false, imm64)) {
1576     orr(dst, zr, imm64);
1577   } else {
1578     // we can use a combination of MOVZ or MOVN with
1579     // MOVK to build up the constant
1580     u_int64_t imm_h[4];
1581     int zero_count = 0;
1582     int neg_count = 0;
1583     int i;
1584     for (i = 0; i < 4; i++) {
1585       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1586       if (imm_h[i] == 0) {
1587         zero_count++;
1588       } else if (imm_h[i] == 0xffffL) {
1589         neg_count++;
1590       }
1591     }
1592     if (zero_count == 4) {
1593       // one MOVZ will do
1594       movz(dst, 0);
1595     } else if (neg_count == 4) {
1596       // one MOVN will do
1597       movn(dst, 0);
1598     } else if (zero_count == 3) {
1599       for (i = 0; i < 4; i++) {
1600         if (imm_h[i] != 0L) {
1601           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1602           break;
1603         }
1604       }
1605     } else if (neg_count == 3) {
1606       // one MOVN will do
1607       for (int i = 0; i < 4; i++) {
1608         if (imm_h[i] != 0xffffL) {
1609           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1610           break;
1611         }
1612       }
1613     } else if (zero_count == 2) {
1614       // one MOVZ and one MOVK will do
1615       for (i = 0; i < 3; i++) {
1616         if (imm_h[i] != 0L) {
1617           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1618           i++;
1619           break;
1620         }
1621       }
1622       for (;i < 4; i++) {
1623         if (imm_h[i] != 0L) {
1624           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1625         }
1626       }
1627     } else if (neg_count == 2) {
1628       // one MOVN and one MOVK will do
1629       for (i = 0; i < 4; i++) {
1630         if (imm_h[i] != 0xffffL) {
1631           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1632           i++;
1633           break;
1634         }
1635       }
1636       for (;i < 4; i++) {
1637         if (imm_h[i] != 0xffffL) {
1638           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1639         }
1640       }
1641     } else if (zero_count == 1) {
1642       // one MOVZ and two MOVKs will do
1643       for (i = 0; i < 4; i++) {
1644         if (imm_h[i] != 0L) {
1645           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1646           i++;
1647           break;
1648         }
1649       }
1650       for (;i < 4; i++) {
1651         if (imm_h[i] != 0x0L) {
1652           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1653         }
1654       }
1655     } else if (neg_count == 1) {
1656       // one MOVN and two MOVKs will do
1657       for (i = 0; i < 4; i++) {
1658         if (imm_h[i] != 0xffffL) {
1659           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1660           i++;
1661           break;
1662         }
1663       }
1664       for (;i < 4; i++) {
1665         if (imm_h[i] != 0xffffL) {
1666           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1667         }
1668       }
1669     } else {
1670       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1671       movz(dst, (u_int32_t)imm_h[0], 0);
1672       for (i = 1; i < 4; i++) {
1673         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1674       }
1675     }
1676   }
1677 }
1678 
1679 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1680 {
1681 #ifndef PRODUCT
1682     {
1683       char buffer[64];
1684       snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1685       block_comment(buffer);
1686     }
1687 #endif
1688   if (operand_valid_for_logical_immediate(true, imm32)) {
1689     orrw(dst, zr, imm32);
1690   } else {
1691     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1692     // constant
1693     u_int32_t imm_h[2];
1694     imm_h[0] = imm32 & 0xffff;
1695     imm_h[1] = ((imm32 >> 16) & 0xffff);
1696     if (imm_h[0] == 0) {
1697       movzw(dst, imm_h[1], 16);
1698     } else if (imm_h[0] == 0xffff) {
1699       movnw(dst, imm_h[1] ^ 0xffff, 16);
1700     } else if (imm_h[1] == 0) {
1701       movzw(dst, imm_h[0], 0);
1702     } else if (imm_h[1] == 0xffff) {
1703       movnw(dst, imm_h[0] ^ 0xffff, 0);
1704     } else {
1705       // use a MOVZ and MOVK (makes it easier to debug)
1706       movzw(dst, imm_h[0], 0);
1707       movkw(dst, imm_h[1], 16);
1708     }
1709   }
1710 }
1711 
1712 // Form an address from base + offset in Rd.  Rd may or may
1713 // not actually be used: you must use the Address that is returned.
1714 // It is up to you to ensure that the shift provided matches the size
1715 // of your data.
1716 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1717   if (Address::offset_ok_for_immed(byte_offset, shift))
1718     // It fits; no need for any heroics
1719     return Address(base, byte_offset);
1720 
1721   // Don't do anything clever with negative or misaligned offsets
1722   unsigned mask = (1 << shift) - 1;
1723   if (byte_offset < 0 || byte_offset & mask) {
1724     mov(Rd, byte_offset);
1725     add(Rd, base, Rd);
1726     return Address(Rd);
1727   }
1728 
1729   // See if we can do this with two 12-bit offsets
1730   {
1731     unsigned long word_offset = byte_offset >> shift;
1732     unsigned long masked_offset = word_offset & 0xfff000;
1733     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1734         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1735       add(Rd, base, masked_offset << shift);
1736       word_offset -= masked_offset;
1737       return Address(Rd, word_offset << shift);
1738     }
1739   }
1740 
1741   // Do it the hard way
1742   mov(Rd, byte_offset);
1743   add(Rd, base, Rd);
1744   return Address(Rd);
1745 }
1746 
1747 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1748   if (UseLSE) {
1749     mov(tmp, 1);
1750     ldadd(Assembler::word, tmp, zr, counter_addr);
1751     return;
1752   }
1753   Label retry_load;
1754   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1755     prfm(Address(counter_addr), PSTL1STRM);
1756   bind(retry_load);
1757   // flush and load exclusive from the memory location
1758   ldxrw(tmp, counter_addr);
1759   addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp will be zero
1761   stxrw(tmp2, tmp, counter_addr);
1762   cbnzw(tmp2, retry_load);
1763 }
1764 
1765 
1766 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1767                                     bool want_remainder, Register scratch)
1768 {
1769   // Full implementation of Java idiv and irem.  The function
1770   // returns the (pc) offset of the div instruction - may be needed
1771   // for implicit exceptions.
1772   //
1773   // constraint : ra/rb =/= scratch
1774   //         normal case
1775   //
1776   // input : ra: dividend
1777   //         rb: divisor
1778   //
1779   // result: either
1780   //         quotient  (= ra idiv rb)
1781   //         remainder (= ra irem rb)
1782 
1783   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1784 
1785   int idivl_offset = offset();
1786   if (! want_remainder) {
1787     sdivw(result, ra, rb);
1788   } else {
1789     sdivw(scratch, ra, rb);
1790     Assembler::msubw(result, scratch, rb, ra);
1791   }
1792 
1793   return idivl_offset;
1794 }
1795 
1796 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1797                                     bool want_remainder, Register scratch)
1798 {
1799   // Full implementation of Java ldiv and lrem.  The function
1800   // returns the (pc) offset of the div instruction - may be needed
1801   // for implicit exceptions.
1802   //
1803   // constraint : ra/rb =/= scratch
1804   //         normal case
1805   //
1806   // input : ra: dividend
1807   //         rb: divisor
1808   //
1809   // result: either
1810   //         quotient  (= ra idiv rb)
1811   //         remainder (= ra irem rb)
1812 
1813   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1814 
1815   int idivq_offset = offset();
1816   if (! want_remainder) {
1817     sdiv(result, ra, rb);
1818   } else {
1819     sdiv(scratch, ra, rb);
1820     Assembler::msub(result, scratch, rb, ra);
1821   }
1822 
1823   return idivq_offset;
1824 }
1825 
1826 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1827   address prev = pc() - NativeMembar::instruction_size;
1828   address last = code()->last_insn();
1829   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1830     NativeMembar *bar = NativeMembar_at(prev);
1831     // We are merging two memory barrier instructions.  On AArch64 we
1832     // can do this simply by ORing them together.
1833     bar->set_kind(bar->get_kind() | order_constraint);
1834     BLOCK_COMMENT("merged membar");
1835   } else {
1836     code()->set_last_insn(pc());
1837     dmb(Assembler::barrier(order_constraint));
1838   }
1839 }
1840 
1841 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1842   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1843     merge_ldst(rt, adr, size_in_bytes, is_store);
1844     code()->clear_last_insn();
1845     return true;
1846   } else {
    assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8-byte or 4-byte loads/stores are supported.");
1848     const unsigned mask = size_in_bytes - 1;
1849     if (adr.getMode() == Address::base_plus_offset &&
1850         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1851       code()->set_last_insn(pc());
1852     }
1853     return false;
1854   }
1855 }
1856 
1857 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1858   // We always try to merge two adjacent loads into one ldp.
1859   if (!try_merge_ldst(Rx, adr, 8, false)) {
1860     Assembler::ldr(Rx, adr);
1861   }
1862 }
1863 
1864 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1865   // We always try to merge two adjacent loads into one ldp.
1866   if (!try_merge_ldst(Rw, adr, 4, false)) {
1867     Assembler::ldrw(Rw, adr);
1868   }
1869 }
1870 
1871 void MacroAssembler::str(Register Rx, const Address &adr) {
1872   // We always try to merge two adjacent stores into one stp.
1873   if (!try_merge_ldst(Rx, adr, 8, true)) {
1874     Assembler::str(Rx, adr);
1875   }
1876 }
1877 
1878 void MacroAssembler::strw(Register Rw, const Address &adr) {
1879   // We always try to merge two adjacent stores into one stp.
1880   if (!try_merge_ldst(Rw, adr, 4, true)) {
1881     Assembler::strw(Rw, adr);
1882   }
1883 }
1884 
1885 // MacroAssembler routines found actually to be needed
1886 
1887 void MacroAssembler::push(Register src)
1888 {
1889   str(src, Address(pre(esp, -1 * wordSize)));
1890 }
1891 
1892 void MacroAssembler::pop(Register dst)
1893 {
1894   ldr(dst, Address(post(esp, 1 * wordSize)));
1895 }
1896 
1897 // Note: load_unsigned_short used to be called load_unsigned_word.
1898 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1899   int off = offset();
1900   ldrh(dst, src);
1901   return off;
1902 }
1903 
1904 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1905   int off = offset();
1906   ldrb(dst, src);
1907   return off;
1908 }
1909 
1910 int MacroAssembler::load_signed_short(Register dst, Address src) {
1911   int off = offset();
1912   ldrsh(dst, src);
1913   return off;
1914 }
1915 
1916 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1917   int off = offset();
1918   ldrsb(dst, src);
1919   return off;
1920 }
1921 
1922 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1923   int off = offset();
1924   ldrshw(dst, src);
1925   return off;
1926 }
1927 
1928 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1929   int off = offset();
1930   ldrsbw(dst, src);
1931   return off;
1932 }
1933 
1934 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1935   switch (size_in_bytes) {
1936   case  8:  ldr(dst, src); break;
1937   case  4:  ldrw(dst, src); break;
1938   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1939   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1940   default:  ShouldNotReachHere();
1941   }
1942 }
1943 
1944 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1945   switch (size_in_bytes) {
1946   case  8:  str(src, dst); break;
1947   case  4:  strw(src, dst); break;
1948   case  2:  strh(src, dst); break;
1949   case  1:  strb(src, dst); break;
1950   default:  ShouldNotReachHere();
1951   }
1952 }
1953 
1954 void MacroAssembler::decrementw(Register reg, int value)
1955 {
1956   if (value < 0)  { incrementw(reg, -value);      return; }
1957   if (value == 0) {                               return; }
1958   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1959   /* else */ {
1960     guarantee(reg != rscratch2, "invalid dst for register decrement");
1961     movw(rscratch2, (unsigned)value);
1962     subw(reg, reg, rscratch2);
1963   }
1964 }
1965 
1966 void MacroAssembler::decrement(Register reg, int value)
1967 {
1968   if (value < 0)  { increment(reg, -value);      return; }
1969   if (value == 0) {                              return; }
1970   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1971   /* else */ {
1972     assert(reg != rscratch2, "invalid dst for register decrement");
1973     mov(rscratch2, (unsigned long)value);
1974     sub(reg, reg, rscratch2);
1975   }
1976 }
1977 
1978 void MacroAssembler::decrementw(Address dst, int value)
1979 {
1980   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1981   if (dst.getMode() == Address::literal) {
1982     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1983     lea(rscratch2, dst);
1984     dst = Address(rscratch2);
1985   }
1986   ldrw(rscratch1, dst);
1987   decrementw(rscratch1, value);
1988   strw(rscratch1, dst);
1989 }
1990 
1991 void MacroAssembler::decrement(Address dst, int value)
1992 {
1993   assert(!dst.uses(rscratch1), "invalid address for decrement");
1994   if (dst.getMode() == Address::literal) {
1995     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1996     lea(rscratch2, dst);
1997     dst = Address(rscratch2);
1998   }
1999   ldr(rscratch1, dst);
2000   decrement(rscratch1, value);
2001   str(rscratch1, dst);
2002 }
2003 
2004 void MacroAssembler::incrementw(Register reg, int value)
2005 {
2006   if (value < 0)  { decrementw(reg, -value);      return; }
2007   if (value == 0) {                               return; }
2008   if (value < (1 << 12)) { addw(reg, reg, value); return; }
2009   /* else */ {
2010     assert(reg != rscratch2, "invalid dst for register increment");
2011     movw(rscratch2, (unsigned)value);
2012     addw(reg, reg, rscratch2);
2013   }
2014 }
2015 
2016 void MacroAssembler::increment(Register reg, int value)
2017 {
2018   if (value < 0)  { decrement(reg, -value);      return; }
2019   if (value == 0) {                              return; }
2020   if (value < (1 << 12)) { add(reg, reg, value); return; }
2021   /* else */ {
2022     assert(reg != rscratch2, "invalid dst for register increment");
2023     movw(rscratch2, (unsigned)value);
2024     add(reg, reg, rscratch2);
2025   }
2026 }
2027 
2028 void MacroAssembler::incrementw(Address dst, int value)
2029 {
2030   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2031   if (dst.getMode() == Address::literal) {
2032     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2033     lea(rscratch2, dst);
2034     dst = Address(rscratch2);
2035   }
2036   ldrw(rscratch1, dst);
2037   incrementw(rscratch1, value);
2038   strw(rscratch1, dst);
2039 }
2040 
2041 void MacroAssembler::increment(Address dst, int value)
2042 {
2043   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2044   if (dst.getMode() == Address::literal) {
2045     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2046     lea(rscratch2, dst);
2047     dst = Address(rscratch2);
2048   }
2049   ldr(rscratch1, dst);
2050   increment(rscratch1, value);
2051   str(rscratch1, dst);
2052 }
2053 
2054 
2055 void MacroAssembler::pusha() {
2056   push(0x7fffffff, sp);
2057 }
2058 
2059 void MacroAssembler::popa() {
2060   pop(0x7fffffff, sp);
2061 }
2062 
2063 // Push lots of registers in the bit set supplied.  Don't push sp.
2064 // Return the number of words pushed
2065 int MacroAssembler::push(unsigned int bitset, Register stack) {
2066   int words_pushed = 0;
2067 
2068   // Scan bitset to accumulate register pairs
2069   unsigned char regs[32];
2070   int count = 0;
2071   for (int reg = 0; reg <= 30; reg++) {
2072     if (1 & bitset)
2073       regs[count++] = reg;
2074     bitset >>= 1;
2075   }
2076   regs[count++] = zr->encoding_nocheck();
  count &= ~1;  // Only push an even number of regs
2078 
2079   if (count) {
2080     stp(as_Register(regs[0]), as_Register(regs[1]),
2081        Address(pre(stack, -count * wordSize)));
2082     words_pushed += 2;
2083   }
2084   for (int i = 2; i < count; i += 2) {
2085     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2086        Address(stack, i * wordSize));
2087     words_pushed += 2;
2088   }
2089 
2090   assert(words_pushed == count, "oops, pushed != count");
2091 
2092   return count;
2093 }
2094 
2095 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2096   int words_pushed = 0;
2097 
2098   // Scan bitset to accumulate register pairs
2099   unsigned char regs[32];
2100   int count = 0;
2101   for (int reg = 0; reg <= 30; reg++) {
2102     if (1 & bitset)
2103       regs[count++] = reg;
2104     bitset >>= 1;
2105   }
2106   regs[count++] = zr->encoding_nocheck();
2107   count &= ~1;
2108 
2109   for (int i = 2; i < count; i += 2) {
2110     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2111        Address(stack, i * wordSize));
2112     words_pushed += 2;
2113   }
2114   if (count) {
2115     ldp(as_Register(regs[0]), as_Register(regs[1]),
2116        Address(post(stack, count * wordSize)));
2117     words_pushed += 2;
2118   }
2119 
2120   assert(words_pushed == count, "oops, pushed != count");
2121 
2122   return count;
2123 }
2124 #ifdef ASSERT
2125 void MacroAssembler::verify_heapbase(const char* msg) {
2126 #if 0
2127   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2128   assert (Universe::heap() != NULL, "java heap should be initialized");
2129   if (CheckCompressedOops) {
2130     Label ok;
2131     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2132     cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2133     br(Assembler::EQ, ok);
2134     stop(msg);
2135     bind(ok);
2136     pop(1 << rscratch1->encoding(), sp);
2137   }
2138 #endif
2139 }
2140 #endif
2141 
2142 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2143   Label done, not_weak;
2144   cbz(value, done);           // Use NULL as-is.
2145 
2146   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
  tbz(value, 0, not_weak);    // Test for jweak tag.
2148 
2149   // Resolve jweak.
2150   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2151                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2152   verify_oop(value);
2153   b(done);
2154 
2155   bind(not_weak);
2156   // Resolve (untagged) jobject.
2157   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2158   verify_oop(value);
2159   bind(done);
2160 }
2161 
2162 void MacroAssembler::stop(const char* msg) {
2163   address ip = pc();
2164   pusha();
2165   mov(c_rarg0, (address)msg);
2166   mov(c_rarg1, (address)ip);
2167   mov(c_rarg2, sp);
2168   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2169   // call(c_rarg3);
2170   blrt(c_rarg3, 3, 0, 1);
2171   hlt(0);
2172 }
2173 
2174 void MacroAssembler::unimplemented(const char* what) {
2175   const char* buf = NULL;
2176   {
2177     ResourceMark rm;
2178     stringStream ss;
2179     ss.print("unimplemented: %s", what);
2180     buf = code_string(ss.as_string());
2181   }
2182   stop(buf);
2183 }
2184 
2185 // If a constant does not fit in an immediate field, generate some
2186 // number of MOV instructions and then perform the operation.
2187 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2188                                            add_sub_imm_insn insn1,
2189                                            add_sub_reg_insn insn2) {
2190   assert(Rd != zr, "Rd = zr and not setting flags?");
2191   if (operand_valid_for_add_sub_immediate((int)imm)) {
2192     (this->*insn1)(Rd, Rn, imm);
2193   } else {
2194     if (uabs(imm) < (1 << 24)) {
2195        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2196        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2197     } else {
2198        assert_different_registers(Rd, Rn);
2199        mov(Rd, (uint64_t)imm);
2200        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2201     }
2202   }
2203 }
2204 
// Separate version which sets the flags. Optimisations are more restricted
// because we must set the flags correctly.
2207 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2208                                            add_sub_imm_insn insn1,
2209                                            add_sub_reg_insn insn2) {
2210   if (operand_valid_for_add_sub_immediate((int)imm)) {
2211     (this->*insn1)(Rd, Rn, imm);
2212   } else {
2213     assert_different_registers(Rd, Rn);
2214     assert(Rd != zr, "overflow in immediate operand");
2215     mov(Rd, (uint64_t)imm);
2216     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2217   }
2218 }
2219 
2220 
2221 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2222   if (increment.is_register()) {
2223     add(Rd, Rn, increment.as_register());
2224   } else {
2225     add(Rd, Rn, increment.as_constant());
2226   }
2227 }
2228 
2229 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2230   if (increment.is_register()) {
2231     addw(Rd, Rn, increment.as_register());
2232   } else {
2233     addw(Rd, Rn, increment.as_constant());
2234   }
2235 }
2236 
2237 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2238   if (decrement.is_register()) {
2239     sub(Rd, Rn, decrement.as_register());
2240   } else {
2241     sub(Rd, Rn, decrement.as_constant());
2242   }
2243 }
2244 
2245 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2246   if (decrement.is_register()) {
2247     subw(Rd, Rn, decrement.as_register());
2248   } else {
2249     subw(Rd, Rn, decrement.as_constant());
2250   }
2251 }
2252 
2253 void MacroAssembler::reinit_heapbase()
2254 {
2255   if (UseCompressedOops) {
2256     if (Universe::is_fully_initialized()) {
2257       mov(rheapbase, Universe::narrow_ptrs_base());
2258     } else {
2259       lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2260       ldr(rheapbase, Address(rheapbase));
2261     }
2262   }
2263 }
2264 
2265 // this simulates the behaviour of the x86 cmpxchg instruction using a
2266 // load linked/store conditional pair. we use the acquire/release
2267 // versions of these instructions so that we flush pending writes as
2268 // per Java semantics.
2269 
// n.b. the x86 version assumes the old value to be compared against is
2271 // in rax and updates rax with the value located in memory if the
2272 // cmpxchg fails. we supply a register for the old value explicitly
2273 
2274 // the aarch64 load linked/store conditional instructions do not
2275 // accept an offset. so, unlike x86, we must provide a plain register
2276 // to identify the memory word to be compared/exchanged rather than a
2277 // register+offset Address.
2278 
2279 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2280                                 Label &succeed, Label *fail) {
2281   // oldv holds comparison value
2282   // newv holds value to write in exchange
2283   // addr identifies memory word to compare against/update
2284   if (UseLSE) {
2285     mov(tmp, oldv);
2286     casal(Assembler::xword, oldv, newv, addr);
2287     cmp(tmp, oldv);
2288     br(Assembler::EQ, succeed);
2289     membar(AnyAny);
2290   } else {
2291     Label retry_load, nope;
2292     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2293       prfm(Address(addr), PSTL1STRM);
2294     bind(retry_load);
2295     // flush and load exclusive from the memory location
2296     // and fail if it is not what we expect
2297     ldaxr(tmp, addr);
2298     cmp(tmp, oldv);
2299     br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
2301     stlxr(tmp, newv, addr);
2302     cbzw(tmp, succeed);
2303     // retry so we only ever return after a load fails to compare
2304     // ensures we don't return a stale value after a failed write.
2305     b(retry_load);
2306     // if the memory word differs we return it in oldv and signal a fail
2307     bind(nope);
2308     membar(AnyAny);
2309     mov(oldv, tmp);
2310   }
2311   if (fail)
2312     b(*fail);
2313 }
2314 
2315 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2316                                         Label &succeed, Label *fail) {
2317   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2318   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2319 }
2320 
2321 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2322                                 Label &succeed, Label *fail) {
2323   // oldv holds comparison value
2324   // newv holds value to write in exchange
2325   // addr identifies memory word to compare against/update
2326   // tmp returns 0/1 for success/failure
2327   if (UseLSE) {
2328     mov(tmp, oldv);
2329     casal(Assembler::word, oldv, newv, addr);
2330     cmp(tmp, oldv);
2331     br(Assembler::EQ, succeed);
2332     membar(AnyAny);
2333   } else {
2334     Label retry_load, nope;
2335     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2336       prfm(Address(addr), PSTL1STRM);
2337     bind(retry_load);
2338     // flush and load exclusive from the memory location
2339     // and fail if it is not what we expect
2340     ldaxrw(tmp, addr);
2341     cmp(tmp, oldv);
2342     br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
2344     stlxrw(tmp, newv, addr);
2345     cbzw(tmp, succeed);
2346     // retry so we only ever return after a load fails to compare
2347     // ensures we don't return a stale value after a failed write.
2348     b(retry_load);
2349     // if the memory word differs we return it in oldv and signal a fail
2350     bind(nope);
2351     membar(AnyAny);
2352     mov(oldv, tmp);
2353   }
2354   if (fail)
2355     b(*fail);
2356 }
2357 
2358 // A generic CAS; success or failure is in the EQ flag.  A weak CAS
2359 // doesn't retry and may fail spuriously.  If the oldval is wanted,
2360 // Pass a register for the result, otherwise pass noreg.
2361 
2362 // Clobbers rscratch1
2363 void MacroAssembler::cmpxchg(Register addr, Register expected,
2364                              Register new_val,
2365                              enum operand_size size,
2366                              bool acquire, bool release,
2367                              bool weak,
2368                              Register result) {
2369   if (result == noreg)  result = rscratch1;
2370   BLOCK_COMMENT("cmpxchg {");
2371   if (UseLSE) {
2372     mov(result, expected);
2373     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2374     compare_eq(result, expected, size);
2375   } else {
2376     Label retry_load, done;
2377     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2378       prfm(Address(addr), PSTL1STRM);
2379     bind(retry_load);
2380     load_exclusive(result, addr, size, acquire);
2381     compare_eq(result, expected, size);
2382     br(Assembler::NE, done);
2383     store_exclusive(rscratch1, new_val, addr, size, release);
2384     if (weak) {
2385       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2386     } else {
2387       cbnzw(rscratch1, retry_load);
2388     }
2389     bind(done);
2390   }
2391   BLOCK_COMMENT("} cmpxchg");
2392 }
2393 
2394 // A generic comparison. Only compares for equality, clobbers rscratch1.
2395 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2396   if (size == xword) {
2397     cmp(rm, rn);
2398   } else if (size == word) {
2399     cmpw(rm, rn);
2400   } else if (size == halfword) {
2401     eorw(rscratch1, rm, rn);
2402     ands(zr, rscratch1, 0xffff);
2403   } else if (size == byte) {
2404     eorw(rscratch1, rm, rn);
2405     ands(zr, rscratch1, 0xff);
2406   } else {
2407     ShouldNotReachHere();
2408   }
2409 }
2410 
2411 
2412 static bool different(Register a, RegisterOrConstant b, Register c) {
2413   if (b.is_constant())
2414     return a != c;
2415   else
2416     return a != b.as_register() && a != c && b.as_register() != c;
2417 }
2418 
2419 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2420 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2421   if (UseLSE) {                                                         \
2422     prev = prev->is_valid() ? prev : zr;                                \
2423     if (incr.is_register()) {                                           \
2424       AOP(sz, incr.as_register(), prev, addr);                          \
2425     } else {                                                            \
2426       mov(rscratch2, incr.as_constant());                               \
2427       AOP(sz, rscratch2, prev, addr);                                   \
2428     }                                                                   \
2429     return;                                                             \
2430   }                                                                     \
2431   Register result = rscratch2;                                          \
2432   if (prev->is_valid())                                                 \
2433     result = different(prev, incr, addr) ? prev : rscratch2;            \
2434                                                                         \
2435   Label retry_load;                                                     \
2436   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2437     prfm(Address(addr), PSTL1STRM);                                     \
2438   bind(retry_load);                                                     \
2439   LDXR(result, addr);                                                   \
2440   OP(rscratch1, result, incr);                                          \
2441   STXR(rscratch2, rscratch1, addr);                                     \
2442   cbnzw(rscratch2, retry_load);                                         \
2443   if (prev->is_valid() && prev != result) {                             \
2444     IOP(prev, rscratch1, incr);                                         \
2445   }                                                                     \
2446 }
2447 
2448 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2449 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2450 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2451 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2452 
2453 #undef ATOMIC_OP
2454 
2455 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2456 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2457   if (UseLSE) {                                                         \
2458     prev = prev->is_valid() ? prev : zr;                                \
2459     AOP(sz, newv, prev, addr);                                          \
2460     return;                                                             \
2461   }                                                                     \
2462   Register result = rscratch2;                                          \
2463   if (prev->is_valid())                                                 \
2464     result = different(prev, newv, addr) ? prev : rscratch2;            \
2465                                                                         \
2466   Label retry_load;                                                     \
2467   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2468     prfm(Address(addr), PSTL1STRM);                                     \
2469   bind(retry_load);                                                     \
2470   LDXR(result, addr);                                                   \
2471   STXR(rscratch1, newv, addr);                                          \
2472   cbnzw(rscratch1, retry_load);                                         \
2473   if (prev->is_valid() && prev != result)                               \
2474     mov(prev, result);                                                  \
2475 }
2476 
2477 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2478 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2479 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2480 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2481 
2482 #undef ATOMIC_XCHG
2483 
2484 #ifndef PRODUCT
2485 extern "C" void findpc(intptr_t x);
2486 #endif
2487 
2488 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2489 {
  // In order to get locks to work, we need to fake an in_VM state
2491   if (ShowMessageBoxOnError ) {
2492     JavaThread* thread = JavaThread::current();
2493     JavaThreadState saved_state = thread->thread_state();
2494     thread->set_thread_state(_thread_in_vm);
2495 #ifndef PRODUCT
2496     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2497       ttyLocker ttyl;
2498       BytecodeCounter::print();
2499     }
2500 #endif
2501     if (os::message_box(msg, "Execution stopped, print registers?")) {
2502       ttyLocker ttyl;
2503       tty->print_cr(" pc = 0x%016lx", pc);
2504 #ifndef PRODUCT
2505       tty->cr();
2506       findpc(pc);
2507       tty->cr();
2508 #endif
2509       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2510       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2511       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2512       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2513       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2514       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2515       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2516       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2517       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2518       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2519       tty->print_cr("r10 = 0x%016lx", regs[10]);
2520       tty->print_cr("r11 = 0x%016lx", regs[11]);
2521       tty->print_cr("r12 = 0x%016lx", regs[12]);
2522       tty->print_cr("r13 = 0x%016lx", regs[13]);
2523       tty->print_cr("r14 = 0x%016lx", regs[14]);
2524       tty->print_cr("r15 = 0x%016lx", regs[15]);
2525       tty->print_cr("r16 = 0x%016lx", regs[16]);
2526       tty->print_cr("r17 = 0x%016lx", regs[17]);
2527       tty->print_cr("r18 = 0x%016lx", regs[18]);
2528       tty->print_cr("r19 = 0x%016lx", regs[19]);
2529       tty->print_cr("r20 = 0x%016lx", regs[20]);
2530       tty->print_cr("r21 = 0x%016lx", regs[21]);
2531       tty->print_cr("r22 = 0x%016lx", regs[22]);
2532       tty->print_cr("r23 = 0x%016lx", regs[23]);
2533       tty->print_cr("r24 = 0x%016lx", regs[24]);
2534       tty->print_cr("r25 = 0x%016lx", regs[25]);
2535       tty->print_cr("r26 = 0x%016lx", regs[26]);
2536       tty->print_cr("r27 = 0x%016lx", regs[27]);
2537       tty->print_cr("r28 = 0x%016lx", regs[28]);
2538       tty->print_cr("r30 = 0x%016lx", regs[30]);
2539       tty->print_cr("r31 = 0x%016lx", regs[31]);
2540       BREAKPOINT;
2541     }
2542     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2543   } else {
2544     ttyLocker ttyl;
2545     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2546                     msg);
2547     assert(false, "DEBUG MESSAGE: %s", msg);
2548   }
2549 }
2550 
2551 #ifdef BUILTIN_SIM
2552 // routine to generate an x86 prolog for a stub function which
2553 // bootstraps into the generated ARM code which directly follows the
2554 // stub
2555 //
2556 // the argument encodes the number of general and fp registers
// passed by the caller and the calling convention (currently just
2558 // the number of general registers and assumes C argument passing)
2559 
2560 extern "C" {
2561 int aarch64_stub_prolog_size();
2562 void aarch64_stub_prolog();
2563 void aarch64_prolog();
2564 }
2565 
2566 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
2567                                    address *prolog_ptr)
2568 {
2569   int calltype = (((ret_type & 0x3) << 8) |
2570                   ((fp_arg_count & 0xf) << 4) |
2571                   (gp_arg_count & 0xf));
2572 
2573   // the addresses for the x86 to ARM entry code we need to use
2574   address start = pc();
2575   // printf("start = %lx\n", start);
2576   int byteCount =  aarch64_stub_prolog_size();
2577   // printf("byteCount = %x\n", byteCount);
2578   int instructionCount = (byteCount + 3)/ 4;
2579   // printf("instructionCount = %x\n", instructionCount);
2580   for (int i = 0; i < instructionCount; i++) {
2581     nop();
2582   }
2583 
2584   memcpy(start, (void*)aarch64_stub_prolog, byteCount);
2585 
2586   // write the address of the setup routine and the call format at the
  // end of the copied code
2588   u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
2589   if (prolog_ptr)
2590     patch_end[-2] = (u_int64_t)prolog_ptr;
2591   patch_end[-1] = calltype;
2592 }
2593 #endif
2594 
2595 void MacroAssembler::push_call_clobbered_registers() {
2596   int step = 4 * wordSize;
2597   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2598   sub(sp, sp, step);
2599   mov(rscratch1, -step);
2600   // Push v0-v7, v16-v31.
2601   for (int i = 31; i>= 4; i -= 4) {
2602     if (i <= v7->encoding() || i >= v16->encoding())
2603       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2604           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2605   }
2606   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2607       as_FloatRegister(3), T1D, Address(sp));
2608 }
2609 
2610 void MacroAssembler::pop_call_clobbered_registers() {
2611   for (int i = 0; i < 32; i += 4) {
2612     if (i <= v7->encoding() || i >= v16->encoding())
2613       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2614           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2615   }
2616 
2617   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2618 }
2619 
2620 void MacroAssembler::push_CPU_state(bool save_vectors) {
2621   int step = (save_vectors ? 8 : 4) * wordSize;
2622   push(0x3fffffff, sp);         // integer registers except lr & sp
2623   mov(rscratch1, -step);
2624   sub(sp, sp, step);
2625   for (int i = 28; i >= 4; i -= 4) {
2626     st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2627         as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2628   }
2629   st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2630 }
2631 
2632 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2633   int step = (restore_vectors ? 8 : 4) * wordSize;
2634   for (int i = 0; i <= 28; i += 4)
2635     ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2636         as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2637   pop(0x3fffffff, sp);         // integer registers except lr & sp
2638 }
2639 
2640 /**
2641  * Helpers for multiply_to_len().
2642  */
2643 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2644                                      Register src1, Register src2) {
2645   adds(dest_lo, dest_lo, src1);
2646   adc(dest_hi, dest_hi, zr);
2647   adds(dest_lo, dest_lo, src2);
2648   adc(final_dest_hi, dest_hi, zr);
2649 }
2650 
2651 // Generate an address from (r + r1 extend offset).  "size" is the
2652 // size of the operand.  The result may be in rscratch2.
2653 Address MacroAssembler::offsetted_address(Register r, Register r1,
2654                                           Address::extend ext, int offset, int size) {
2655   if (offset || (ext.shift() % size != 0)) {
2656     lea(rscratch2, Address(r, r1, ext));
2657     return Address(rscratch2, offset);
2658   } else {
2659     return Address(r, r1, ext);
2660   }
2661 }
2662 
2663 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2664 {
2665   assert(offset >= 0, "spill to negative address?");
  // Is the offset reachable?
  //   Unaligned: 9-bit signed offset
  //   Aligned:   12-bit unsigned offset, scaled by the access size
2669   Register base = sp;
2670   if ((offset & (size-1)) && offset >= (1<<8)) {
2671     add(tmp, base, offset & ((1<<12)-1));
2672     base = tmp;
2673     offset &= -1<<12;
2674   }
2675 
2676   if (offset >= (1<<12) * size) {
2677     add(tmp, base, offset & (((1<<12)-1)<<12));
2678     base = tmp;
2679     offset &= ~(((1<<12)-1)<<12);
2680   }
2681 
2682   return Address(base, offset);
2683 }
2684 
2685 // Checks whether offset is aligned.
2686 // Returns true if it is, else false.
2687 bool MacroAssembler::merge_alignment_check(Register base,
2688                                            size_t size,
2689                                            long cur_offset,
2690                                            long prev_offset) const {
2691   if (AvoidUnalignedAccesses) {
2692     if (base == sp) {
      // Checks whether the low offset is aligned to a pair of registers.
2694       long pair_mask = size * 2 - 1;
2695       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2696       return (offset & pair_mask) == 0;
2697     } else { // If base is not sp, we can't guarantee the access is aligned.
2698       return false;
2699     }
2700   } else {
2701     long mask = size - 1;
2702     // Load/store pair instruction only supports element size aligned offset.
2703     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2704   }
2705 }
2706 
2707 // Checks whether current and previous loads/stores can be merged.
2708 // Returns true if it can be merged, else false.
2709 bool MacroAssembler::ldst_can_merge(Register rt,
2710                                     const Address &adr,
2711                                     size_t cur_size_in_bytes,
2712                                     bool is_store) const {
2713   address prev = pc() - NativeInstruction::instruction_size;
2714   address last = code()->last_insn();
2715 
2716   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2717     return false;
2718   }
2719 
2720   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2721     return false;
2722   }
2723 
2724   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2725   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2726 
2727   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2728   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2729 
2730   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2731     return false;
2732   }
2733 
2734   long max_offset = 63 * prev_size_in_bytes;
2735   long min_offset = -64 * prev_size_in_bytes;
2736 
  assert(prev_ldst->is_not_pre_post_index(), "merging pre-indexed or post-indexed accesses is not supported.");
2738 
2739   // Only same base can be merged.
2740   if (adr.base() != prev_ldst->base()) {
2741     return false;
2742   }
2743 
2744   long cur_offset = adr.offset();
2745   long prev_offset = prev_ldst->offset();
2746   size_t diff = abs(cur_offset - prev_offset);
2747   if (diff != prev_size_in_bytes) {
2748     return false;
2749   }
2750 
  // The following cases cannot be merged:
  // ldr x2, [x2, #8]
  // ldr x3, [x2, #16]
  // or:
  // ldr x2, [x3, #8]
  // ldr x2, [x3, #16]
  // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2758   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2759     return false;
2760   }
2761 
2762   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2763   // Offset range must be in ldp/stp instruction's range.
2764   if (low_offset > max_offset || low_offset < min_offset) {
2765     return false;
2766   }
2767 
2768   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2769     return true;
2770   }
2771 
2772   return false;
2773 }
2774 
2775 // Merge current load/store with previous load/store into ldp/stp.
2776 void MacroAssembler::merge_ldst(Register rt,
2777                                 const Address &adr,
2778                                 size_t cur_size_in_bytes,
2779                                 bool is_store) {
2780 
2781   assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "cur and prev must be able to be merged.");
2782 
2783   Register rt_low, rt_high;
2784   address prev = pc() - NativeInstruction::instruction_size;
2785   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2786 
2787   long offset;
2788 
2789   if (adr.offset() < prev_ldst->offset()) {
2790     offset = adr.offset();
2791     rt_low = rt;
2792     rt_high = prev_ldst->target();
2793   } else {
2794     offset = prev_ldst->offset();
2795     rt_low = prev_ldst->target();
2796     rt_high = rt;
2797   }
2798 
2799   Address adr_p = Address(prev_ldst->base(), offset);
2800   // Overwrite the previously generated instruction.
2801   code_section()->set_end(prev);
2802 
2803   const int sz = prev_ldst->size_in_bytes();
2804   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
2805   if (!is_store) {
2806     BLOCK_COMMENT("merged ldr pair");
2807     if (sz == 8) {
2808       ldp(rt_low, rt_high, adr_p);
2809     } else {
2810       ldpw(rt_low, rt_high, adr_p);
2811     }
2812   } else {
2813     BLOCK_COMMENT("merged str pair");
2814     if (sz == 8) {
2815       stp(rt_low, rt_high, adr_p);
2816     } else {
2817       stpw(rt_low, rt_high, adr_p);
2818     }
2819   }
2820 }
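     // Illustrative example (assuming the pair passed ldst_can_merge):
     //   ldr w0, [sp, #16]
     //   ldr w1, [sp, #20]
     // is rewritten by rewinding the code section over the first ldr and
     // emitting the single instruction
     //   ldp w0, w1, [sp, #16]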
2821 
2822 /**
2823  * Multiply 64-bit by 64-bit: first loop.
2824  */
2825 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2826                                            Register y, Register y_idx, Register z,
2827                                            Register carry, Register product,
2828                                            Register idx, Register kdx) {
2829   //
2830   //  jlong carry, x[], y[], z[];
2831   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2832   //    huge_128 product = y[idx] * x[xstart] + carry;
2833   //    z[kdx] = (jlong)product;
2834   //    carry  = (jlong)(product >>> 64);
2835   //  }
2836   //  z[xstart] = carry;
2837   //
2838 
2839   Label L_first_loop, L_first_loop_exit;
2840   Label L_one_x, L_one_y, L_multiply;
2841 
2842   subsw(xstart, xstart, 1);
2843   br(Assembler::MI, L_one_x);
2844 
2845   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2846   ldr(x_xstart, Address(rscratch1));
2847   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
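       // A 64-bit little-endian load of two adjacent 32-bit limbs of the
       // big-endian int[] delivers the limbs in swapped halves; the ror by 32
       // puts the more significant limb back in the upper half.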
2848 
2849   bind(L_first_loop);
2850   subsw(idx, idx, 1);
2851   br(Assembler::MI, L_first_loop_exit);
2852   subsw(idx, idx, 1);
2853   br(Assembler::MI, L_one_y);
2854   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2855   ldr(y_idx, Address(rscratch1));
2856   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2857   bind(L_multiply);
2858 
2859   // AArch64 has a multiply-accumulate instruction that we can't use
2860   // here because it has no way to process carries, so we have to use
2861   // separate add and adc instructions.  Bah.
2862   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2863   mul(product, x_xstart, y_idx);
2864   adds(product, product, carry);
2865   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2866 
2867   subw(kdx, kdx, 2);
2868   ror(product, product, 32); // back to big-endian
2869   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2870 
2871   b(L_first_loop);
2872 
2873   bind(L_one_y);
2874   ldrw(y_idx, Address(y,  0));
2875   b(L_multiply);
2876 
2877   bind(L_one_x);
2878   ldrw(x_xstart, Address(x,  0));
2879   b(L_first_loop);
2880 
2881   bind(L_first_loop_exit);
2882 }
2883 
2884 /**
2885  * Multiply 128-bit by 128-bit. Unrolled inner loop.
2886  *
2887  */
2888 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2889                                              Register carry, Register carry2,
2890                                              Register idx, Register jdx,
2891                                              Register yz_idx1, Register yz_idx2,
2892                                              Register tmp, Register tmp3, Register tmp4,
2893                                              Register tmp6, Register product_hi) {
2894 
2895   //   jlong carry, x[], y[], z[];
2896   //   int kdx = ystart+1;
2897   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2898   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2899   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2900   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2901   //     carry  = (jlong)(tmp4 >>> 64);
2902   //     z[kdx+idx+1] = (jlong)tmp3;
2903   //     z[kdx+idx] = (jlong)tmp4;
2904   //   }
2905   //   idx += 2;
2906   //   if (idx > 0) {
2907   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2908   //     z[kdx+idx] = (jlong)yz_idx1;
2909   //     carry  = (jlong)(yz_idx1 >>> 64);
2910   //   }
2911   //
2912 
2913   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2914 
2915   lsrw(jdx, idx, 2);
2916 
2917   bind(L_third_loop);
2918 
2919   subsw(jdx, jdx, 1);
2920   br(Assembler::MI, L_third_loop_exit);
2921   subw(idx, idx, 4);
2922 
2923   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2924 
2925   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2926 
2927   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2928 
2929   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2930   ror(yz_idx2, yz_idx2, 32);
2931 
2932   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2933 
2934   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2935   umulh(tmp4, product_hi, yz_idx1);
2936 
2937   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2938   ror(rscratch2, rscratch2, 32);
2939 
2940   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2941   umulh(carry2, product_hi, yz_idx2);
2942 
2943   // propagate sum of both multiplications into carry:tmp4:tmp3
2944   adds(tmp3, tmp3, carry);
2945   adc(tmp4, tmp4, zr);
2946   adds(tmp3, tmp3, rscratch1);
2947   adcs(tmp4, tmp4, tmp);
2948   adc(carry, carry2, zr);
2949   adds(tmp4, tmp4, rscratch2);
2950   adc(carry, carry, zr);
2951 
2952   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2953   ror(tmp4, tmp4, 32);
2954   stp(tmp4, tmp3, Address(tmp6, 0));
2955 
2956   b(L_third_loop);
2957   bind (L_third_loop_exit);
2958 
2959   andw (idx, idx, 0x3);
2960   cbz(idx, L_post_third_loop_done);
2961 
2962   Label L_check_1;
2963   subsw(idx, idx, 2);
2964   br(Assembler::MI, L_check_1);
2965 
2966   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2967   ldr(yz_idx1, Address(rscratch1, 0));
2968   ror(yz_idx1, yz_idx1, 32);
2969   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2970   umulh(tmp4, product_hi, yz_idx1);
2971   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2972   ldr(yz_idx2, Address(rscratch1, 0));
2973   ror(yz_idx2, yz_idx2, 32);
2974 
2975   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2976 
2977   ror(tmp3, tmp3, 32);
2978   str(tmp3, Address(rscratch1, 0));
2979 
2980   bind (L_check_1);
2981 
2982   andw (idx, idx, 0x1);
2983   subsw(idx, idx, 1);
2984   br(Assembler::MI, L_post_third_loop_done);
2985   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2986   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
2987   umulh(carry2, tmp4, product_hi);
2988   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2989 
2990   add2_with_carry(carry2, tmp3, tmp4, carry);
2991 
2992   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2993   extr(carry, carry2, tmp3, 32);
2994 
2995   bind(L_post_third_loop_done);
2996 }
2997 
2998 /**
2999  * Code for BigInteger::multiplyToLen() intrinsic.
3000  *
3001  * r0: x
3002  * r1: xlen
3003  * r2: y
3004  * r3: ylen
3005  * r4: z
3006  * r5: zlen
3007  * r10: tmp1
3008  * r11: tmp2
3009  * r12: tmp3
3010  * r13: tmp4
3011  * r14: tmp5
3012  * r15: tmp6
3013  * r16: tmp7
3014  *
3015  */
3016 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3017                                      Register z, Register zlen,
3018                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3019                                      Register tmp5, Register tmp6, Register product_hi) {
3020 
3021   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3022 
3023   const Register idx = tmp1;
3024   const Register kdx = tmp2;
3025   const Register xstart = tmp3;
3026 
3027   const Register y_idx = tmp4;
3028   const Register carry = tmp5;
3029   const Register product  = xlen;
3030   const Register x_xstart = zlen;  // reuse register
3031 
3032   // First Loop.
3033   //
3034   //  final static long LONG_MASK = 0xffffffffL;
3035   //  int xstart = xlen - 1;
3036   //  int ystart = ylen - 1;
3037   //  long carry = 0;
3038   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3039   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3040   //    z[kdx] = (int)product;
3041   //    carry = product >>> 32;
3042   //  }
3043   //  z[xstart] = (int)carry;
3044   //
3045 
3046   movw(idx, ylen);      // idx = ylen;
3047   movw(kdx, zlen);      // kdx = xlen+ylen;
3048   mov(carry, zr);       // carry = 0;
3049 
3050   Label L_done;
3051 
3052   movw(xstart, xlen);
3053   subsw(xstart, xstart, 1);
3054   br(Assembler::MI, L_done);
3055 
3056   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3057 
3058   Label L_second_loop;
3059   cbzw(kdx, L_second_loop);
3060 
3061   Label L_carry;
3062   subw(kdx, kdx, 1);
3063   cbzw(kdx, L_carry);
3064 
3065   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3066   lsr(carry, carry, 32);
3067   subw(kdx, kdx, 1);
3068 
3069   bind(L_carry);
3070   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3071 
3072   // Second and third (nested) loops.
3073   //
3074   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3075   //   carry = 0;
3076   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3077   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3078   //                    (z[k] & LONG_MASK) + carry;
3079   //     z[k] = (int)product;
3080   //     carry = product >>> 32;
3081   //   }
3082   //   z[i] = (int)carry;
3083   // }
3084   //
3085   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3086 
3087   const Register jdx = tmp1;
3088 
3089   bind(L_second_loop);
3090   mov(carry, zr);                // carry = 0;
3091   movw(jdx, ylen);               // j = ystart+1
3092 
3093   subsw(xstart, xstart, 1);      // i = xstart-1;
3094   br(Assembler::MI, L_done);
3095 
3096   str(z, Address(pre(sp, -4 * wordSize)));
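       // Spill area layout (the 4 words just allocated):
       //   [sp, #0]  z       (stored above)
       //   [sp, #8]  ylen    (stored just before the inner loop below)
       //   [sp, #16] x
       //   [sp, #24] xstart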
3097 
3098   Label L_last_x;
3099   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3100   subsw(xstart, xstart, 1);       // i = xstart-1;
3101   br(Assembler::MI, L_last_x);
3102 
3103   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3104   ldr(product_hi, Address(rscratch1));
3105   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3106 
3107   Label L_third_loop_prologue;
3108   bind(L_third_loop_prologue);
3109 
3110   str(ylen, Address(sp, wordSize));
3111   stp(x, xstart, Address(sp, 2 * wordSize));
3112   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3113                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3114   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3115   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3116 
3117   addw(tmp3, xlen, 1);
3118   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3119   subsw(tmp3, tmp3, 1);
3120   br(Assembler::MI, L_done);
3121 
3122   lsr(carry, carry, 32);
3123   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3124   b(L_second_loop);
3125 
3126   // The following infrequent code is moved out of the loops.
3127   bind(L_last_x);
3128   ldrw(product_hi, Address(x,  0));
3129   b(L_third_loop_prologue);
3130 
3131   bind(L_done);
3132 }
3133 
3134 // Code for BigInteger::mulAdd intrinsic
3135 // out     = r0
3136 // in      = r1
3137 // offset  = r2  (already out.length-offset)
3138 // len     = r3
3139 // k       = r4
3140 //
3141 // pseudo code from java implementation:
3142 // carry = 0;
3143 // offset = out.length-offset - 1;
3144 // for (int j=len-1; j >= 0; j--) {
3145 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3146 //     out[offset--] = (int)product;
3147 //     carry = product >>> 32;
3148 // }
3149 // return (int)carry;
3150 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3151       Register len, Register k) {
3152     Label LOOP, END;
3153     // pre-loop
3154     cmp(len, zr); // cmp, not cbz/cbnz: we use the condition twice => fewer branches
3155     csel(out, zr, out, Assembler::EQ);
3156     br(Assembler::EQ, END);
3157     add(in, in, len, LSL, 2); // in[j+1] address
3158     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3159     mov(out, zr); // used to keep carry now
3160     BIND(LOOP);
3161     ldrw(rscratch1, Address(pre(in, -4)));
3162     madd(rscratch1, rscratch1, k, out);
3163     ldrw(rscratch2, Address(pre(offset, -4)));
3164     add(rscratch1, rscratch1, rscratch2);
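         // No 64-bit overflow is possible here: with 32-bit operands,
         // (2^32-1)^2 + (2^32-1) + (2^32-1) == 2^64-1, so the product plus
         // both addends still fits in 64 bits.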
3165     strw(rscratch1, Address(offset));
3166     lsr(out, rscratch1, 32);
3167     subs(len, len, 1);
3168     br(Assembler::NE, LOOP);
3169     BIND(END);
3170 }
3171 
3172 /**
3173  * Emits code to update CRC-32 with a byte value according to constants in table
3174  *
3175  * @param [in,out]crc   Register containing the crc.
3176  * @param [in]val       Register containing the byte to fold into the CRC.
3177  * @param [in]table     Register containing the table of crc constants.
3178  *
3179  * uint32_t crc;
3180  * val = crc_table[(val ^ crc) & 0xFF];
3181  * crc = val ^ (crc >> 8);
3182  *
3183  */
3184 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3185   eor(val, val, crc);
3186   andr(val, val, 0xff);
3187   ldrw(val, Address(table, val, Address::lsl(2)));
3188   eor(crc, val, crc, Assembler::LSR, 8);
3189 }
3190 
3191 /**
3192  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3193  *
3194  * @param [in,out]crc   Register containing the crc.
3195  * @param [in]v         Register containing the 32-bit value to fold into the CRC.
3196  * @param [in]table0    Register containing table 0 of crc constants.
3197  * @param [in]table1    Register containing table 1 of crc constants.
3198  * @param [in]table2    Register containing table 2 of crc constants.
3199  * @param [in]table3    Register containing table 3 of crc constants.
3200  *
3201  * uint32_t crc;
3202  *   v = crc ^ v
3203  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3204  *
3205  */
3206 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3207         Register table0, Register table1, Register table2, Register table3,
3208         bool upper) {
3209   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3210   uxtb(tmp, v);
3211   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3212   ubfx(tmp, v, 8, 8);
3213   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3214   eor(crc, crc, tmp);
3215   ubfx(tmp, v, 16, 8);
3216   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3217   eor(crc, crc, tmp);
3218   ubfx(tmp, v, 24, 8);
3219   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3220   eor(crc, crc, tmp);
3221 }
3222 
3223 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3224         Register len, Register tmp0, Register tmp1, Register tmp2,
3225         Register tmp3) {
3226     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3227     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3228 
3229     mvnw(crc, crc);
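         // The crc32 instructions operate on the raw (non-inverted) CRC state,
         // while java.util.zip.CRC32 keeps the ones-complement form, so the
         // value is inverted here on entry and again at L_exit.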
3230 
3231     subs(len, len, 128);
3232     br(Assembler::GE, CRC_by64_pre);
3233   BIND(CRC_less64);
3234     adds(len, len, 128-32);
3235     br(Assembler::GE, CRC_by32_loop);
3236   BIND(CRC_less32);
3237     adds(len, len, 32-4);
3238     br(Assembler::GE, CRC_by4_loop);
3239     adds(len, len, 4);
3240     br(Assembler::GT, CRC_by1_loop);
3241     b(L_exit);
3242 
3243   BIND(CRC_by32_loop);
3244     ldp(tmp0, tmp1, Address(post(buf, 16)));
3245     subs(len, len, 32);
3246     crc32x(crc, crc, tmp0);
3247     ldr(tmp2, Address(post(buf, 8)));
3248     crc32x(crc, crc, tmp1);
3249     ldr(tmp3, Address(post(buf, 8)));
3250     crc32x(crc, crc, tmp2);
3251     crc32x(crc, crc, tmp3);
3252     br(Assembler::GE, CRC_by32_loop);
3253     cmn(len, 32);
3254     br(Assembler::NE, CRC_less32);
3255     b(L_exit);
3256 
3257   BIND(CRC_by4_loop);
3258     ldrw(tmp0, Address(post(buf, 4)));
3259     subs(len, len, 4);
3260     crc32w(crc, crc, tmp0);
3261     br(Assembler::GE, CRC_by4_loop);
3262     adds(len, len, 4);
3263     br(Assembler::LE, L_exit);
3264   BIND(CRC_by1_loop);
3265     ldrb(tmp0, Address(post(buf, 1)));
3266     subs(len, len, 1);
3267     crc32b(crc, crc, tmp0);
3268     br(Assembler::GT, CRC_by1_loop);
3269     b(L_exit);
3270 
3271   BIND(CRC_by64_pre);
3272     sub(buf, buf, 8);
3273     ldp(tmp0, tmp1, Address(buf, 8));
3274     crc32x(crc, crc, tmp0);
3275     ldr(tmp2, Address(buf, 24));
3276     crc32x(crc, crc, tmp1);
3277     ldr(tmp3, Address(buf, 32));
3278     crc32x(crc, crc, tmp2);
3279     ldr(tmp0, Address(buf, 40));
3280     crc32x(crc, crc, tmp3);
3281     ldr(tmp1, Address(buf, 48));
3282     crc32x(crc, crc, tmp0);
3283     ldr(tmp2, Address(buf, 56));
3284     crc32x(crc, crc, tmp1);
3285     ldr(tmp3, Address(pre(buf, 64)));
3286 
3287     b(CRC_by64_loop);
3288 
3289     align(CodeEntryAlignment);
3290   BIND(CRC_by64_loop);
3291     subs(len, len, 64);
3292     crc32x(crc, crc, tmp2);
3293     ldr(tmp0, Address(buf, 8));
3294     crc32x(crc, crc, tmp3);
3295     ldr(tmp1, Address(buf, 16));
3296     crc32x(crc, crc, tmp0);
3297     ldr(tmp2, Address(buf, 24));
3298     crc32x(crc, crc, tmp1);
3299     ldr(tmp3, Address(buf, 32));
3300     crc32x(crc, crc, tmp2);
3301     ldr(tmp0, Address(buf, 40));
3302     crc32x(crc, crc, tmp3);
3303     ldr(tmp1, Address(buf, 48));
3304     crc32x(crc, crc, tmp0);
3305     ldr(tmp2, Address(buf, 56));
3306     crc32x(crc, crc, tmp1);
3307     ldr(tmp3, Address(pre(buf, 64)));
3308     br(Assembler::GE, CRC_by64_loop);
3309 
3310     // post-loop
3311     crc32x(crc, crc, tmp2);
3312     crc32x(crc, crc, tmp3);
3313 
3314     sub(len, len, 64);
3315     add(buf, buf, 8);
3316     cmn(len, 128);
3317     br(Assembler::NE, CRC_less64);
3318   BIND(L_exit);
3319     mvnw(crc, crc);
3320 }
3321 
3322 /**
3323  * @param crc   register containing existing CRC (32-bit)
3324  * @param buf   register pointing to input byte buffer (byte*)
3325  * @param len   register containing number of bytes
3326  * @param table0..table3 registers that will contain the addresses of the CRC tables
3327  * @param tmp, tmp2, tmp3  scratch registers
3328  */
3329 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3330         Register table0, Register table1, Register table2, Register table3,
3331         Register tmp, Register tmp2, Register tmp3) {
3332   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3333   unsigned long offset;
3334 
3335   if (UseCRC32) {
3336       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3337       return;
3338   }
3339 
3340     mvnw(crc, crc);
3341 
3342     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3343     if (offset) add(table0, table0, offset);
3344     add(table1, table0, 1*256*sizeof(juint));
3345     add(table2, table0, 2*256*sizeof(juint));
3346     add(table3, table0, 3*256*sizeof(juint));
3347 
3348   if (UseNeon) {
3349       cmp(len, (u1)64);
3350       br(Assembler::LT, L_by16);
3351       eor(v16, T16B, v16, v16);
3352 
3353     Label L_fold;
3354 
3355       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3356 
3357       ld1(v0, v1, T2D, post(buf, 32));
3358       ld1r(v4, T2D, post(tmp, 8));
3359       ld1r(v5, T2D, post(tmp, 8));
3360       ld1r(v6, T2D, post(tmp, 8));
3361       ld1r(v7, T2D, post(tmp, 8));
3362       mov(v16, T4S, 0, crc);
3363 
3364       eor(v0, T16B, v0, v16);
3365       sub(len, len, 64);
3366 
3367     BIND(L_fold);
3368       pmull(v22, T8H, v0, v5, T8B);
3369       pmull(v20, T8H, v0, v7, T8B);
3370       pmull(v23, T8H, v0, v4, T8B);
3371       pmull(v21, T8H, v0, v6, T8B);
3372 
3373       pmull2(v18, T8H, v0, v5, T16B);
3374       pmull2(v16, T8H, v0, v7, T16B);
3375       pmull2(v19, T8H, v0, v4, T16B);
3376       pmull2(v17, T8H, v0, v6, T16B);
3377 
3378       uzp1(v24, T8H, v20, v22);
3379       uzp2(v25, T8H, v20, v22);
3380       eor(v20, T16B, v24, v25);
3381 
3382       uzp1(v26, T8H, v16, v18);
3383       uzp2(v27, T8H, v16, v18);
3384       eor(v16, T16B, v26, v27);
3385 
3386       ushll2(v22, T4S, v20, T8H, 8);
3387       ushll(v20, T4S, v20, T4H, 8);
3388 
3389       ushll2(v18, T4S, v16, T8H, 8);
3390       ushll(v16, T4S, v16, T4H, 8);
3391 
3392       eor(v22, T16B, v23, v22);
3393       eor(v18, T16B, v19, v18);
3394       eor(v20, T16B, v21, v20);
3395       eor(v16, T16B, v17, v16);
3396 
3397       uzp1(v17, T2D, v16, v20);
3398       uzp2(v21, T2D, v16, v20);
3399       eor(v17, T16B, v17, v21);
3400 
3401       ushll2(v20, T2D, v17, T4S, 16);
3402       ushll(v16, T2D, v17, T2S, 16);
3403 
3404       eor(v20, T16B, v20, v22);
3405       eor(v16, T16B, v16, v18);
3406 
3407       uzp1(v17, T2D, v20, v16);
3408       uzp2(v21, T2D, v20, v16);
3409       eor(v28, T16B, v17, v21);
3410 
3411       pmull(v22, T8H, v1, v5, T8B);
3412       pmull(v20, T8H, v1, v7, T8B);
3413       pmull(v23, T8H, v1, v4, T8B);
3414       pmull(v21, T8H, v1, v6, T8B);
3415 
3416       pmull2(v18, T8H, v1, v5, T16B);
3417       pmull2(v16, T8H, v1, v7, T16B);
3418       pmull2(v19, T8H, v1, v4, T16B);
3419       pmull2(v17, T8H, v1, v6, T16B);
3420 
3421       ld1(v0, v1, T2D, post(buf, 32));
3422 
3423       uzp1(v24, T8H, v20, v22);
3424       uzp2(v25, T8H, v20, v22);
3425       eor(v20, T16B, v24, v25);
3426 
3427       uzp1(v26, T8H, v16, v18);
3428       uzp2(v27, T8H, v16, v18);
3429       eor(v16, T16B, v26, v27);
3430 
3431       ushll2(v22, T4S, v20, T8H, 8);
3432       ushll(v20, T4S, v20, T4H, 8);
3433 
3434       ushll2(v18, T4S, v16, T8H, 8);
3435       ushll(v16, T4S, v16, T4H, 8);
3436 
3437       eor(v22, T16B, v23, v22);
3438       eor(v18, T16B, v19, v18);
3439       eor(v20, T16B, v21, v20);
3440       eor(v16, T16B, v17, v16);
3441 
3442       uzp1(v17, T2D, v16, v20);
3443       uzp2(v21, T2D, v16, v20);
3444       eor(v16, T16B, v17, v21);
3445 
3446       ushll2(v20, T2D, v16, T4S, 16);
3447       ushll(v16, T2D, v16, T2S, 16);
3448 
3449       eor(v20, T16B, v22, v20);
3450       eor(v16, T16B, v16, v18);
3451 
3452       uzp1(v17, T2D, v20, v16);
3453       uzp2(v21, T2D, v20, v16);
3454       eor(v20, T16B, v17, v21);
3455 
3456       shl(v16, T2D, v28, 1);
3457       shl(v17, T2D, v20, 1);
3458 
3459       eor(v0, T16B, v0, v16);
3460       eor(v1, T16B, v1, v17);
3461 
3462       subs(len, len, 32);
3463       br(Assembler::GE, L_fold);
3464 
3465       mov(crc, 0);
3466       mov(tmp, v0, T1D, 0);
3467       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3468       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3469       mov(tmp, v0, T1D, 1);
3470       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3471       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3472       mov(tmp, v1, T1D, 0);
3473       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3474       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3475       mov(tmp, v1, T1D, 1);
3476       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3477       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3478 
3479       add(len, len, 32);
3480   }
3481 
3482   BIND(L_by16);
3483     subs(len, len, 16);
3484     br(Assembler::GE, L_by16_loop);
3485     adds(len, len, 16-4);
3486     br(Assembler::GE, L_by4_loop);
3487     adds(len, len, 4);
3488     br(Assembler::GT, L_by1_loop);
3489     b(L_exit);
3490 
3491   BIND(L_by4_loop);
3492     ldrw(tmp, Address(post(buf, 4)));
3493     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3494     subs(len, len, 4);
3495     br(Assembler::GE, L_by4_loop);
3496     adds(len, len, 4);
3497     br(Assembler::LE, L_exit);
3498   BIND(L_by1_loop);
3499     subs(len, len, 1);
3500     ldrb(tmp, Address(post(buf, 1)));
3501     update_byte_crc32(crc, tmp, table0);
3502     br(Assembler::GT, L_by1_loop);
3503     b(L_exit);
3504 
3505     align(CodeEntryAlignment);
3506   BIND(L_by16_loop);
3507     subs(len, len, 16);
3508     ldp(tmp, tmp3, Address(post(buf, 16)));
3509     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3510     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3511     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3512     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3513     br(Assembler::GE, L_by16_loop);
3514     adds(len, len, 16-4);
3515     br(Assembler::GE, L_by4_loop);
3516     adds(len, len, 4);
3517     br(Assembler::GT, L_by1_loop);
3518   BIND(L_exit);
3519     mvnw(crc, crc);
3520 }
3521 
3522 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3523         Register len, Register tmp0, Register tmp1, Register tmp2,
3524         Register tmp3) {
3525     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3526     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3527 
3528     subs(len, len, 128);
3529     br(Assembler::GE, CRC_by64_pre);
3530   BIND(CRC_less64);
3531     adds(len, len, 128-32);
3532     br(Assembler::GE, CRC_by32_loop);
3533   BIND(CRC_less32);
3534     adds(len, len, 32-4);
3535     br(Assembler::GE, CRC_by4_loop);
3536     adds(len, len, 4);
3537     br(Assembler::GT, CRC_by1_loop);
3538     b(L_exit);
3539 
3540   BIND(CRC_by32_loop);
3541     ldp(tmp0, tmp1, Address(post(buf, 16)));
3542     subs(len, len, 32);
3543     crc32cx(crc, crc, tmp0);
3544     ldr(tmp2, Address(post(buf, 8)));
3545     crc32cx(crc, crc, tmp1);
3546     ldr(tmp3, Address(post(buf, 8)));
3547     crc32cx(crc, crc, tmp2);
3548     crc32cx(crc, crc, tmp3);
3549     br(Assembler::GE, CRC_by32_loop);
3550     cmn(len, 32);
3551     br(Assembler::NE, CRC_less32);
3552     b(L_exit);
3553 
3554   BIND(CRC_by4_loop);
3555     ldrw(tmp0, Address(post(buf, 4)));
3556     subs(len, len, 4);
3557     crc32cw(crc, crc, tmp0);
3558     br(Assembler::GE, CRC_by4_loop);
3559     adds(len, len, 4);
3560     br(Assembler::LE, L_exit);
3561   BIND(CRC_by1_loop);
3562     ldrb(tmp0, Address(post(buf, 1)));
3563     subs(len, len, 1);
3564     crc32cb(crc, crc, tmp0);
3565     br(Assembler::GT, CRC_by1_loop);
3566     b(L_exit);
3567 
3568   BIND(CRC_by64_pre);
3569     sub(buf, buf, 8);
3570     ldp(tmp0, tmp1, Address(buf, 8));
3571     crc32cx(crc, crc, tmp0);
3572     ldr(tmp2, Address(buf, 24));
3573     crc32cx(crc, crc, tmp1);
3574     ldr(tmp3, Address(buf, 32));
3575     crc32cx(crc, crc, tmp2);
3576     ldr(tmp0, Address(buf, 40));
3577     crc32cx(crc, crc, tmp3);
3578     ldr(tmp1, Address(buf, 48));
3579     crc32cx(crc, crc, tmp0);
3580     ldr(tmp2, Address(buf, 56));
3581     crc32cx(crc, crc, tmp1);
3582     ldr(tmp3, Address(pre(buf, 64)));
3583 
3584     b(CRC_by64_loop);
3585 
3586     align(CodeEntryAlignment);
3587   BIND(CRC_by64_loop);
3588     subs(len, len, 64);
3589     crc32cx(crc, crc, tmp2);
3590     ldr(tmp0, Address(buf, 8));
3591     crc32cx(crc, crc, tmp3);
3592     ldr(tmp1, Address(buf, 16));
3593     crc32cx(crc, crc, tmp0);
3594     ldr(tmp2, Address(buf, 24));
3595     crc32cx(crc, crc, tmp1);
3596     ldr(tmp3, Address(buf, 32));
3597     crc32cx(crc, crc, tmp2);
3598     ldr(tmp0, Address(buf, 40));
3599     crc32cx(crc, crc, tmp3);
3600     ldr(tmp1, Address(buf, 48));
3601     crc32cx(crc, crc, tmp0);
3602     ldr(tmp2, Address(buf, 56));
3603     crc32cx(crc, crc, tmp1);
3604     ldr(tmp3, Address(pre(buf, 64)));
3605     br(Assembler::GE, CRC_by64_loop);
3606 
3607     // post-loop
3608     crc32cx(crc, crc, tmp2);
3609     crc32cx(crc, crc, tmp3);
3610 
3611     sub(len, len, 64);
3612     add(buf, buf, 8);
3613     cmn(len, 128);
3614     br(Assembler::NE, CRC_less64);
3615   BIND(L_exit);
3616 }
3617 
3618 /**
3619  * @param crc   register containing existing CRC (32-bit)
3620  * @param buf   register pointing to input byte buffer (byte*)
3621  * @param len   register containing number of bytes
3622  * @param table0..table3 registers that will contain the addresses of the CRC tables
3623  * @param tmp, tmp2, tmp3  scratch registers
3624  */
3625 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3626         Register table0, Register table1, Register table2, Register table3,
3627         Register tmp, Register tmp2, Register tmp3) {
3628   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3629 }
3630 
3631 
3632 SkipIfEqual::SkipIfEqual(
3633     MacroAssembler* masm, const bool* flag_addr, bool value) {
3634   _masm = masm;
3635   unsigned long offset;
3636   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3637   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3638   _masm->cbzw(rscratch1, _label);
3639 }
3640 
3641 SkipIfEqual::~SkipIfEqual() {
3642   _masm->bind(_label);
3643 }
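     // Illustrative use (hypothetical call site): code in the scope is emitted
     // unconditionally but only executes when the flag is true at run time:
     //
     //   { SkipIfEqual skip(masm, &SomeFlag, false);
     //     ... code skipped when SomeFlag is false ...
     //   }
     //
     // Note that this implementation assumes value == false: the cbzw in the
     // constructor branches to the label bound by the destructor when the
     // flag byte is zero.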
3644 
3645 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3646   Address adr;
3647   switch(dst.getMode()) {
3648   case Address::base_plus_offset:
3649     // This is the expected mode, although we allow all the other
3650     // forms below.
3651     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3652     break;
3653   default:
3654     lea(rscratch2, dst);
3655     adr = Address(rscratch2);
3656     break;
3657   }
3658   ldr(rscratch1, adr);
3659   add(rscratch1, rscratch1, src);
3660   str(rscratch1, adr);
3661 }
3662 
3663 void MacroAssembler::cmpptr(Register src1, Address src2) {
3664   unsigned long offset;
3665   adrp(rscratch1, src2, offset);
3666   ldr(rscratch1, Address(rscratch1, offset));
3667   cmp(src1, rscratch1);
3668 }
3669 
3670 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3671   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3672   bs->obj_equals(this, obj1, obj2);
3673 }
3674 
3675 void MacroAssembler::load_klass(Register dst, Register src) {
3676   if (UseCompressedClassPointers) {
3677     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3678     decode_klass_not_null(dst);
3679   } else {
3680     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3681   }
3682 }
3683 
3684 // ((OopHandle)result).resolve();
3685 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3686   // OopHandle::resolve is an indirection.
3687   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3688 }
3689 
3690 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3691   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3692   ldr(dst, Address(rmethod, Method::const_offset()));
3693   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3694   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3695   ldr(dst, Address(dst, mirror_offset));
3696   resolve_oop_handle(dst, tmp);
3697 }
3698 
3699 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3700   if (UseCompressedClassPointers) {
3701     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3702     if (Universe::narrow_klass_base() == NULL) {
3703       cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
3704       return;
3705     } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3706                && Universe::narrow_klass_shift() == 0) {
3707       // Only the bottom 32 bits matter
3708       cmpw(trial_klass, tmp);
3709       return;
3710     }
3711     decode_klass_not_null(tmp);
3712   } else {
3713     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3714   }
3715   cmp(trial_klass, tmp);
3716 }
3717 
3718 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3719   load_klass(dst, src);
3720   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3721 }
3722 
3723 void MacroAssembler::store_klass(Register dst, Register src) {
3724   // FIXME: Should this be a store release?  Concurrent GCs assume the
3725   // klass length is valid if the klass field is not null.
3726   if (UseCompressedClassPointers) {
3727     encode_klass_not_null(src);
3728     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3729   } else {
3730     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3731   }
3732 }
3733 
3734 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3735   if (UseCompressedClassPointers) {
3736     // Store to klass gap in destination
3737     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3738   }
3739 }
3740 
3741 // Algorithm must match CompressedOops::encode.
3742 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3743 #ifdef ASSERT
3744   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3745 #endif
3746   verify_oop(s, "broken oop in encode_heap_oop");
3747   if (Universe::narrow_oop_base() == NULL) {
3748     if (Universe::narrow_oop_shift() != 0) {
3749       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3750       lsr(d, s, LogMinObjAlignmentInBytes);
3751     } else {
3752       mov(d, s);
3753     }
3754   } else {
3755     subs(d, s, rheapbase);
3756     csel(d, d, zr, Assembler::HS);
3757     lsr(d, d, LogMinObjAlignmentInBytes);
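         // i.e. d = (s < rheapbase ? 0 : s - rheapbase) >> shift, so a NULL
         // oop encodes to 0 without a branch.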
3758 
3759     /*  Old algorithm: is this any worse?
3760     Label nonnull;
3761     cbnz(r, nonnull);
3762     sub(r, r, rheapbase);
3763     bind(nonnull);
3764     lsr(r, r, LogMinObjAlignmentInBytes);
3765     */
3766   }
3767 }
3768 
3769 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3770 #ifdef ASSERT
3771   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3772   if (CheckCompressedOops) {
3773     Label ok;
3774     cbnz(r, ok);
3775     stop("null oop passed to encode_heap_oop_not_null");
3776     bind(ok);
3777   }
3778 #endif
3779   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3780   if (Universe::narrow_oop_base() != NULL) {
3781     sub(r, r, rheapbase);
3782   }
3783   if (Universe::narrow_oop_shift() != 0) {
3784     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3785     lsr(r, r, LogMinObjAlignmentInBytes);
3786   }
3787 }
3788 
3789 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3790 #ifdef ASSERT
3791   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3792   if (CheckCompressedOops) {
3793     Label ok;
3794     cbnz(src, ok);
3795     stop("null oop passed to encode_heap_oop_not_null2");
3796     bind(ok);
3797   }
3798 #endif
3799   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3800 
3801   Register data = src;
3802   if (Universe::narrow_oop_base() != NULL) {
3803     sub(dst, src, rheapbase);
3804     data = dst;
3805   }
3806   if (Universe::narrow_oop_shift() != 0) {
3807     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3808     lsr(dst, data, LogMinObjAlignmentInBytes);
3809     data = dst;
3810   }
3811   if (data == src)
3812     mov(dst, src);
3813 }
3814 
3815 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3816 #ifdef ASSERT
3817   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3818 #endif
3819   if (Universe::narrow_oop_base() == NULL) {
3820     if (Universe::narrow_oop_shift() != 0 || d != s) {
3821       lsl(d, s, Universe::narrow_oop_shift());
3822     }
3823   } else {
3824     Label done;
3825     if (d != s)
3826       mov(d, s);
3827     cbz(s, done);
3828     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3829     bind(done);
3830   }
3831   verify_oop(d, "broken oop in decode_heap_oop");
3832 }
3833 
3834 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3835   assert (UseCompressedOops, "should only be used for compressed headers");
3836   assert (Universe::heap() != NULL, "java heap should be initialized");
3837   // Cannot assert, unverified entry point counts instructions (see .ad file)
3838   // vtableStubs also counts instructions in pd_code_size_limit.
3839   // Also do not verify_oop as this is called by verify_oop.
3840   if (Universe::narrow_oop_shift() != 0) {
3841     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3842     if (Universe::narrow_oop_base() != NULL) {
3843       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3844     } else {
3845       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3846     }
3847   } else {
3848     assert (Universe::narrow_oop_base() == NULL, "sanity");
3849   }
3850 }
3851 
3852 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3853   assert (UseCompressedOops, "should only be used for compressed headers");
3854   assert (Universe::heap() != NULL, "java heap should be initialized");
3855   // Cannot assert, unverified entry point counts instructions (see .ad file)
3856   // vtableStubs also counts instructions in pd_code_size_limit.
3857   // Also do not verify_oop as this is called by verify_oop.
3858   if (Universe::narrow_oop_shift() != 0) {
3859     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3860     if (Universe::narrow_oop_base() != NULL) {
3861       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3862     } else {
3863       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3864     }
3865   } else {
3866     assert (Universe::narrow_oop_base() == NULL, "sanity");
3867     if (dst != src) {
3868       mov(dst, src);
3869     }
3870   }
3871 }
3872 
3873 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3874   if (Universe::narrow_klass_base() == NULL) {
3875     if (Universe::narrow_klass_shift() != 0) {
3876       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3877       lsr(dst, src, LogKlassAlignmentInBytes);
3878     } else {
3879       if (dst != src) mov(dst, src);
3880     }
3881     return;
3882   }
3883 
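       // When the klass base has no bits in common with any shifted narrow
       // klass value (its low bits are clear throughout the encoding range),
       // an eor with the base both encodes and decodes, avoiding the need to
       // materialize the base in a scratch register.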
3884   if (use_XOR_for_compressed_class_base) {
3885     if (Universe::narrow_klass_shift() != 0) {
3886       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3887       lsr(dst, dst, LogKlassAlignmentInBytes);
3888     } else {
3889       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3890     }
3891     return;
3892   }
3893 
3894   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3895       && Universe::narrow_klass_shift() == 0) {
3896     movw(dst, src);
3897     return;
3898   }
3899 
3900 #ifdef ASSERT
3901   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3902 #endif
3903 
3904   Register rbase = dst;
3905   if (dst == src) rbase = rheapbase;
3906   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3907   sub(dst, src, rbase);
3908   if (Universe::narrow_klass_shift() != 0) {
3909     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3910     lsr(dst, dst, LogKlassAlignmentInBytes);
3911   }
3912   if (dst == src) reinit_heapbase();
3913 }
3914 
3915 void MacroAssembler::encode_klass_not_null(Register r) {
3916   encode_klass_not_null(r, r);
3917 }
3918 
3919 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3920   Register rbase = dst;
3921   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3922 
3923   if (Universe::narrow_klass_base() == NULL) {
3924     if (Universe::narrow_klass_shift() != 0) {
3925       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3926       lsl(dst, src, LogKlassAlignmentInBytes);
3927     } else {
3928       if (dst != src) mov(dst, src);
3929     }
3930     return;
3931   }
3932 
3933   if (use_XOR_for_compressed_class_base) {
3934     if (Universe::narrow_klass_shift() != 0) {
3935       lsl(dst, src, LogKlassAlignmentInBytes);
3936       eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
3937     } else {
3938       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3939     }
3940     return;
3941   }
3942 
3943   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3944       && Universe::narrow_klass_shift() == 0) {
3945     if (dst != src)
3946       movw(dst, src);
3947     movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
3948     return;
3949   }
3950 
3951   // Cannot assert, unverified entry point counts instructions (see .ad file)
3952   // vtableStubs also counts instructions in pd_code_size_limit.
3953   // Also do not verify_oop as this is called by verify_oop.
3954   if (dst == src) rbase = rheapbase;
3955   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3956   if (Universe::narrow_klass_shift() != 0) {
3957     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3958     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
3959   } else {
3960     add(dst, rbase, src);
3961   }
3962   if (dst == src) reinit_heapbase();
3963 }
3964 
3965 void  MacroAssembler::decode_klass_not_null(Register r) {
3966   decode_klass_not_null(r, r);
3967 }
3968 
3969 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3970 #ifdef ASSERT
3971   {
3972     ThreadInVMfromUnknown tiv;
3973     assert (UseCompressedOops, "should only be used for compressed oops");
3974     assert (Universe::heap() != NULL, "java heap should be initialized");
3975     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3976     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3977   }
3978 #endif
3979   int oop_index = oop_recorder()->find_index(obj);
3980   InstructionMark im(this);
3981   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3982   code_section()->relocate(inst_mark(), rspec);
3983   movz(dst, 0xDEAD, 16);
3984   movk(dst, 0xBEEF);
3985 }
3986 
3987 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3988   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3989   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3990   int index = oop_recorder()->find_index(k);
3991   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
3992 
3993   InstructionMark im(this);
3994   RelocationHolder rspec = metadata_Relocation::spec(index);
3995   code_section()->relocate(inst_mark(), rspec);
3996   narrowKlass nk = Klass::encode_klass(k);
3997   movz(dst, (nk >> 16), 16);
3998   movk(dst, nk & 0xffff);
3999 }
4000 
4001 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4002                                     Register dst, Address src,
4003                                     Register tmp1, Register thread_tmp) {
4004   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4005   decorators = AccessInternal::decorator_fixup(decorators);
4006   bool as_raw = (decorators & AS_RAW) != 0;
4007   if (as_raw) {
4008     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4009   } else {
4010     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4011   }
4012 }
4013 
4014 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4015                                      Address dst, Register src,
4016                                      Register tmp1, Register thread_tmp) {
4017   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4018   decorators = AccessInternal::decorator_fixup(decorators);
4019   bool as_raw = (decorators & AS_RAW) != 0;
4020   if (as_raw) {
4021     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4022   } else {
4023     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4024   }
4025 }
4026 
4027 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
4028   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
4029   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
4030     decorators |= ACCESS_READ | ACCESS_WRITE;
4031   }
4032   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4033   return bs->resolve(this, decorators, obj);
4034 }
4035 
4036 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4037                                    Register thread_tmp, DecoratorSet decorators) {
4038   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4039 }
4040 
4041 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4042                                             Register thread_tmp, DecoratorSet decorators) {
4043   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4044 }
4045 
4046 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4047                                     Register thread_tmp, DecoratorSet decorators) {
4048   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4049 }
4050 
4051 // Used for storing NULLs.
4052 void MacroAssembler::store_heap_oop_null(Address dst) {
4053   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4054 }
4055 
4056 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4057   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4058   int index = oop_recorder()->allocate_metadata_index(obj);
4059   RelocationHolder rspec = metadata_Relocation::spec(index);
4060   return Address((address)obj, rspec);
4061 }
4062 
4063 // Move an oop into a register.  immediate is true if we want
4064 // immediate instructions, i.e. we are not going to patch this
4065 // instruction while the code is being executed by another thread.  In
4066 // that case we can use move immediates rather than the constant pool.
4067 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4068   int oop_index;
4069   if (obj == NULL) {
4070     oop_index = oop_recorder()->allocate_oop_index(obj);
4071   } else {
4072 #ifdef ASSERT
4073     {
4074       ThreadInVMfromUnknown tiv;
4075       assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
4076     }
4077 #endif
4078     oop_index = oop_recorder()->find_index(obj);
4079   }
4080   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4081   if (! immediate) {
4082     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4083     ldr_constant(dst, Address(dummy, rspec));
4084   } else
4085     mov(dst, Address((address)obj, rspec));
4086 }
4087 
4088 // Move a metadata address into a register.
4089 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4090   int oop_index;
4091   if (obj == NULL) {
4092     oop_index = oop_recorder()->allocate_metadata_index(obj);
4093   } else {
4094     oop_index = oop_recorder()->find_index(obj);
4095   }
4096   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4097   mov(dst, Address((address)obj, rspec));
4098 }
4099 
4100 Address MacroAssembler::constant_oop_address(jobject obj) {
4101 #ifdef ASSERT
4102   {
4103     ThreadInVMfromUnknown tiv;
4104     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4105     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
4106   }
4107 #endif
4108   int oop_index = oop_recorder()->find_index(obj);
4109   return Address((address)obj, oop_Relocation::spec(oop_index));
4110 }
4111 
4112 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4113 void MacroAssembler::tlab_allocate(Register obj,
4114                                    Register var_size_in_bytes,
4115                                    int con_size_in_bytes,
4116                                    Register t1,
4117                                    Register t2,
4118                                    Label& slow_case) {
4119   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4120   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4121 }
4122 
4123 // Defines obj, preserves var_size_in_bytes
4124 void MacroAssembler::eden_allocate(Register obj,
4125                                    Register var_size_in_bytes,
4126                                    int con_size_in_bytes,
4127                                    Register t1,
4128                                    Label& slow_case) {
4129   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4130   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4131 }
4132 
4133 // Zero words; len is in bytes
4134 // Destroys all registers except addr
4135 // len must be a nonzero multiple of wordSize
4136 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4137   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4138 
4139 #ifdef ASSERT
4140   { Label L;
4141     tst(len, BytesPerWord - 1);
4142     br(Assembler::EQ, L);
4143     stop("len is not a multiple of BytesPerWord");
4144     bind(L);
4145   }
4146 #endif
4147 
4148 #ifndef PRODUCT
4149   block_comment("zero memory");
4150 #endif
4151 
4152   Label loop;
4153   Label entry;
4154 
4155 //  Algorithm:
4156 //
4157 //    scratch1 = cnt & 7;
4158 //    cnt -= scratch1;
4159 //    p += scratch1;
4160 //    switch (scratch1) {
4161 //      do {
4162 //        cnt -= 8;
4163 //          p[-8] = 0;
4164 //        case 7:
4165 //          p[-7] = 0;
4166 //        case 6:
4167 //          p[-6] = 0;
4168 //          // ...
4169 //        case 1:
4170 //          p[-1] = 0;
4171 //        case 0:
4172 //          p += 8;
4173 //      } while (cnt);
4174 //    }
4175 
4176   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4177 
4178   lsr(len, len, LogBytesPerWord);
4179   andr(rscratch1, len, unroll - 1);  // rscratch1 = cnt % unroll
4180   sub(len, len, rscratch1);      // cnt -= (cnt % unroll)
4181   // t1 always points to the end of the region we're about to zero
4182   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
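       // Computed jump into the unrolled stores (Duff's device): each str
       // below is 4 bytes, so branching to (entry - 4 * (cnt % unroll))
       // executes exactly cnt % unroll stores before reaching the loop test.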
4183   adr(rscratch2, entry);
4184   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4185   br(rscratch2);
4186   bind(loop);
4187   sub(len, len, unroll);
4188   for (int i = -unroll; i < 0; i++)
4189     Assembler::str(zr, Address(t1, i * wordSize));
4190   bind(entry);
4191   add(t1, t1, unroll * wordSize);
4192   cbnz(len, loop);
4193 }
4194 
4195 void MacroAssembler::verify_tlab() {
4196 #ifdef ASSERT
4197   if (UseTLAB && VerifyOops) {
4198     Label next, ok;
4199 
4200     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4201 
4202     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4203     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4204     cmp(rscratch2, rscratch1);
4205     br(Assembler::HS, next);
4206     STOP("assert(top >= start)");
4207     should_not_reach_here();
4208 
4209     bind(next);
4210     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4211     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4212     cmp(rscratch2, rscratch1);
4213     br(Assembler::HS, ok);
4214     STOP("assert(top <= end)");
4215     should_not_reach_here();
4216 
4217     bind(ok);
4218     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4219   }
4220 #endif
4221 }
4222 
4223 // Writes to successive stack pages until the given offset is reached, to
4224 // check for stack overflow plus shadow pages.  This clobbers tmp.
4225 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4226   assert_different_registers(tmp, size, rscratch1);
4227   mov(tmp, sp);
4228   // Bang stack for total size given plus shadow page size.
4229   // Bang one page at a time because large size can bang beyond yellow and
4230   // red zones.
4231   Label loop;
4232   mov(rscratch1, os::vm_page_size());
4233   bind(loop);
4234   lea(tmp, Address(tmp, -os::vm_page_size()));
4235   subsw(size, size, rscratch1);
4236   str(size, Address(tmp));
4237   br(Assembler::GT, loop);
4238 
4239   // Bang down shadow pages too.
4240   // At this point, (tmp-0) is the last address touched, so don't
4241   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4242   // was post-decremented.)  Skip this address by starting at i=1, and
4243   // touch a few more pages below.  N.B.  It is important to touch all
4244   // the way down to and including i=StackShadowPages.
4245   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
4246     // This could be a move of any size, but since it can serve as a
4247     // debugging crumb, the bigger the better.
4248     lea(tmp, Address(tmp, -os::vm_page_size()));
4249     str(size, Address(tmp));
4250   }
4251 }
4252 
4253 
4254 // Move the address of the polling page into dest.
4255 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4256   if (SafepointMechanism::uses_thread_local_poll()) {
4257     ldr(dest, Address(rthread, Thread::polling_page_offset()));
4258   } else {
4259     unsigned long off;
4260     adrp(dest, Address(page, rtype), off);
4261     assert(off == 0, "polling page must be page aligned");
4262   }
4263 }
4264 
4265 // Move the address of the polling page into r, then read the polling
4266 // page.
4267 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4268   get_polling_page(r, page, rtype);
4269   return read_polling_page(r, rtype);
4270 }
4271 
4272 // Read the polling page.  The address of the polling page must
4273 // already be in r.
4274 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4275   InstructionMark im(this);
4276   code_section()->relocate(inst_mark(), rtype);
4277   ldrw(zr, Address(r, 0));
4278   return inst_mark();
4279 }
4280 
4281 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4282   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4283   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4284   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4285   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4286   long offset_low = dest_page - low_page;
4287   long offset_high = dest_page - high_page;
4288 
4289   assert(is_valid_AArch64_address(dest.target()), "bad address");
4290   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4291 
4292   InstructionMark im(this);
4293   code_section()->relocate(inst_mark(), dest.rspec());
4294   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4295   // the code cache so that if it is relocated we know it will still reach
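  // In outline (illustrative): a plain adrp is used when the dest page is
  // within +/- 2^20 pages of both code cache bounds; otherwise the 48-bit
  // address is synthesized as
  //   adrp reg, page((target & 0xffffffff) | (pc & 0xffff00000000))
  //   movk reg, target[47:32], lsl #32
  // leaving reg == target & ~0xfff; the low 12 bits come back in
  // byte_offset.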
4296   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4297     _adrp(reg1, dest.target());
4298   } else {
4299     unsigned long target = (unsigned long)dest.target();
4300     unsigned long adrp_target
4301       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4302 
4303     _adrp(reg1, (address)adrp_target);
4304     movk(reg1, target >> 32, 32);
4305   }
4306   byte_offset = (unsigned long)dest.target() & 0xfff;
4307 }
4308 
4309 void MacroAssembler::load_byte_map_base(Register reg) {
4310   jbyte *byte_map_base =
4311     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4312 
4313   if (is_valid_AArch64_address((address)byte_map_base)) {
4314     // Strictly speaking the byte_map_base isn't an address at all,
4315     // and it might even be negative.
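    // (The GC computes a card address as byte_map_base + (addr >> card_shift),
    // so with a low heap base byte_map_base alone can fall outside the
    // address space; only the sum is a real address.)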
4316     unsigned long offset;
4317     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4318     // We expect offset to be zero with most collectors.
4319     if (offset != 0) {
4320       add(reg, reg, offset);
4321     }
4322   } else {
4323     mov(reg, (uint64_t)byte_map_base);
4324   }
4325 }
4326 
4327 void MacroAssembler::build_frame(int framesize) {
4328   assert(framesize > 0, "framesize must be > 0");
4329   if (framesize < ((1 << 9) + 2 * wordSize)) {
4330     sub(sp, sp, framesize);
4331     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4332     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4333   } else {
4334     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4335     if (PreserveFramePointer) mov(rfp, sp);
4336     if (framesize < ((1 << 12) + 2 * wordSize))
4337       sub(sp, sp, framesize - 2 * wordSize);
4338     else {
4339       mov(rscratch1, framesize - 2 * wordSize);
4340       sub(sp, sp, rscratch1);
4341     }
4342   }
4343 }
4344 
4345 void MacroAssembler::remove_frame(int framesize) {
4346   assert(framesize > 0, "framesize must be > 0");
4347   if (framesize < ((1 << 9) + 2 * wordSize)) {
4348     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4349     add(sp, sp, framesize);
4350   } else {
4351     if (framesize < ((1 << 12) + 2 * wordSize))
4352       add(sp, sp, framesize - 2 * wordSize);
4353     else {
4354       mov(rscratch1, framesize - 2 * wordSize);
4355       add(sp, sp, rscratch1);
4356     }
4357     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4358   }
4359 }
4360 
4361 #ifdef COMPILER2
4362 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4363 
4364 // Search for str1 in str2 and return index or -1
4365 void MacroAssembler::string_indexof(Register str2, Register str1,
4366                                     Register cnt2, Register cnt1,
4367                                     Register tmp1, Register tmp2,
4368                                     Register tmp3, Register tmp4,
4369                                     Register tmp5, Register tmp6,
4370                                     int icnt1, Register result, int ae) {
  // NOTE: tmp5, tmp6 can be zr depending on the specific method version
4372   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4373 
4374   Register ch1 = rscratch1;
4375   Register ch2 = rscratch2;
4376   Register cnt1tmp = tmp1;
4377   Register cnt2tmp = tmp2;
4378   Register cnt1_neg = cnt1;
4379   Register cnt2_neg = cnt2;
4380   Register result_tmp = tmp4;
4381 
4382   bool isL = ae == StrIntrinsicNode::LL;
4383 
4384   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4385   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4386   int str1_chr_shift = str1_isL ? 0:1;
4387   int str2_chr_shift = str2_isL ? 0:1;
4388   int str1_chr_size = str1_isL ? 1:2;
4389   int str2_chr_size = str2_isL ? 1:2;
4390   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4391                                       (chr_insn)&MacroAssembler::ldrh;
4392   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4393                                       (chr_insn)&MacroAssembler::ldrh;
4394   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4395   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4396 
4397   // Note, inline_string_indexOf() generates checks:
4398   // if (substr.count > string.count) return -1;
4399   // if (substr.count == 0) return 0;
4400 
  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the first occurrence of the pattern in the source
  // or return -1.
4403 
  // For a larger pattern and source we use a simplified Boyer-Moore
  // algorithm. With a small pattern and source we use a linear scan.
4406 
4407   if (icnt1 == -1) {
4408     sub(result_tmp, cnt2, cnt1);
4409     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4410     br(LT, LINEARSEARCH);
4411     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
4412     subs(zr, cnt1, 256);
4413     lsr(tmp1, cnt2, 2);
4414     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
4415     br(GE, LINEARSTUB);
4416   }
4417 
// The Boyer-Moore algorithm is based on the description here:-
4419 //
4420 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4421 //
// This describes an algorithm with 2 shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
4424 //
4425 // These rules are essentially heuristics for how far we can shift the
4426 // pattern along the search string.
4427 //
4428 // The implementation here uses the 'Bad Character' rule only because of the
4429 // complexity of initialisation for the 'Good Suffix' rule.
4430 //
4431 // This is also known as the Boyer-Moore-Horspool algorithm:-
4432 //
4433 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4434 //
// This particular implementation has a few Java-specific optimizations.
4436 //
4437 // #define ASIZE 256
4438 //
4439 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4440 //       int i, j;
4441 //       unsigned c;
4442 //       unsigned char bc[ASIZE];
4443 //
4444 //       /* Preprocessing */
4445 //       for (i = 0; i < ASIZE; ++i)
4446 //          bc[i] = m;
4447 //       for (i = 0; i < m - 1; ) {
4448 //          c = x[i];
4449 //          ++i;
//          // c < 256 for a Latin1 string, so no need for a branch
4451 //          #ifdef PATTERN_STRING_IS_LATIN1
4452 //          bc[c] = m - i;
4453 //          #else
4454 //          if (c < ASIZE) bc[c] = m - i;
4455 //          #endif
4456 //       }
4457 //
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[j+m-1];
//          if (x[m-1] == c) {
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//            if (i < 0) return j;
//          }
//          // c < 256 for a Latin1 string, so no need for a branch
//          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) always true. Remove branch
//          j += bc[y[j+m-1]];
//          #endif
//          #ifdef PATTERN_STRING_IS_UTF
//          // UU case: need if (c < ASIZE) check. Skip 1 character if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += 1;
//          #endif
//          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
//          // UL case: need if (c < ASIZE) check. Skip <pattern length> if not.
//          if (c < ASIZE)
//            j += bc[y[j+m-1]];
//          else
//            j += m;
//          #endif
4484 //       }
4485 //    }
4486 
4487   if (icnt1 == -1) {
4488     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4489         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4490     Register cnt1end = tmp2;
4491     Register str2end = cnt2;
4492     Register skipch = tmp2;
4493 
    // str1 length is >= 8, so we can read at least 1 register for the cases
    // when no UTF->Latin1 conversion is needed (8 chars for LL, 4 for UU),
    // and half a register for the UL case. We'll re-read the last character
    // in the inner pre-loop code to keep a single load in the outer pre-loop.
4498     const int firstStep = isL ? 7 : 3;
4499 
4500     const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
4502     sub(sp, sp, ASIZE);
4503     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4504     mov(ch1, sp);
4505     BIND(BM_INIT_LOOP);
4506       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4507       subs(tmp5, tmp5, 1);
4508       br(GT, BM_INIT_LOOP);
4509 
4510       sub(cnt1tmp, cnt1, 1);
4511       mov(tmp5, str2);
4512       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4513       sub(ch2, cnt1, 1);
4514       mov(tmp3, str1);
4515     BIND(BCLOOP);
4516       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4517       if (!str1_isL) {
4518         subs(zr, ch1, ASIZE);
4519         br(HS, BCSKIP);
4520       }
4521       strb(ch2, Address(sp, ch1));
4522     BIND(BCSKIP);
4523       subs(ch2, ch2, 1);
4524       br(GT, BCLOOP);
4525 
4526       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4527       if (str1_isL == str2_isL) {
4528         // load last 8 bytes (8LL/4UU symbols)
4529         ldr(tmp6, Address(tmp6, -wordSize));
4530       } else {
4531         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
        // convert Latin1 to UTF. We'll have to wait until the load
        // completes, but it's still faster than per-character loads + checks
4534         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4535         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4536         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4537         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4538         orr(ch2, ch1, ch2, LSL, 16);
4539         orr(tmp6, tmp6, tmp3, LSL, 48);
4540         orr(tmp6, tmp6, ch2, LSL, 16);
4541       }
4542     BIND(BMLOOPSTR2);
4543       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4544       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4545       if (str1_isL == str2_isL) {
        // re-init tmp3. It's free because it executes in parallel with the
        // load above. The alternative is to initialize it before the loop,
        // but that would hurt performance on in-order systems with 2 or
        // more ld/st pipelines
4549         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4550       }
4551       if (!isL) { // UU/UL case
4552         lsl(ch2, cnt1tmp, 1); // offset in bytes
4553       }
4554       cmp(tmp3, skipch);
4555       br(NE, BMSKIP);
4556       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4557       mov(ch1, tmp6);
4558       if (isL) {
4559         b(BMLOOPSTR1_AFTER_LOAD);
4560       } else {
4561         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4562         b(BMLOOPSTR1_CMP);
4563       }
4564     BIND(BMLOOPSTR1);
4565       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4566       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4567     BIND(BMLOOPSTR1_AFTER_LOAD);
4568       subs(cnt1tmp, cnt1tmp, 1);
4569       br(LT, BMLOOPSTR1_LASTCMP);
4570     BIND(BMLOOPSTR1_CMP);
4571       cmp(ch1, ch2);
4572       br(EQ, BMLOOPSTR1);
4573     BIND(BMSKIP);
4574       if (!isL) {
        // if we've met a UTF symbol while searching with a Latin1 pattern,
        // then we can skip cnt1 symbols
4577         if (str1_isL != str2_isL) {
4578           mov(result_tmp, cnt1);
4579         } else {
4580           mov(result_tmp, 1);
4581         }
4582         subs(zr, skipch, ASIZE);
4583         br(HS, BMADV);
4584       }
4585       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4586     BIND(BMADV);
4587       sub(cnt1tmp, cnt1, 1);
4588       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4589       cmp(str2, str2end);
4590       br(LE, BMLOOPSTR2);
4591       add(sp, sp, ASIZE);
4592       b(NOMATCH);
4593     BIND(BMLOOPSTR1_LASTCMP);
4594       cmp(ch1, ch2);
4595       br(NE, BMSKIP);
4596     BIND(BMMATCH);
4597       sub(result, str2, tmp5);
4598       if (!str2_isL) lsr(result, result, 1);
4599       add(sp, sp, ASIZE);
4600       b(DONE);
4601 
4602     BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
4604     br(LT, LINEAR_MEDIUM);
4605     mov(result, zr);
4606     RuntimeAddress stub = NULL;
4607     if (isL) {
4608       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4609       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4610     } else if (str1_isL) {
4611       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4613     } else {
4614       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4615       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4616     }
4617     trampoline_call(stub);
4618     b(DONE);
4619   }
4620 
4621   BIND(LINEARSEARCH);
4622   {
4623     Label DO1, DO2, DO3;
4624 
4625     Register str2tmp = tmp2;
4626     Register first = tmp3;
4627 
4628     if (icnt1 == -1)
4629     {
4630         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4631 
4632         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
4633         br(LT, DOSHORT);
4634       BIND(LINEAR_MEDIUM);
4635         (this->*str1_load_1chr)(first, Address(str1));
4636         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4637         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4638         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4639         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4640 
4641       BIND(FIRST_LOOP);
4642         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4643         cmp(first, ch2);
4644         br(EQ, STR1_LOOP);
4645       BIND(STR2_NEXT);
4646         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4647         br(LE, FIRST_LOOP);
4648         b(NOMATCH);
4649 
4650       BIND(STR1_LOOP);
4651         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4652         add(cnt2tmp, cnt2_neg, str2_chr_size);
4653         br(GE, MATCH);
4654 
4655       BIND(STR1_NEXT);
4656         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4657         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4658         cmp(ch1, ch2);
4659         br(NE, STR2_NEXT);
4660         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4661         add(cnt2tmp, cnt2tmp, str2_chr_size);
4662         br(LT, STR1_NEXT);
4663         b(MATCH);
4664 
4665       BIND(DOSHORT);
4666       if (str1_isL == str2_isL) {
4667         cmp(cnt1, (u1)2);
4668         br(LT, DO1);
4669         br(GT, DO3);
4670       }
4671     }
4672 
4673     if (icnt1 == 4) {
4674       Label CH1_LOOP;
4675 
4676         (this->*load_4chr)(ch1, str1);
4677         sub(result_tmp, cnt2, 4);
4678         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4679         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4680 
4681       BIND(CH1_LOOP);
4682         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4683         cmp(ch1, ch2);
4684         br(EQ, MATCH);
4685         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4686         br(LE, CH1_LOOP);
4687         b(NOMATCH);
4688       }
4689 
4690     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4691       Label CH1_LOOP;
4692 
4693       BIND(DO2);
4694         (this->*load_2chr)(ch1, str1);
4695         if (icnt1 == 2) {
4696           sub(result_tmp, cnt2, 2);
4697         }
4698         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4699         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4700       BIND(CH1_LOOP);
4701         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4702         cmp(ch1, ch2);
4703         br(EQ, MATCH);
4704         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4705         br(LE, CH1_LOOP);
4706         b(NOMATCH);
4707     }
4708 
4709     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4710       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4711 
4712       BIND(DO3);
4713         (this->*load_2chr)(first, str1);
4714         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4715         if (icnt1 == 3) {
4716           sub(result_tmp, cnt2, 3);
4717         }
4718         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4719         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4720       BIND(FIRST_LOOP);
4721         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4722         cmpw(first, ch2);
4723         br(EQ, STR1_LOOP);
4724       BIND(STR2_NEXT);
4725         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4726         br(LE, FIRST_LOOP);
4727         b(NOMATCH);
4728 
4729       BIND(STR1_LOOP);
4730         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4731         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4732         cmp(ch1, ch2);
4733         br(NE, STR2_NEXT);
4734         b(MATCH);
4735     }
4736 
4737     if (icnt1 == -1 || icnt1 == 1) {
4738       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4739 
4740       BIND(DO1);
4741         (this->*str1_load_1chr)(ch1, str1);
4742         cmp(cnt2, (u1)8);
4743         br(LT, DO1_SHORT);
4744 
4745         sub(result_tmp, cnt2, 8/str2_chr_size);
4746         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4747         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4748         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4749 
4750         if (str2_isL) {
4751           orr(ch1, ch1, ch1, LSL, 8);
4752         }
4753         orr(ch1, ch1, ch1, LSL, 16);
4754         orr(ch1, ch1, ch1, LSL, 32);
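      // The loop below uses the classic SWAR zero-detect trick: with
      // v = loaded ^ replicated-char, (v - 0x01..01) & ~(v | 0x7f..7f)
      // is nonzero iff some byte (halfword for UTF) lane of v is zero,
      // i.e. iff the char occurs in this word; HAS_ZERO then uses
      // rev + clz to locate the first occurrence.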
4755       BIND(CH1_LOOP);
4756         ldr(ch2, Address(str2, cnt2_neg));
4757         eor(ch2, ch1, ch2);
4758         sub(tmp1, ch2, tmp3);
4759         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4760         bics(tmp1, tmp1, tmp2);
4761         br(NE, HAS_ZERO);
4762         adds(cnt2_neg, cnt2_neg, 8);
4763         br(LT, CH1_LOOP);
4764 
4765         cmp(cnt2_neg, (u1)8);
4766         mov(cnt2_neg, 0);
4767         br(LT, CH1_LOOP);
4768         b(NOMATCH);
4769 
4770       BIND(HAS_ZERO);
4771         rev(tmp1, tmp1);
4772         clz(tmp1, tmp1);
4773         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4774         b(MATCH);
4775 
4776       BIND(DO1_SHORT);
4777         mov(result_tmp, cnt2);
4778         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4779         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4780       BIND(DO1_LOOP);
4781         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4782         cmpw(ch1, ch2);
4783         br(EQ, MATCH);
4784         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4785         br(LT, DO1_LOOP);
4786     }
4787   }
4788   BIND(NOMATCH);
4789     mov(result, -1);
4790     b(DONE);
4791   BIND(MATCH);
4792     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4793   BIND(DONE);
4794 }
4795 
4796 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4797 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4798 
4799 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4800                                          Register ch, Register result,
4801                                          Register tmp1, Register tmp2, Register tmp3)
4802 {
4803   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4804   Register cnt1_neg = cnt1;
4805   Register ch1 = rscratch1;
4806   Register result_tmp = rscratch2;
4807 
4808   cmp(cnt1, (u1)4);
4809   br(LT, DO1_SHORT);
4810 
4811   orr(ch, ch, ch, LSL, 16);
4812   orr(ch, ch, ch, LSL, 32);
4813 
4814   sub(cnt1, cnt1, 4);
4815   mov(result_tmp, cnt1);
4816   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4817   sub(cnt1_neg, zr, cnt1, LSL, 1);
4818 
4819   mov(tmp3, 0x0001000100010001);
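  // Same SWAR zero-halfword trick as in string_indexof's DO1 loop: a zero
  // halfword in (ch ^ loaded word) marks an occurrence of the char.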
4820 
4821   BIND(CH1_LOOP);
4822     ldr(ch1, Address(str1, cnt1_neg));
4823     eor(ch1, ch, ch1);
4824     sub(tmp1, ch1, tmp3);
4825     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4826     bics(tmp1, tmp1, tmp2);
4827     br(NE, HAS_ZERO);
4828     adds(cnt1_neg, cnt1_neg, 8);
4829     br(LT, CH1_LOOP);
4830 
4831     cmp(cnt1_neg, (u1)8);
4832     mov(cnt1_neg, 0);
4833     br(LT, CH1_LOOP);
4834     b(NOMATCH);
4835 
4836   BIND(HAS_ZERO);
4837     rev(tmp1, tmp1);
4838     clz(tmp1, tmp1);
4839     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4840     b(MATCH);
4841 
4842   BIND(DO1_SHORT);
4843     mov(result_tmp, cnt1);
4844     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4845     sub(cnt1_neg, zr, cnt1, LSL, 1);
4846   BIND(DO1_LOOP);
4847     ldrh(ch1, Address(str1, cnt1_neg));
4848     cmpw(ch, ch1);
4849     br(EQ, MATCH);
4850     adds(cnt1_neg, cnt1_neg, 2);
4851     br(LT, DO1_LOOP);
4852   BIND(NOMATCH);
4853     mov(result, -1);
4854     b(DONE);
4855   BIND(MATCH);
4856     add(result, result_tmp, cnt1_neg, ASR, 1);
4857   BIND(DONE);
4858 }
4859 
4860 // Compare strings.
4861 void MacroAssembler::string_compare(Register str1, Register str2,
4862     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
4863     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
4864   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
4865       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
4866       SHORT_LOOP_START, TAIL_CHECK;
4867 
4868   const u1 STUB_THRESHOLD = 64 + 8;
4869   bool isLL = ae == StrIntrinsicNode::LL;
4870   bool isLU = ae == StrIntrinsicNode::LU;
4871   bool isUL = ae == StrIntrinsicNode::UL;
4872 
4873   bool str1_isL = isLL || isLU;
4874   bool str2_isL = isLL || isUL;
4875 
4876   int str1_chr_shift = str1_isL ? 0 : 1;
4877   int str2_chr_shift = str2_isL ? 0 : 1;
4878   int str1_chr_size = str1_isL ? 1 : 2;
4879   int str2_chr_size = str2_isL ? 1 : 2;
4880   int minCharsInWord = isLL ? wordSize : wordSize/2;
4881 
4882   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
4883   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4884                                       (chr_insn)&MacroAssembler::ldrh;
4885   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4886                                       (chr_insn)&MacroAssembler::ldrh;
4887   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
4888                             (uxt_insn)&MacroAssembler::uxthw;
4889 
4890   BLOCK_COMMENT("string_compare {");
4891 
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
4894   if (!str1_isL) asrw(cnt1, cnt1, 1);
4895   if (!str2_isL) asrw(cnt2, cnt2, 1);
4896 
4897   // Compute the minimum of the string lengths and save the difference.
4898   subsw(result, cnt1, cnt2);
4899   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
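  // If the strings match up to the shorter length, the saved difference
  // cnt1 - cnt2 becomes the result, matching String.compareTo (e.g.
  // "ab" vs "abc" gives -1); otherwise DIFFERENCE overwrites it with the
  // difference of the first non-matching characters.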
4900 
4901   // A very short string
4902   cmpw(cnt2, minCharsInWord);
4903   br(Assembler::LT, SHORT_STRING);
4904 
4905   // Compare longwords
4906   // load first parts of strings and finish initialization while loading
4907   {
4908     if (str1_isL == str2_isL) { // LL or UU
4909       ldr(tmp1, Address(str1));
4910       cmp(str1, str2);
4911       br(Assembler::EQ, DONE);
4912       ldr(tmp2, Address(str2));
4913       cmp(cnt2, STUB_THRESHOLD);
4914       br(GE, STUB);
4915       subsw(cnt2, cnt2, minCharsInWord);
4916       br(EQ, TAIL_CHECK);
4917       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4918       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4919       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4920     } else if (isLU) {
4921       ldrs(vtmp, Address(str1));
4922       cmp(str1, str2);
4923       br(Assembler::EQ, DONE);
4924       ldr(tmp2, Address(str2));
4925       cmp(cnt2, STUB_THRESHOLD);
4926       br(GE, STUB);
4927       subsw(cnt2, cnt2, 4);
4928       br(EQ, TAIL_CHECK);
4929       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4930       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4931       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4932       zip1(vtmp, T8B, vtmp, vtmpZ);
4933       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4934       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4935       add(cnt1, cnt1, 4);
4936       fmovd(tmp1, vtmp);
4937     } else { // UL case
4938       ldr(tmp1, Address(str1));
4939       cmp(str1, str2);
4940       br(Assembler::EQ, DONE);
4941       ldrs(vtmp, Address(str2));
4942       cmp(cnt2, STUB_THRESHOLD);
4943       br(GE, STUB);
4944       subsw(cnt2, cnt2, 4);
4945       br(EQ, TAIL_CHECK);
4946       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
4947       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
4948       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
4949       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
4950       zip1(vtmp, T8B, vtmp, vtmpZ);
4951       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
4952       add(cnt1, cnt1, 8);
4953       fmovd(tmp2, vtmp);
4954     }
4955     adds(cnt2, cnt2, isUL ? 4 : 8);
4956     br(GE, TAIL);
4957     eor(rscratch2, tmp1, tmp2);
4958     cbnz(rscratch2, DIFFERENCE);
4959     // main loop
4960     bind(NEXT_WORD);
4961     if (str1_isL == str2_isL) {
4962       ldr(tmp1, Address(str1, cnt2));
4963       ldr(tmp2, Address(str2, cnt2));
4964       adds(cnt2, cnt2, 8);
4965     } else if (isLU) {
4966       ldrs(vtmp, Address(str1, cnt1));
4967       ldr(tmp2, Address(str2, cnt2));
4968       add(cnt1, cnt1, 4);
4969       zip1(vtmp, T8B, vtmp, vtmpZ);
4970       fmovd(tmp1, vtmp);
4971       adds(cnt2, cnt2, 8);
4972     } else { // UL
4973       ldrs(vtmp, Address(str2, cnt2));
4974       ldr(tmp1, Address(str1, cnt1));
4975       zip1(vtmp, T8B, vtmp, vtmpZ);
4976       add(cnt1, cnt1, 8);
4977       fmovd(tmp2, vtmp);
4978       adds(cnt2, cnt2, 4);
4979     }
4980     br(GE, TAIL);
4981 
4982     eor(rscratch2, tmp1, tmp2);
4983     cbz(rscratch2, NEXT_WORD);
4984     b(DIFFERENCE);
4985     bind(TAIL);
4986     eor(rscratch2, tmp1, tmp2);
4987     cbnz(rscratch2, DIFFERENCE);
4988     // Last longword.  In the case where length == 4 we compare the
4989     // same longword twice, but that's still faster than another
4990     // conditional branch.
4991     if (str1_isL == str2_isL) {
4992       ldr(tmp1, Address(str1));
4993       ldr(tmp2, Address(str2));
4994     } else if (isLU) {
4995       ldrs(vtmp, Address(str1));
4996       ldr(tmp2, Address(str2));
4997       zip1(vtmp, T8B, vtmp, vtmpZ);
4998       fmovd(tmp1, vtmp);
4999     } else { // UL
5000       ldrs(vtmp, Address(str2));
5001       ldr(tmp1, Address(str1));
5002       zip1(vtmp, T8B, vtmp, vtmpZ);
5003       fmovd(tmp2, vtmp);
5004     }
5005     bind(TAIL_CHECK);
5006     eor(rscratch2, tmp1, tmp2);
5007     cbz(rscratch2, DONE);
5008 
5009     // Find the first different characters in the longwords and
5010     // compute their difference.
5011     bind(DIFFERENCE);
5012     rev(rscratch2, rscratch2);
5013     clz(rscratch2, rscratch2);
5014     andr(rscratch2, rscratch2, isLL ? -8 : -16);
5015     lsrv(tmp1, tmp1, rscratch2);
5016     (this->*ext_chr)(tmp1, tmp1);
5017     lsrv(tmp2, tmp2, rscratch2);
5018     (this->*ext_chr)(tmp2, tmp2);
5019     subw(result, tmp1, tmp2);
5020     b(DONE);
5021   }
5022 
5023   bind(STUB);
5024     RuntimeAddress stub = NULL;
5025     switch(ae) {
5026       case StrIntrinsicNode::LL:
5027         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
5028         break;
5029       case StrIntrinsicNode::UU:
5030         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
5031         break;
5032       case StrIntrinsicNode::LU:
5033         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
5034         break;
5035       case StrIntrinsicNode::UL:
5036         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
5037         break;
5038       default:
5039         ShouldNotReachHere();
5040      }
5041     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
5042     trampoline_call(stub);
5043     b(DONE);
5044 
5045   bind(SHORT_STRING);
5046   // Is the minimum length zero?
5047   cbz(cnt2, DONE);
  // arrange the code so that most branches happen while loading, and the
  // next characters load while the previous ones are being compared
5050   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5051   subs(cnt2, cnt2, 1);
5052   br(EQ, SHORT_LAST_INIT);
5053   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5054   b(SHORT_LOOP_START);
5055   bind(SHORT_LOOP);
5056   subs(cnt2, cnt2, 1);
5057   br(EQ, SHORT_LAST);
5058   bind(SHORT_LOOP_START);
5059   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
5060   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
5061   cmp(tmp1, cnt1);
5062   br(NE, SHORT_LOOP_TAIL);
5063   subs(cnt2, cnt2, 1);
5064   br(EQ, SHORT_LAST2);
5065   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5066   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5067   cmp(tmp2, rscratch1);
5068   br(EQ, SHORT_LOOP);
5069   sub(result, tmp2, rscratch1);
5070   b(DONE);
5071   bind(SHORT_LOOP_TAIL);
5072   sub(result, tmp1, cnt1);
5073   b(DONE);
5074   bind(SHORT_LAST2);
5075   cmp(tmp2, rscratch1);
5076   br(EQ, DONE);
5077   sub(result, tmp2, rscratch1);
5078 
5079   b(DONE);
5080   bind(SHORT_LAST_INIT);
5081   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5082   bind(SHORT_LAST);
5083   cmp(tmp1, cnt1);
5084   br(EQ, DONE);
5085   sub(result, tmp1, cnt1);
5086 
5087   bind(DONE);
5088 
5089   BLOCK_COMMENT("} string_compare");
5090 }
5091 #endif // COMPILER2
5092 
// This method checks whether the provided byte array contains a byte with the highest bit set.
5094 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
    // The simple and most common case, a small aligned array that is not
    // at the end of a memory page, is handled inline here. All other cases
    // are handled in stubs.
5097     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
5098     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5099     assert_different_registers(ary1, len, result);
5100 
5101     cmpw(len, 0);
5102     br(LE, SET_RESULT);
5103     cmpw(len, 4 * wordSize);
5104     br(GE, STUB_LONG); // size > 32 then go to stub
5105 
5106     int shift = 64 - exact_log2(os::vm_page_size());
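    // Page-crossing check: shift the address so that its offset within
    // the page occupies the top bits, then add 32 scaled the same way.
    // The addition carries out (CS) exactly when offset-in-page + 32
    // would pass the end of the page, i.e. when reading 32 bytes from
    // ary1 could run onto the next page.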
5107     lsl(rscratch1, ary1, shift);
5108     mov(rscratch2, (size_t)(4 * wordSize) << shift);
5109     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
5110     br(CS, STUB); // at the end of page then go to stub
5111     subs(len, len, wordSize);
5112     br(LT, END);
5113 
5114   BIND(LOOP);
5115     ldr(rscratch1, Address(post(ary1, wordSize)));
5116     tst(rscratch1, UPPER_BIT_MASK);
5117     br(NE, SET_RESULT);
5118     subs(len, len, wordSize);
5119     br(GE, LOOP);
5120     cmpw(len, -wordSize);
5121     br(EQ, SET_RESULT);
5122 
5123   BIND(END);
5124     ldr(result, Address(ary1));
5125     sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5126     lslv(result, result, len);
5127     tst(result, UPPER_BIT_MASK);
5128     b(SET_RESULT);
5129 
5130   BIND(STUB);
5131     RuntimeAddress has_neg =  RuntimeAddress(StubRoutines::aarch64::has_negatives());
5132     assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5133     trampoline_call(has_neg);
5134     b(DONE);
5135 
5136   BIND(STUB_LONG);
5137     RuntimeAddress has_neg_long =  RuntimeAddress(
5138             StubRoutines::aarch64::has_negatives_long());
5139     assert(has_neg_long.target() != NULL, "has_negatives stub has not been generated");
5140     trampoline_call(has_neg_long);
5141     b(DONE);
5142 
5143   BIND(SET_RESULT);
5144     cset(result, NE); // set true or false
5145 
5146   BIND(DONE);
5147 }
5148 
5149 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5150                                    Register tmp4, Register tmp5, Register result,
5151                                    Register cnt1, int elem_size) {
5152   Label DONE, SAME;
5153   Register tmp1 = rscratch1;
5154   Register tmp2 = rscratch2;
5155   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5156   int elem_per_word = wordSize/elem_size;
5157   int log_elem_size = exact_log2(elem_size);
5158   int length_offset = arrayOopDesc::length_offset_in_bytes();
5159   int base_offset
5160     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5161   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5162 
5163   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5164   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5165 
5166 #ifndef PRODUCT
5167   {
5168     const char kind = (elem_size == 2) ? 'U' : 'L';
5169     char comment[64];
5170     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5171     BLOCK_COMMENT(comment);
5172   }
5173 #endif
5174 
5175   // if (a1 == a2)
5176   //     return true;
5177   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5178   br(EQ, SAME);
5179 
5180   if (UseSimpleArrayEquals) {
5181     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5182     // if (a1 == null || a2 == null)
5183     //     return false;
    // (a1 & a2) == 0 means that at least one pointer is null (or, very
    // rarely, that two non-null pointers share no set bits), so we can
    // save one branch in most cases
5187     tst(a1, a2);
5188     mov(result, false);
5189     br(EQ, A_MIGHT_BE_NULL);
5190     // if (a1.length != a2.length)
5191     //      return false;
5192     bind(A_IS_NOT_NULL);
5193     ldrw(cnt1, Address(a1, length_offset));
5194     ldrw(cnt2, Address(a2, length_offset));
5195     eorw(tmp5, cnt1, cnt2);
5196     cbnzw(tmp5, DONE);
5197     lea(a1, Address(a1, base_offset));
5198     lea(a2, Address(a2, base_offset));
5199     // Check for short strings, i.e. smaller than wordSize.
5200     subs(cnt1, cnt1, elem_per_word);
5201     br(Assembler::LT, SHORT);
5202     // Main 8 byte comparison loop.
5203     bind(NEXT_WORD); {
5204       ldr(tmp1, Address(post(a1, wordSize)));
5205       ldr(tmp2, Address(post(a2, wordSize)));
5206       subs(cnt1, cnt1, elem_per_word);
5207       eor(tmp5, tmp1, tmp2);
5208       cbnz(tmp5, DONE);
5209     } br(GT, NEXT_WORD);
5210     // Last longword.  In the case where length == 4 we compare the
5211     // same longword twice, but that's still faster than another
5212     // conditional branch.
5213     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5214     // length == 4.
5215     if (log_elem_size > 0)
5216       lsl(cnt1, cnt1, log_elem_size);
5217     ldr(tmp3, Address(a1, cnt1));
5218     ldr(tmp4, Address(a2, cnt1));
5219     eor(tmp5, tmp3, tmp4);
5220     cbnz(tmp5, DONE);
5221     b(SAME);
5222     bind(A_MIGHT_BE_NULL);
    // if both a1 and a2 are non-null, go back and proceed with the loads
5224     cbz(a1, DONE);
5225     cbz(a2, DONE);
5226     b(A_IS_NOT_NULL);
5227     bind(SHORT);
5228 
5229     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5230     {
5231       ldrw(tmp1, Address(post(a1, 4)));
5232       ldrw(tmp2, Address(post(a2, 4)));
5233       eorw(tmp5, tmp1, tmp2);
5234       cbnzw(tmp5, DONE);
5235     }
5236     bind(TAIL03);
5237     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5238     {
5239       ldrh(tmp3, Address(post(a1, 2)));
5240       ldrh(tmp4, Address(post(a2, 2)));
5241       eorw(tmp5, tmp3, tmp4);
5242       cbnzw(tmp5, DONE);
5243     }
5244     bind(TAIL01);
5245     if (elem_size == 1) { // Only needed when comparing byte arrays.
5246       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5247       {
5248         ldrb(tmp1, a1);
5249         ldrb(tmp2, a2);
5250         eorw(tmp5, tmp1, tmp2);
5251         cbnzw(tmp5, DONE);
5252       }
5253     }
5254   } else {
5255     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
5256         CSET_EQ, LAST_CHECK;
5257     mov(result, false);
5258     cbz(a1, DONE);
5259     ldrw(cnt1, Address(a1, length_offset));
5260     cbz(a2, DONE);
5261     ldrw(cnt2, Address(a2, length_offset));
    // on most CPUs a2 is (surprisingly) still "locked" by the ldrw above,
    // so it's faster to take another branch before comparing a1 and a2
5264     cmp(cnt1, (u1)elem_per_word);
5265     br(LE, SHORT); // short or same
5266     ldr(tmp3, Address(pre(a1, base_offset)));
5267     subs(zr, cnt1, stubBytesThreshold);
5268     br(GE, STUB);
5269     ldr(tmp4, Address(pre(a2, base_offset)));
5270     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5271     cmp(cnt2, cnt1);
5272     br(NE, DONE);
5273 
5274     // Main 16 byte comparison loop with 2 exits
5275     bind(NEXT_DWORD); {
5276       ldr(tmp1, Address(pre(a1, wordSize)));
5277       ldr(tmp2, Address(pre(a2, wordSize)));
5278       subs(cnt1, cnt1, 2 * elem_per_word);
5279       br(LE, TAIL);
5280       eor(tmp4, tmp3, tmp4);
5281       cbnz(tmp4, DONE);
5282       ldr(tmp3, Address(pre(a1, wordSize)));
5283       ldr(tmp4, Address(pre(a2, wordSize)));
5284       cmp(cnt1, (u1)elem_per_word);
5285       br(LE, TAIL2);
5286       cmp(tmp1, tmp2);
5287     } br(EQ, NEXT_DWORD);
5288     b(DONE);
5289 
5290     bind(TAIL);
5291     eor(tmp4, tmp3, tmp4);
5292     eor(tmp2, tmp1, tmp2);
5293     lslv(tmp2, tmp2, tmp5);
5294     orr(tmp5, tmp4, tmp2);
5295     cmp(tmp5, zr);
5296     b(CSET_EQ);
5297 
5298     bind(TAIL2);
5299     eor(tmp2, tmp1, tmp2);
5300     cbnz(tmp2, DONE);
5301     b(LAST_CHECK);
5302 
5303     bind(STUB);
5304     ldr(tmp4, Address(pre(a2, base_offset)));
5305     cmp(cnt2, cnt1);
5306     br(NE, DONE);
5307     if (elem_size == 2) { // convert to byte counter
5308       lsl(cnt1, cnt1, 1);
5309     }
5310     eor(tmp5, tmp3, tmp4);
5311     cbnz(tmp5, DONE);
5312     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5313     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5314     trampoline_call(stub);
5315     b(DONE);
5316 
5317     bind(EARLY_OUT);
    // here (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2),
    // so if a2 == null we must return false (0), else true; either way we
    // can simply return a2
5320     mov(result, a2);
5321     b(DONE);
5322     bind(SHORT);
5323     cmp(cnt2, cnt1);
5324     br(NE, DONE);
5325     cbz(cnt1, SAME);
5326     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5327     ldr(tmp3, Address(a1, base_offset));
5328     ldr(tmp4, Address(a2, base_offset));
5329     bind(LAST_CHECK);
5330     eor(tmp4, tmp3, tmp4);
5331     lslv(tmp5, tmp4, tmp5);
5332     cmp(tmp5, zr);
5333     bind(CSET_EQ);
5334     cset(result, EQ);
5335     b(DONE);
5336   }
5337 
5338   bind(SAME);
5339   mov(result, true);
5340   // That's it.
5341   bind(DONE);
5342 
5343   BLOCK_COMMENT("} array_equals");
5344 }
5345 
5346 // Compare Strings
5347 
// For Strings we're passed the address of the first characters in a1
// and a2 and the length in bytes in cnt1.
// elem_size is the element size in bytes: either 1 or 2.
// There are two implementations.  For arrays >= 8 bytes, all
// comparisons (including the final one, which may overlap) are
// performed 8 bytes at a time.  For strings < 8 bytes, we compare a
// word, then a halfword, and then a byte.
5355 
5356 void MacroAssembler::string_equals(Register a1, Register a2,
5357                                    Register result, Register cnt1, int elem_size)
5358 {
5359   Label SAME, DONE, SHORT, NEXT_WORD;
5360   Register tmp1 = rscratch1;
5361   Register tmp2 = rscratch2;
5362   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5363 
5364   assert(elem_size == 1 || elem_size == 2, "must be 2 or 1 byte");
5365   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5366 
5367 #ifndef PRODUCT
5368   {
5369     const char kind = (elem_size == 2) ? 'U' : 'L';
5370     char comment[64];
5371     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5372     BLOCK_COMMENT(comment);
5373   }
5374 #endif
5375 
5376   mov(result, false);
5377 
5378   // Check for short strings, i.e. smaller than wordSize.
5379   subs(cnt1, cnt1, wordSize);
5380   br(Assembler::LT, SHORT);
5381   // Main 8 byte comparison loop.
5382   bind(NEXT_WORD); {
5383     ldr(tmp1, Address(post(a1, wordSize)));
5384     ldr(tmp2, Address(post(a2, wordSize)));
5385     subs(cnt1, cnt1, wordSize);
5386     eor(tmp1, tmp1, tmp2);
5387     cbnz(tmp1, DONE);
5388   } br(GT, NEXT_WORD);
5389   // Last longword.  In the case where length == 4 we compare the
5390   // same longword twice, but that's still faster than another
5391   // conditional branch.
5392   // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5393   // length == 4.
5394   ldr(tmp1, Address(a1, cnt1));
5395   ldr(tmp2, Address(a2, cnt1));
5396   eor(tmp2, tmp1, tmp2);
5397   cbnz(tmp2, DONE);
5398   b(SAME);
5399 
5400   bind(SHORT);
5401   Label TAIL03, TAIL01;
5402 
5403   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5404   {
5405     ldrw(tmp1, Address(post(a1, 4)));
5406     ldrw(tmp2, Address(post(a2, 4)));
5407     eorw(tmp1, tmp1, tmp2);
5408     cbnzw(tmp1, DONE);
5409   }
5410   bind(TAIL03);
5411   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5412   {
5413     ldrh(tmp1, Address(post(a1, 2)));
5414     ldrh(tmp2, Address(post(a2, 2)));
5415     eorw(tmp1, tmp1, tmp2);
5416     cbnzw(tmp1, DONE);
5417   }
5418   bind(TAIL01);
5419   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5420     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5421     {
5422       ldrb(tmp1, a1);
5423       ldrb(tmp2, a2);
5424       eorw(tmp1, tmp1, tmp2);
5425       cbnzw(tmp1, DONE);
5426     }
5427   }
5428   // Arrays are equal.
5429   bind(SAME);
5430   mov(result, true);
5431 
5432   // That's it.
5433   bind(DONE);
5434   BLOCK_COMMENT("} string_equals");
5435 }
5436 
5437 
5438 // The size of the blocks erased by the zero_blocks stub.  We must
5439 // handle anything smaller than this ourselves in zero_words().
5440 const int MacroAssembler::zero_words_block_size = 8;
5441 
5442 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5443 // possible, handling small word counts locally and delegating
5444 // anything larger to the zero_blocks stub.  It is expanded many times
5445 // in compiled code, so it is important to keep it short.
5446 
5447 // ptr:   Address of a buffer to be zeroed.
5448 // cnt:   Count in HeapWords.
5449 //
5450 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5451 void MacroAssembler::zero_words(Register ptr, Register cnt)
5452 {
5453   assert(is_power_of_2(zero_words_block_size), "adjust this");
5454   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5455 
5456   BLOCK_COMMENT("zero_words {");
5457   cmp(cnt, (u1)zero_words_block_size);
5458   Label around;
5459   br(LO, around);
5460   {
5461     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5462     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
5463     if (StubRoutines::aarch64::complete()) {
5464       trampoline_call(zero_blocks);
5465     } else {
5466       bl(zero_blocks);
5467     }
5468   }
5469   bind(around);
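  // Store the remaining (cnt & 7) words by binary decomposition of cnt:
  // bit 2 set -> 4 words, bit 1 -> 2 words, bit 0 -> 1 word (e.g.
  // cnt == 7 stores 4 + 2 + 1).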
5470   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5471     Label l;
5472     tbz(cnt, exact_log2(i), l);
5473     for (int j = 0; j < i; j += 2) {
5474       stp(zr, zr, post(ptr, 16));
5475     }
5476     bind(l);
5477   }
5478   {
5479     Label l;
5480     tbz(cnt, 0, l);
5481     str(zr, Address(ptr));
5482     bind(l);
5483   }
5484   BLOCK_COMMENT("} zero_words");
5485 }
5486 
5487 // base:         Address of a buffer to be zeroed, 8 bytes aligned.
5488 // cnt:          Immediate count in HeapWords.
5489 #define SmallArraySize (18 * BytesPerLong)
5490 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
5491 {
5492   BLOCK_COMMENT("zero_words {");
5493   int i = cnt & 1;  // store any odd word to start
5494   if (i) str(zr, Address(base));
5495 
5496   if (cnt <= SmallArraySize / BytesPerLong) {
5497     for (; i < (int)cnt; i += 2)
5498       stp(zr, zr, Address(base, i * wordSize));
5499   } else {
5500     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
5501     int remainder = cnt % (2 * unroll);
5502     for (; i < remainder; i += 2)
5503       stp(zr, zr, Address(base, i * wordSize));
5504 
5505     Label loop;
5506     Register cnt_reg = rscratch1;
5507     Register loop_base = rscratch2;
5508     cnt = cnt - remainder;
5509     mov(cnt_reg, cnt);
5510     // adjust base and prebias by -2 * wordSize so we can pre-increment
5511     add(loop_base, base, (remainder - 2) * wordSize);
5512     bind(loop);
5513     sub(cnt_reg, cnt_reg, 2 * unroll);
5514     for (i = 1; i < unroll; i++)
5515       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
5516     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
5517     cbnz(cnt_reg, loop);
5518   }
5519   BLOCK_COMMENT("} zero_words");
5520 }
5521 
5522 // Zero blocks of memory by using DC ZVA.
5523 //
// Aligns the base address first sufficiently for DC ZVA, then uses
5525 // DC ZVA repeatedly for every full block.  cnt is the size to be
5526 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5527 // in cnt.
5528 //
5529 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5530 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
5531 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5532   Register tmp = rscratch1;
5533   Register tmp2 = rscratch2;
5534   int zva_length = VM_Version::zva_length();
5535   Label initial_table_end, loop_zva;
5536   Label fini;
5537 
  // Base must be 16-byte aligned. If not, just return and let the caller handle it.
5539   tst(base, 0x0f);
5540   br(Assembler::NE, fini);
5541   // Align base with ZVA length.
5542   neg(tmp, base);
5543   andr(tmp, tmp, zva_length - 1);
5544 
5545   // tmp: the number of bytes to be filled to align the base with ZVA length.
5546   add(base, base, tmp);
5547   sub(cnt, cnt, tmp, Assembler::ASR, 3);
5548   adr(tmp2, initial_table_end);
5549   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5550   br(tmp2);
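  // Computed branch into the table of stp instructions below: each stp
  // zeroes 16 bytes in 4 bytes of code, so stepping back from
  // initial_table_end by (tmp / 16) * 4 == tmp >> 2 bytes executes just
  // enough stores to fill the tmp alignment bytes.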
5551 
5552   for (int i = -zva_length + 16; i < 0; i += 16)
5553     stp(zr, zr, Address(base, i));
5554   bind(initial_table_end);
5555 
5556   sub(cnt, cnt, zva_length >> 3);
5557   bind(loop_zva);
5558   dc(Assembler::ZVA, base);
5559   subs(cnt, cnt, zva_length >> 3);
5560   add(base, base, zva_length);
5561   br(Assembler::GE, loop_zva);
5562   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5563   bind(fini);
5564 }
5565 
5566 // base:   Address of a buffer to be filled, 8 bytes aligned.
5567 // cnt:    Count in 8-byte unit.
5568 // value:  Value to be filled with.
5569 // base will point to the end of the buffer after filling.
5570 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5571 {
5572 //  Algorithm:
5573 //
5574 //    scratch1 = cnt & 7;
5575 //    cnt -= scratch1;
5576 //    p += scratch1;
5577 //    switch (scratch1) {
5578 //      do {
5579 //        cnt -= 8;
5580 //          p[-8] = v;
5581 //        case 7:
5582 //          p[-7] = v;
5583 //        case 6:
5584 //          p[-6] = v;
5585 //          // ...
5586 //        case 1:
5587 //          p[-1] = v;
5588 //        case 0:
5589 //          p += 8;
5590 //      } while (cnt);
5591 //    }
5592 
5593   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5594 
5595   Label fini, skip, entry, loop;
5596   const int unroll = 8; // Number of stp instructions we'll unroll
5597 
5598   cbz(cnt, fini);
5599   tbz(base, 3, skip);
5600   str(value, Address(post(base, 8)));
5601   sub(cnt, cnt, 1);
5602   bind(skip);
5603 
5604   andr(rscratch1, cnt, (unroll-1) * 2);
5605   sub(cnt, cnt, rscratch1);
5606   add(base, base, rscratch1, Assembler::LSL, 3);
5607   adr(rscratch2, entry);
5608   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5609   br(rscratch2);
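  // Computed branch into the unrolled stp block below: each stp fills two
  // words in 4 bytes of code, so stepping back from entry by
  // rscratch1 * 2 bytes executes rscratch1 / 2 stores, exactly the
  // remainder words (at negative offsets from the already-advanced base).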
5610 
5611   bind(loop);
5612   add(base, base, unroll * 16);
5613   for (int i = -unroll; i < 0; i++)
5614     stp(value, value, Address(base, i * 16));
5615   bind(entry);
5616   subs(cnt, cnt, unroll * 2);
5617   br(Assembler::GE, loop);
5618 
5619   tbz(cnt, 0, fini);
5620   str(value, Address(post(base, 8)));
5621   bind(fini);
5622 }
5623 
5624 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5625 // java/lang/StringUTF16.compress.
5626 void MacroAssembler::encode_iso_array(Register src, Register dst,
5627                       Register len, Register result,
5628                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5629                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5630 {
5631     Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
5632         NEXT_32_START, NEXT_32_PRFM_START;
5633     Register tmp1 = rscratch1, tmp2 = rscratch2;
5634 
5635       mov(result, len); // Save initial len
5636 
5637 #ifndef BUILTIN_SIM
5638       cmp(len, (u1)8); // handle shortest strings first
5639       br(LT, LOOP_1);
5640       cmp(len, (u1)32);
5641       br(LT, NEXT_8);
5642       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
5643       // to convert chars to bytes
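      // uzp1 keeps the even-numbered bytes of the two concatenated source
      // vectors, i.e. the low (Latin-1) byte of each little-endian char;
      // uzp2 keeps the odd-numbered (high) bytes, which must all be zero
      // for the chars to be encodable.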
5644       if (SoftwarePrefetchHintDistance >= 0) {
5645         ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5646         subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5647         br(LE, NEXT_32_START);
5648         b(NEXT_32_PRFM_START);
5649         BIND(NEXT_32_PRFM);
5650           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5651         BIND(NEXT_32_PRFM_START);
5652           prfm(Address(src, SoftwarePrefetchHintDistance));
5653           orr(v4, T16B, Vtmp1, Vtmp2);
5654           orr(v5, T16B, Vtmp3, Vtmp4);
5655           uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
5656           uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
5657           stpq(Vtmp1, Vtmp3, dst);
5658           uzp2(v5, T16B, v4, v5); // high bytes
5659           umov(tmp2, v5, D, 1);
5660           fmovd(tmp1, v5);
5661           orr(tmp1, tmp1, tmp2);
5662           cbnz(tmp1, LOOP_8);
5663           sub(len, len, 32);
5664           add(dst, dst, 32);
5665           add(src, src, 64);
5666           subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5667           br(GE, NEXT_32_PRFM);
5668           cmp(len, (u1)32);
5669           br(LT, LOOP_8);
5670         BIND(NEXT_32);
5671           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5672         BIND(NEXT_32_START);
5673       } else {
5674         BIND(NEXT_32);
5675           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5676       }
5677       prfm(Address(src, SoftwarePrefetchHintDistance));
5678       uzp1(v4, T16B, Vtmp1, Vtmp2);
5679       uzp1(v5, T16B, Vtmp3, Vtmp4);
5680       stpq(v4, v5, dst);
5681       orr(Vtmp1, T16B, Vtmp1, Vtmp2);
5682       orr(Vtmp3, T16B, Vtmp3, Vtmp4);
5683       uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
5684       umov(tmp2, Vtmp1, D, 1);
5685       fmovd(tmp1, Vtmp1);
5686       orr(tmp1, tmp1, tmp2);
5687       cbnz(tmp1, LOOP_8);
5688       sub(len, len, 32);
5689       add(dst, dst, 32);
5690       add(src, src, 64);
5691       cmp(len, (u1)32);
5692       br(GE, NEXT_32);
5693       cbz(len, DONE);
5694 
5695     BIND(LOOP_8);
5696       cmp(len, (u1)8);
5697       br(LT, LOOP_1);
5698     BIND(NEXT_8);
5699       ld1(Vtmp1, T8H, src);
5700       uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
5701       uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
5702       strd(Vtmp2, dst);
5703       fmovd(tmp1, Vtmp3);
5704       cbnz(tmp1, NEXT_1);
5705 
5706       sub(len, len, 8);
5707       add(dst, dst, 8);
5708       add(src, src, 16);
5709       cmp(len, (u1)8);
5710       br(GE, NEXT_8);
5711 
5712     BIND(LOOP_1);
5713 #endif
5714     cbz(len, DONE);
5715     BIND(NEXT_1);
5716       ldrh(tmp1, Address(post(src, 2)));
5717       strb(tmp1, Address(post(dst, 1)));
5718       tst(tmp1, 0xff00);
5719       br(NE, SET_RESULT);
5720       subs(len, len, 1);
5721       br(GT, NEXT_1);
5722 
5723     BIND(SET_RESULT);
      sub(result, result, len); // Return the index where we stopped;
                                // len == 0 here iff we processed all
                                // characters
5727     BIND(DONE);
5728 }
5729 
5730 
5731 // Inflate byte[] array to char[].
5732 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
5733                                         FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
5734                                         Register tmp4) {
5735   Label big, done, after_init, to_stub;
5736 
5737   assert_different_registers(src, dst, len, tmp4, rscratch1);
5738 
5739   fmovd(vtmp1, zr);
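  // vtmp1 stays zero throughout: zip1 with it interleaves each source byte
  // with 0x00, widening Latin-1 bytes into little-endian UTF-16 chars.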
5740   lsrw(tmp4, len, 3);
5741   bind(after_init);
5742   cbnzw(tmp4, big);
5743   // Short string: less than 8 bytes.
5744   {
5745     Label loop, tiny;
5746 
5747     cmpw(len, 4);
5748     br(LT, tiny);
5749     // Use SIMD to do 4 bytes.
5750     ldrs(vtmp2, post(src, 4));
5751     zip1(vtmp3, T8B, vtmp2, vtmp1);
5752     subw(len, len, 4);
5753     strd(vtmp3, post(dst, 8));
5754 
5755     cbzw(len, done);
5756 
5757     // Do the remaining bytes by steam.
5758     bind(loop);
5759     ldrb(tmp4, post(src, 1));
5760     strh(tmp4, post(dst, 2));
5761     subw(len, len, 1);
5762 
5763     bind(tiny);
5764     cbnz(len, loop);
5765 
5766     b(done);
5767   }
5768 
5769   if (SoftwarePrefetchHintDistance >= 0) {
5770     bind(to_stub);
5771       RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
5772       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
5773       trampoline_call(stub);
5774       b(after_init);
5775   }
5776 
5777   // Unpack the bytes 8 at a time.
5778   bind(big);
5779   {
5780     Label loop, around, loop_last, loop_start;
5781 
5782     if (SoftwarePrefetchHintDistance >= 0) {
5783       const int large_loop_threshold = (64 + 16)/8;
5784       ldrd(vtmp2, post(src, 8));
5785       andw(len, len, 7);
5786       cmp(tmp4, (u1)large_loop_threshold);
5787       br(GE, to_stub);
5788       b(loop_start);
5789 
5790       bind(loop);
5791       ldrd(vtmp2, post(src, 8));
5792       bind(loop_start);
5793       subs(tmp4, tmp4, 1);
5794       br(EQ, loop_last);
5795       zip1(vtmp2, T16B, vtmp2, vtmp1);
5796       ldrd(vtmp3, post(src, 8));
5797       st1(vtmp2, T8H, post(dst, 16));
5798       subs(tmp4, tmp4, 1);
5799       zip1(vtmp3, T16B, vtmp3, vtmp1);
5800       st1(vtmp3, T8H, post(dst, 16));
5801       br(NE, loop);
5802       b(around);
5803       bind(loop_last);
5804       zip1(vtmp2, T16B, vtmp2, vtmp1);
5805       st1(vtmp2, T8H, post(dst, 16));
5806       bind(around);
5807       cbz(len, done);
5808     } else {
5809       andw(len, len, 7);
5810       bind(loop);
5811       ldrd(vtmp2, post(src, 8));
5812       sub(tmp4, tmp4, 1);
5813       zip1(vtmp3, T16B, vtmp2, vtmp1);
5814       st1(vtmp3, T8H, post(dst, 16));
5815       cbnz(tmp4, loop);
5816     }
5817   }
5818 
5819   // Do the tail of up to 8 bytes.
5820   add(src, src, len);
5821   ldrd(vtmp3, Address(src, -8));
5822   add(dst, dst, len, ext::uxtw, 1);
5823   zip1(vtmp3, T16B, vtmp3, vtmp1);
5824   strq(vtmp3, Address(dst, -16));
5825 
5826   bind(done);
5827 }
5828 
5829 // Compress char[] array to byte[].
5830 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
5831                                          FloatRegister tmp1Reg, FloatRegister tmp2Reg,
5832                                          FloatRegister tmp3Reg, FloatRegister tmp4Reg,
5833                                          Register result) {
5834   encode_iso_array(src, dst, len, result,
5835                    tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
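  // encode_iso_array leaves len == 0 iff every char fitted in one byte;
  // otherwise return 0 to signal that the compression failed.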
5836   cmp(len, zr);
5837   csel(result, result, zr, EQ);
5838 }
5839 
5840 // get_thread() can be called anywhere inside generated code so we
5841 // need to save whatever non-callee save context might get clobbered
5842 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
5843 // the call setup code.
5844 //
5845 // aarch64_get_thread_helper() clobbers only r0, r1, and flags.
5846 //
5847 void MacroAssembler::get_thread(Register dst) {
5848   RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
5849   push(saved_regs, sp);
5850 
5851   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
5852   blrt(lr, 1, 0, 1);
5853   if (dst != c_rarg0) {
5854     mov(dst, c_rarg0);
5855   }
5856 
5857   pop(saved_regs, sp);
5858 }