1 /*
   2  * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include <sys/types.h>
  27 
  28 #include "precompiled.hpp"
  29 #include "jvm.h"
  30 #include "asm/assembler.hpp"
  31 #include "asm/assembler.inline.hpp"
  32 #include "gc/shared/barrierSet.hpp"
  33 #include "gc/shared/cardTable.hpp"
  34 #include "gc/shared/barrierSetAssembler.hpp"
  35 #include "gc/shared/cardTableBarrierSet.hpp"
  36 #include "interpreter/interpreter.hpp"
  37 #include "compiler/disassembler.hpp"
  38 #include "memory/resourceArea.hpp"
  39 #include "nativeInst_aarch64.hpp"
  40 #include "oops/accessDecorators.hpp"
  41 #include "oops/compressedOops.inline.hpp"
  42 #include "oops/klass.inline.hpp"
  43 #include "runtime/biasedLocking.hpp"
  44 #include "runtime/icache.hpp"
  45 #include "runtime/interfaceSupport.inline.hpp"
  46 #include "runtime/jniHandles.inline.hpp"
  47 #include "runtime/sharedRuntime.hpp"
  48 #include "runtime/thread.hpp"
  49 #ifdef COMPILER1
  50 #include "c1/c1_LIRAssembler.hpp"
  51 #endif
  52 #ifdef COMPILER2
  53 #include "oops/oop.hpp"
  54 #include "opto/compile.hpp"
  55 #include "opto/intrinsicnode.hpp"
  56 #include "opto/node.hpp"
  57 #endif
  58 
  59 #ifdef PRODUCT
  60 #define BLOCK_COMMENT(str) /* nothing */
  61 #define STOP(error) stop(error)
  62 #else
  63 #define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) do { block_comment(error); stop(error); } while (0)
  65 #endif
  66 
  67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  68 
  69 // Patch any kind of instruction; there may be several instructions.
  70 // Return the total length (in bytes) of the instructions.
  71 int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  72   int instructions = 1;
  73   assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
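  // Note: branch-type offsets below are signed and counted in 4-byte
  // instruction words, hence the >> 2.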
  74   long offset = (target - branch) >> 2;
  75   unsigned insn = *(unsigned*)branch;
  76   if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
  77     // Load register (literal)
  78     Instruction_aarch64::spatch(branch, 23, 5, offset);
  79   } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
  80     // Unconditional branch (immediate)
  81     Instruction_aarch64::spatch(branch, 25, 0, offset);
  82   } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
  83     // Conditional branch (immediate)
  84     Instruction_aarch64::spatch(branch, 23, 5, offset);
  85   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
  86     // Compare & branch (immediate)
  87     Instruction_aarch64::spatch(branch, 23, 5, offset);
  88   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
  89     // Test & branch (immediate)
  90     Instruction_aarch64::spatch(branch, 18, 5, offset);
  91   } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
  92     // PC-rel. addressing
    offset = target - branch;
  94     int shift = Instruction_aarch64::extract(insn, 31, 31);
  95     if (shift) {
      uint64_t dest = (uint64_t)target;
  97       uint64_t pc_page = (uint64_t)branch >> 12;
  98       uint64_t adr_page = (uint64_t)target >> 12;
  99       unsigned offset_lo = dest & 0xfff;
 100       offset = adr_page - pc_page;
 101 
 102       // We handle 4 types of PC relative addressing
 103       //   1 - adrp    Rx, target_page
 104       //       ldr/str Ry, [Rx, #offset_in_page]
 105       //   2 - adrp    Rx, target_page
 106       //       add     Ry, Rx, #offset_in_page
 107       //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 108       //       movk    Rx, #imm16<<32
 109       //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 110       // In the first 3 cases we must check that Rx is the same in the adrp and the
 111       // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
 112       // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
 113       // to be followed by a random unrelated ldr/str, add or movk instruction.
 114       //
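      // Worked example (illustrative addresses only): with branch at
      // 0x10000 and target at 0x13008 we get pc_page = 0x10 and
      // adr_page = 0x13, so the adrp is patched with offset = 3 pages
      // while the trailing ldr/str or add (cases 1 and 2) carries
      // offset_lo = 0x008.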
 115       unsigned insn2 = ((unsigned*)branch)[1];
 116       if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
 117                 Instruction_aarch64::extract(insn, 4, 0) ==
 118                         Instruction_aarch64::extract(insn2, 9, 5)) {
 119         // Load/store register (unsigned immediate)
 120         unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
 121         Instruction_aarch64::patch(branch + sizeof (unsigned),
 122                                     21, 10, offset_lo >> size);
 123         guarantee(((dest >> size) << size) == dest, "misaligned target");
 124         instructions = 2;
 125       } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
 126                 Instruction_aarch64::extract(insn, 4, 0) ==
 127                         Instruction_aarch64::extract(insn2, 4, 0)) {
 128         // add (immediate)
 129         Instruction_aarch64::patch(branch + sizeof (unsigned),
 130                                    21, 10, offset_lo);
 131         instructions = 2;
 132       } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
 133                    Instruction_aarch64::extract(insn, 4, 0) ==
 134                      Instruction_aarch64::extract(insn2, 4, 0)) {
 135         // movk #imm16<<32
 136         Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
 137         long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
 138         long pc_page = (long)branch >> 12;
 139         long adr_page = (long)dest >> 12;
 140         offset = adr_page - pc_page;
 141         instructions = 2;
 142       }
 143     }
 144     int offset_lo = offset & 3;
 145     offset >>= 2;
 146     Instruction_aarch64::spatch(branch, 23, 5, offset);
 147     Instruction_aarch64::patch(branch, 30, 29, offset_lo);
 148   } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    uint64_t dest = (uint64_t)target;
 150     // Move wide constant
 151     assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
 152     assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
 153     Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
 154     Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
 155     Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
 156     assert(target_addr_for_insn(branch) == target, "should be");
 157     instructions = 3;
 158   } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
 159              Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
 160     // nothing to do
 161     assert(target == 0, "did not expect to relocate target for polling page load");
 162   } else {
 163     ShouldNotReachHere();
 164   }
 165   return instructions * NativeInstruction::instruction_size;
 166 }
 167 
 168 int MacroAssembler::patch_oop(address insn_addr, address o) {
 169   int instructions;
 170   unsigned insn = *(unsigned*)insn_addr;
 171   assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
 172 
 173   // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
 174   // narrow OOPs by setting the upper 16 bits in the first
 175   // instruction.
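  //
  // A sketch of the two patched sequences (register and immediate
  // values are illustrative only):
  //   narrow: movz Rd, #hi16, lsl #16 ; movk Rd, #lo16
  //   wide:   movz Rd, #lo16 ; movk Rd, #mid16, lsl #16 ;
  //           movk Rd, #hi16, lsl #32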
 176   if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
 177     // Move narrow OOP
 178     narrowOop n = CompressedOops::encode((oop)o);
 179     Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
 180     Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
 181     instructions = 2;
 182   } else {
 183     // Move wide OOP
 184     assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
 185     uintptr_t dest = (uintptr_t)o;
 186     Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
 187     Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
 188     Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
 189     instructions = 3;
 190   }
 191   return instructions * NativeInstruction::instruction_size;
 192 }
 193 
 194 int MacroAssembler::patch_narrow_klass(address insn_addr, narrowKlass n) {
  // Metadata pointers are either narrow (32 bits) or wide (48 bits).
 196   // We encode narrow ones by setting the upper 16 bits in the first
 197   // instruction.
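  // The expected sequence here is the narrow form sketched in patch_oop
  // above: movz Rd, #(n >> 16), lsl #16 followed by movk Rd, #(n & 0xffff).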
 198   NativeInstruction *insn = nativeInstruction_at(insn_addr);
 199   assert(Instruction_aarch64::extract(insn->encoding(), 31, 21) == 0b11010010101 &&
 200          nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
 201 
 202   Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
 203   Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
 204   return 2 * NativeInstruction::instruction_size;
 205 }
 206 
 207 address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
 208   long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b011000) {
 210     // Load register (literal)
 211     offset = Instruction_aarch64::sextract(insn, 23, 5);
 212     return address(((uint64_t)insn_addr + (offset << 2)));
 213   } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
 214     // Unconditional branch (immediate)
 215     offset = Instruction_aarch64::sextract(insn, 25, 0);
 216   } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
 217     // Conditional branch (immediate)
 218     offset = Instruction_aarch64::sextract(insn, 23, 5);
 219   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
 220     // Compare & branch (immediate)
 221     offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
 223     // Test & branch (immediate)
 224     offset = Instruction_aarch64::sextract(insn, 18, 5);
 225   } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
 226     // PC-rel. addressing
 227     offset = Instruction_aarch64::extract(insn, 30, 29);
 228     offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
 229     int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
 230     if (shift) {
 231       offset <<= shift;
 232       uint64_t target_page = ((uint64_t)insn_addr) + offset;
 233       target_page &= ((uint64_t)-1) << shift;
 234       // Return the target address for the following sequences
 235       //   1 - adrp    Rx, target_page
 236       //       ldr/str Ry, [Rx, #offset_in_page]
 237       //   2 - adrp    Rx, target_page
 238       //       add     Ry, Rx, #offset_in_page
 239       //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
 241       //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 242       //
      // In the first two cases we check that the register is the same and
 244       // return the target_page + the offset within the page.
 245       // Otherwise we assume it is a page aligned relocation and return
 246       // the target page only.
 247       //
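      // (In case 3 the movk supplies bits 32-47 of the target, so the
      // full 48-bit address is rebuilt below from the adrp page plus
      // the movk immediate.)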
 248       unsigned insn2 = ((unsigned*)insn_addr)[1];
 249       if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
 250                 Instruction_aarch64::extract(insn, 4, 0) ==
 251                         Instruction_aarch64::extract(insn2, 9, 5)) {
 252         // Load/store register (unsigned immediate)
 253         unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
 254         unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
 255         return address(target_page + (byte_offset << size));
 256       } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
 257                 Instruction_aarch64::extract(insn, 4, 0) ==
 258                         Instruction_aarch64::extract(insn2, 4, 0)) {
 259         // add (immediate)
 260         unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
 261         return address(target_page + byte_offset);
 262       } else {
 263         if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110  &&
 264                Instruction_aarch64::extract(insn, 4, 0) ==
 265                  Instruction_aarch64::extract(insn2, 4, 0)) {
 266           target_page = (target_page & 0xffffffff) |
 267                          ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
 268         }
 269         return (address)target_page;
 270       }
 271     } else {
 272       ShouldNotReachHere();
 273     }
 274   } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    uint32_t *insns = (uint32_t *)insn_addr;
 276     // Move wide constant: movz, movk, movk.  See movptr().
 277     assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
 278     assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(uint64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (uint64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (uint64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
 282   } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
 283              Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
 284     return 0;
 285   } else {
 286     ShouldNotReachHere();
 287   }
 288   return address(((uint64_t)insn_addr + (offset << 2)));
 289 }
 290 
 291 void MacroAssembler::safepoint_poll(Label& slow_path) {
 292   if (SafepointMechanism::uses_thread_local_poll()) {
 293     ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
 294     tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
 295   } else {
 296     unsigned long offset;
 297     adrp(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()), offset);
 298     ldrw(rscratch1, Address(rscratch1, offset));
 299     assert(SafepointSynchronize::_not_synchronized == 0, "rewrite this code");
 300     cbnz(rscratch1, slow_path);
 301   }
 302 }
 303 
 304 // Just like safepoint_poll, but use an acquiring load for thread-
 305 // local polling.
 306 //
 307 // We need an acquire here to ensure that any subsequent load of the
 308 // global SafepointSynchronize::_state flag is ordered after this load
 309 // of the local Thread::_polling page.  We don't want this poll to
 310 // return false (i.e. not safepointing) and a later poll of the global
 311 // SafepointSynchronize::_state spuriously to return true.
 312 //
 313 // This is to avoid a race when we're in a native->Java transition
 314 // racing the code which wakes up from a safepoint.
 315 //
 316 void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
 317   if (SafepointMechanism::uses_thread_local_poll()) {
 318     lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
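    // ldar is a load-acquire: it keeps this poll ordered before any later
    // load of SafepointSynchronize::_state, per the comment above.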
 319     ldar(rscratch1, rscratch1);
 320     tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
 321   } else {
 322     safepoint_poll(slow_path);
 323   }
 324 }
 325 
 326 void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
 327   // we must set sp to zero to clear frame
 328   str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));
 329 
 330   // must clear fp, so that compiled frames are not confused; it is
 331   // possible that we need it only for debugging
 332   if (clear_fp) {
 333     str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
 334   }
 335 
 336   // Always clear the pc because it could have been set by make_walkable()
 337   str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
 338 }
 339 
 340 // Calls to C land
 341 //
// When entering C land, the rfp & sp of the last Java frame have to be recorded
 343 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
 344 // has to be reset to 0. This is required to allow proper stack traversal.
 345 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 346                                          Register last_java_fp,
 347                                          Register last_java_pc,
 348                                          Register scratch) {
 349 
  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }
 355 
 356   // determine last_java_sp register
 357   if (last_java_sp == sp) {
 358     mov(scratch, sp);
 359     last_java_sp = scratch;
 360   } else if (!last_java_sp->is_valid()) {
 361     last_java_sp = esp;
 362   }
 363 
 364   str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));
 365 
 366   // last_java_fp is optional
 367   if (last_java_fp->is_valid()) {
 368     str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
 369   }
 370 }
 371 
 372 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 373                                          Register last_java_fp,
 374                                          address  last_java_pc,
 375                                          Register scratch) {
 376   if (last_java_pc != NULL) {
 377     adr(scratch, last_java_pc);
 378   } else {
 379     // FIXME: This is almost never correct.  We should delete all
 380     // cases of set_last_Java_frame with last_java_pc=NULL and use the
 381     // correct return address instead.
 382     adr(scratch, pc());
 383   }
 384 
 385   str(scratch, Address(rthread,
 386                        JavaThread::frame_anchor_offset()
 387                        + JavaFrameAnchor::last_Java_pc_offset()));
 388 
 389   set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
 390 }
 391 
 392 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 393                                          Register last_java_fp,
 394                                          Label &L,
 395                                          Register scratch) {
 396   if (L.is_bound()) {
 397     set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
 398   } else {
 399     InstructionMark im(this);
 400     L.add_patch_at(code(), locator());
 401     set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
 402   }
 403 }
 404 
 405 void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
 406   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
 407   assert(CodeCache::find_blob(entry.target()) != NULL,
 408          "destination of far call not found in code cache");
 409   if (far_branches()) {
 410     unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb (asserted above), which is
    // within ADRP's +/-4GB addressing range.
 413     adrp(tmp, entry, offset);
 414     add(tmp, tmp, offset);
 415     if (cbuf) cbuf->set_insts_mark();
 416     blr(tmp);
 417   } else {
 418     if (cbuf) cbuf->set_insts_mark();
 419     bl(entry);
 420   }
 421 }
 422 
 423 void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
 424   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
 425   assert(CodeCache::find_blob(entry.target()) != NULL,
 426          "destination of far call not found in code cache");
 427   if (far_branches()) {
 428     unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb (asserted above), which is
    // within ADRP's +/-4GB addressing range.
 431     adrp(tmp, entry, offset);
 432     add(tmp, tmp, offset);
 433     if (cbuf) cbuf->set_insts_mark();
 434     br(tmp);
 435   } else {
 436     if (cbuf) cbuf->set_insts_mark();
 437     b(entry);
 438   }
 439 }
 440 
 441 void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  br(Assembler::LO, no_reserved_zone_enabling);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  blr(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  br(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
 463 }
 464 
 465 int MacroAssembler::biased_locking_enter(Register lock_reg,
 466                                          Register obj_reg,
 467                                          Register swap_reg,
 468                                          Register tmp_reg,
 469                                          bool swap_reg_contains_mark,
 470                                          Label& done,
 471                                          Label* slow_case,
 472                                          BiasedLockingCounters* counters) {
 473   assert(UseBiasedLocking, "why call this otherwise?");
 474   assert_different_registers(lock_reg, obj_reg, swap_reg);
 475 
 476   if (PrintBiasedLockingStatistics && counters == NULL)
 477     counters = BiasedLocking::counters();
 478 
 479   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
 480   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
 481   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
 482   Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
 483   Address saved_mark_addr(lock_reg, 0);
 484 
 485   // Biased locking
 486   // See whether the lock is currently biased toward our thread and
 487   // whether the epoch is still valid
 488   // Note that the runtime guarantees sufficient alignment of JavaThread
 489   // pointers to allow age to be placed into low bits
 490   // First check to see whether biasing is even enabled for this object
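  // Relevant mark word layout, high bits to low (a sketch):
  //   [JavaThread* | epoch(2) | age(4) | biased_lock(1) | lock(2)]
  // where biased_lock_pattern is 0b101 in the low three bits.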
 491   Label cas_label;
 492   int null_check_offset = -1;
 493   if (!swap_reg_contains_mark) {
 494     null_check_offset = offset();
 495     ldr(swap_reg, mark_addr);
 496   }
 497   andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
 498   cmp(tmp_reg, (u1)markOopDesc::biased_lock_pattern);
 499   br(Assembler::NE, cas_label);
 500   // The bias pattern is present in the object's header. Need to check
 501   // whether the bias owner and the epoch are both still current.
 502   load_prototype_header(tmp_reg, obj_reg);
 503   orr(tmp_reg, tmp_reg, rthread);
 504   eor(tmp_reg, swap_reg, tmp_reg);
 505   andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
 506   if (counters != NULL) {
 507     Label around;
 508     cbnz(tmp_reg, around);
 509     atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
 510     b(done);
 511     bind(around);
 512   } else {
 513     cbz(tmp_reg, done);
 514   }
 515 
 516   Label try_revoke_bias;
 517   Label try_rebias;
 518 
 519   // At this point we know that the header has the bias pattern and
 520   // that we are not the bias owner in the current epoch. We need to
 521   // figure out more details about the state of the header in order to
 522   // know what operations can be legally performed on the object's
 523   // header.
 524 
 525   // If the low three bits in the xor result aren't clear, that means
 526   // the prototype header is no longer biased and we have to revoke
 527   // the bias on this object.
 528   andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
 529   cbnz(rscratch1, try_revoke_bias);
 530 
 531   // Biasing is still enabled for this data type. See whether the
 532   // epoch of the current bias is still valid, meaning that the epoch
 533   // bits of the mark word are equal to the epoch bits of the
 534   // prototype header. (Note that the prototype header's epoch bits
 535   // only change at a safepoint.) If not, attempt to rebias the object
 536   // toward the current thread. Note that we must be absolutely sure
 537   // that the current epoch is invalid in order to do this because
 538   // otherwise the manipulations it performs on the mark word are
 539   // illegal.
 540   andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
 541   cbnz(rscratch1, try_rebias);
 542 
 543   // The epoch of the current bias is still valid but we know nothing
 544   // about the owner; it might be set or it might be clear. Try to
 545   // acquire the bias of the object using an atomic operation. If this
 546   // fails we will go in to the runtime to revoke the object's bias.
 547   // Note that we first construct the presumed unbiased header so we
 548   // don't accidentally blow away another thread's valid bias.
 549   {
 550     Label here;
 551     mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
 552     andr(swap_reg, swap_reg, rscratch1);
 553     orr(tmp_reg, swap_reg, rthread);
 554     cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
 555     // If the biasing toward our thread failed, this means that
 556     // another thread succeeded in biasing it toward itself and we
 557     // need to revoke that bias. The revocation will occur in the
 558     // interpreter runtime in the slow case.
 559     bind(here);
 560     if (counters != NULL) {
 561       atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
 562                   tmp_reg, rscratch1, rscratch2);
 563     }
 564   }
 565   b(done);
 566 
 567   bind(try_rebias);
 568   // At this point we know the epoch has expired, meaning that the
 569   // current "bias owner", if any, is actually invalid. Under these
 570   // circumstances _only_, we are allowed to use the current header's
 571   // value as the comparison value when doing the cas to acquire the
 572   // bias in the current epoch. In other words, we allow transfer of
 573   // the bias from one thread to another directly in this situation.
 574   //
 575   // FIXME: due to a lack of registers we currently blow away the age
 576   // bits in this situation. Should attempt to preserve them.
 577   {
 578     Label here;
 579     load_prototype_header(tmp_reg, obj_reg);
 580     orr(tmp_reg, rthread, tmp_reg);
 581     cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
 582     // If the biasing toward our thread failed, then another thread
 583     // succeeded in biasing it toward itself and we need to revoke that
 584     // bias. The revocation will occur in the runtime in the slow case.
 585     bind(here);
 586     if (counters != NULL) {
 587       atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
 588                   tmp_reg, rscratch1, rscratch2);
 589     }
 590   }
 591   b(done);
 592 
 593   bind(try_revoke_bias);
 594   // The prototype mark in the klass doesn't have the bias bit set any
 595   // more, indicating that objects of this data type are not supposed
 596   // to be biased any more. We are going to try to reset the mark of
 597   // this object to the prototype value and fall through to the
 598   // CAS-based locking scheme. Note that if our CAS fails, it means
 599   // that another thread raced us for the privilege of revoking the
 600   // bias of this particular object, so it's okay to continue in the
 601   // normal locking code.
 602   //
 603   // FIXME: due to a lack of registers we currently blow away the age
 604   // bits in this situation. Should attempt to preserve them.
 605   {
 606     Label here, nope;
 607     load_prototype_header(tmp_reg, obj_reg);
 608     cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
 609     bind(here);
 610 
 611     // Fall through to the normal CAS-based lock, because no matter what
 612     // the result of the above CAS, some thread must have succeeded in
 613     // removing the bias bit from the object's header.
 614     if (counters != NULL) {
 615       atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
 616                   rscratch1, rscratch2);
 617     }
 618     bind(nope);
 619   }
 620 
 621   bind(cas_label);
 622 
 623   return null_check_offset;
 624 }
 625 
 626 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
 627   assert(UseBiasedLocking, "why call this otherwise?");
 628 
 629   // Check for biased locking unlock case, which is a no-op
 630   // Note: we do not have to check the thread ID for two reasons.
 631   // First, the interpreter checks for IllegalMonitorStateException at
 632   // a higher level. Second, if the bias was revoked while we held the
 633   // lock, the object could not be rebiased toward another thread, so
 634   // the bias bit would be clear.
 635   ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
 636   andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
 637   cmp(temp_reg, (u1)markOopDesc::biased_lock_pattern);
 638   br(Assembler::EQ, done);
 639 }
 640 
 641 static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
 643     masm->mov(c_rarg0, arg);
 644   }
 645 }
 646 
 647 static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
 649     masm->mov(c_rarg1, arg);
 650   }
 651 }
 652 
 653 static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
 655     masm->mov(c_rarg2, arg);
 656   }
 657 }
 658 
 659 static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
 661     masm->mov(c_rarg3, arg);
 662   }
 663 }
 664 
 665 void MacroAssembler::call_VM_base(Register oop_result,
 666                                   Register java_thread,
 667                                   Register last_java_sp,
 668                                   address  entry_point,
 669                                   int      number_of_arguments,
 670                                   bool     check_exceptions) {
  // determine java_thread register
 672   if (!java_thread->is_valid()) {
 673     java_thread = rthread;
 674   }
 675 
 676   // determine last_java_sp register
 677   if (!last_java_sp->is_valid()) {
 678     last_java_sp = esp;
 679   }
 680 
 681   // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
 683   assert(java_thread == rthread, "unexpected register");
 684 #ifdef ASSERT
 685   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
 686   // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
 687 #endif // ASSERT
 688 
  assert(java_thread != oop_result, "cannot use the same register for java_thread & oop_result");
 690   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
 691 
 692   // push java thread (becomes first argument of C function)
 693 
 694   mov(c_rarg0, java_thread);
 695 
 696   // set last Java frame before call
 697   assert(last_java_sp != rfp, "can't use rfp");
 698 
 699   Label l;
 700   set_last_Java_frame(last_java_sp, rfp, l, rscratch1);
 701 
 702   // do the call, remove parameters
 703   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
 704 
 705   // reset last Java frame
 706   // Only interpreter should have to clear fp
 707   reset_last_Java_frame(true);
 708 
  // C++ interp handles this in the interpreter
 710   check_and_handle_popframe(java_thread);
 711   check_and_handle_earlyret(java_thread);
 712 
 713   if (check_exceptions) {
 714     // check for pending exceptions (java_thread is set upon return)
 715     ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
 716     Label ok;
 717     cbz(rscratch1, ok);
 718     lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
 719     br(rscratch1);
 720     bind(ok);
 721   }
 722 
 723   // get oop result if there is one and reset the value in the thread
 724   if (oop_result->is_valid()) {
 725     get_vm_result(oop_result, java_thread);
 726   }
 727 }
 728 
 729 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
 730   call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
 731 }
 732 
 733 // Maybe emit a call via a trampoline.  If the code cache is small
 734 // trampolines won't be emitted.
 735 
 736 address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
 737   assert(JavaThread::current()->is_Compiler_thread(), "just checking");
 738   assert(entry.rspec().type() == relocInfo::runtime_call_type
 739          || entry.rspec().type() == relocInfo::opt_virtual_call_type
 740          || entry.rspec().type() == relocInfo::static_call_type
 741          || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
 742 
 743   // We need a trampoline if branches are far.
 744   if (far_branches()) {
 745     bool in_scratch_emit_size = false;
 746 #ifdef COMPILER2
 747     // We don't want to emit a trampoline if C2 is generating dummy
 748     // code during its branch shortening phase.
 749     CompileTask* task = ciEnv::current()->task();
 750     in_scratch_emit_size =
 751       (task != NULL && is_c2_compile(task->comp_level()) &&
 752        Compile::current()->in_scratch_emit_size());
 753 #endif
 754     if (!in_scratch_emit_size) {
 755       address stub = emit_trampoline_stub(offset(), entry.target());
 756       if (stub == NULL) {
 757         return NULL; // CodeCache is full
 758       }
 759     }
 760   }
 761 
 762   if (cbuf) cbuf->set_insts_mark();
 763   relocate(entry.rspec());
 764   if (!far_branches()) {
 765     bl(entry.target());
 766   } else {
 767     bl(pc());
 768   }
 769   // just need to return a non-null address
 770   return pc();
 771 }
 772 
 773 
 774 // Emit a trampoline stub for a call to a target which is too far away.
 775 //
 776 // code sequences:
 777 //
 778 // call-site:
 779 //   branch-and-link to <destination> or <trampoline stub>
 780 //
 781 // Related trampoline stub for this call site in the stub section:
 782 //   load the call target from the constant pool
 783 //   branch (LR still points to the call site above)
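//
// A sketch of the emitted stub (see emit_trampoline_stub below):
//   ldr  rscratch1, 1f   // load 64-bit target from the pc-relative literal
//   br   rscratch1
// 1:
//   .quad <destination address>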
 784 
 785 address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
 786                                              address dest) {
 787   // Max stub size: alignment nop, TrampolineStub.
 788   address stub = start_a_stub(NativeInstruction::instruction_size
 789                    + NativeCallTrampolineStub::instruction_size);
 790   if (stub == NULL) {
 791     return NULL;  // CodeBuffer::expand failed
 792   }
 793 
 794   // Create a trampoline stub relocation which relates this trampoline stub
 795   // with the call instruction at insts_call_instruction_offset in the
 796   // instructions code-section.
 797   align(wordSize);
 798   relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
 799                                             + insts_call_instruction_offset));
 800   const int stub_start_offset = offset();
 801 
 802   // Now, create the trampoline stub's code:
  // - load the call target from the pc-relative literal below
  // - branch to it
 805   Label target;
 806   ldr(rscratch1, target);
 807   br(rscratch1);
 808   bind(target);
 809   assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
 810          "should be");
 811   emit_int64((int64_t)dest);
 812 
 813   const address stub_start_addr = addr_at(stub_start_offset);
 814 
 815   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
 816 
 817   end_a_stub();
 818   return stub_start_addr;
 819 }
 820 
 821 void MacroAssembler::c2bool(Register x) {
 822   // implements x == 0 ? 0 : 1
 823   // note: must only look at least-significant byte of x
 824   //       since C-style booleans are stored in one byte
 825   //       only! (was bug)
 826   tst(x, 0xff);
 827   cset(x, Assembler::NE);
 828 }
 829 
 830 address MacroAssembler::ic_call(address entry, jint method_index) {
 831   RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
 832   // address const_ptr = long_constant((jlong)Universe::non_oop_word());
 833   // unsigned long offset;
 834   // ldr_constant(rscratch2, const_ptr);
 835   movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
 836   return trampoline_call(Address(entry, rh));
 837 }
 838 
 839 // Implementation of call_VM versions
 840 
 841 void MacroAssembler::call_VM(Register oop_result,
 842                              address entry_point,
 843                              bool check_exceptions) {
 844   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
 845 }
 846 
 847 void MacroAssembler::call_VM(Register oop_result,
 848                              address entry_point,
 849                              Register arg_1,
 850                              bool check_exceptions) {
 851   pass_arg1(this, arg_1);
 852   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
 853 }
 854 
 855 void MacroAssembler::call_VM(Register oop_result,
 856                              address entry_point,
 857                              Register arg_1,
 858                              Register arg_2,
 859                              bool check_exceptions) {
 860   assert(arg_1 != c_rarg2, "smashed arg");
 861   pass_arg2(this, arg_2);
 862   pass_arg1(this, arg_1);
 863   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
 864 }
 865 
 866 void MacroAssembler::call_VM(Register oop_result,
 867                              address entry_point,
 868                              Register arg_1,
 869                              Register arg_2,
 870                              Register arg_3,
 871                              bool check_exceptions) {
 872   assert(arg_1 != c_rarg3, "smashed arg");
 873   assert(arg_2 != c_rarg3, "smashed arg");
 874   pass_arg3(this, arg_3);
 875 
 876   assert(arg_1 != c_rarg2, "smashed arg");
 877   pass_arg2(this, arg_2);
 878 
 879   pass_arg1(this, arg_1);
 880   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
 881 }
 882 
 883 void MacroAssembler::call_VM(Register oop_result,
 884                              Register last_java_sp,
 885                              address entry_point,
 886                              int number_of_arguments,
 887                              bool check_exceptions) {
 888   call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
 889 }
 890 
 891 void MacroAssembler::call_VM(Register oop_result,
 892                              Register last_java_sp,
 893                              address entry_point,
 894                              Register arg_1,
 895                              bool check_exceptions) {
 896   pass_arg1(this, arg_1);
 897   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
 898 }
 899 
 900 void MacroAssembler::call_VM(Register oop_result,
 901                              Register last_java_sp,
 902                              address entry_point,
 903                              Register arg_1,
 904                              Register arg_2,
 905                              bool check_exceptions) {
 906 
 907   assert(arg_1 != c_rarg2, "smashed arg");
 908   pass_arg2(this, arg_2);
 909   pass_arg1(this, arg_1);
 910   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
 911 }
 912 
 913 void MacroAssembler::call_VM(Register oop_result,
 914                              Register last_java_sp,
 915                              address entry_point,
 916                              Register arg_1,
 917                              Register arg_2,
 918                              Register arg_3,
 919                              bool check_exceptions) {
 920   assert(arg_1 != c_rarg3, "smashed arg");
 921   assert(arg_2 != c_rarg3, "smashed arg");
 922   pass_arg3(this, arg_3);
 923   assert(arg_1 != c_rarg2, "smashed arg");
 924   pass_arg2(this, arg_2);
 925   pass_arg1(this, arg_1);
 926   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
 927 }
 928 
 929 
 930 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
 931   ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
 932   str(zr, Address(java_thread, JavaThread::vm_result_offset()));
 933   verify_oop(oop_result, "broken oop in call_VM_base");
 934 }
 935 
 936 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
 937   ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
 938   str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
 939 }
 940 
 941 void MacroAssembler::align(int modulus) {
 942   while (offset() % modulus != 0) nop();
 943 }
 944 
 945 // these are no-ops overridden by InterpreterMacroAssembler
 946 
 947 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }
 948 
 949 void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
 950 
 951 
 952 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 953                                                       Register tmp,
 954                                                       int offset) {
 955   intptr_t value = *delayed_value_addr;
 956   if (value != 0)
 957     return RegisterOrConstant(value + offset);
 958 
 959   // load indirectly to solve generation ordering problem
 960   ldr(tmp, ExternalAddress((address) delayed_value_addr));
 961 
 962   if (offset != 0)
 963     add(tmp, tmp, offset);
 964 
 965   return RegisterOrConstant(tmp);
 966 }
 967 
 968 
void MacroAssembler::notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler::notify(type);
    // reset_last_Java_frame(true);
  } else {
    Assembler::notify(type);
  }
}
 978 
 979 // Look up the method for a megamorphic invokeinterface call.
 980 // The target method is determined by <intf_klass, itable_index>.
 981 // The receiver klass is in recv_klass.
 982 // On success, the result will be in method_result, and execution falls through.
 983 // On failure, execution transfers to the given label.
 984 void MacroAssembler::lookup_interface_method(Register recv_klass,
 985                                              Register intf_klass,
 986                                              RegisterOrConstant itable_index,
 987                                              Register method_result,
 988                                              Register scan_temp,
 989                                              Label& L_no_such_interface,
                                             bool return_method) {
 991   assert_different_registers(recv_klass, intf_klass, scan_temp);
 992   assert_different_registers(method_result, intf_klass, scan_temp);
 993   assert(recv_klass != method_result || !return_method,
 994      "recv_klass can be destroyed when method isn't needed");
 995   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
 996          "caller must use same register for non-constant itable index as for method");
 997 
 998   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
 999   int vtable_base = in_bytes(Klass::vtable_start_offset());
1000   int itentry_off = itableMethodEntry::method_offset_in_bytes();
1001   int scan_step   = itableOffsetEntry::size() * wordSize;
1002   int vte_size    = vtableEntry::size_in_bytes();
1003   assert(vte_size == wordSize, "else adjust times_vte_scale");
1004 
1005   ldrw(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));
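  // scan_temp now holds vtable_length; the first itableOffsetEntry sits
  // vtable_length word-sized slots past vtable_start_offset.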
1006 
1007   // %%% Could store the aligned, prescaled offset in the klassoop.
1008   // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
1009   lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
1010   add(scan_temp, scan_temp, vtable_base);
1011 
1012   if (return_method) {
1013     // Adjust recv_klass by scaled itable_index, so we can free itable_index.
1014     assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
1015     // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
1016     lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
1017     if (itentry_off)
1018       add(recv_klass, recv_klass, itentry_off);
1019   }
1020 
1021   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
1022   //   if (scan->interface() == intf) {
1023   //     result = (klass + scan->offset() + itable_index);
1024   //   }
1025   // }
1026   Label search, found_method;
1027 
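  // The loop below is peeled once: on the first pass (peel == 1) a hit
  // branches straight to found_method, while on later passes the test is
  // inverted so that a hit falls through to found_method instead.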
1028   for (int peel = 1; peel >= 0; peel--) {
1029     ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
1030     cmp(intf_klass, method_result);
1031 
1032     if (peel) {
1033       br(Assembler::EQ, found_method);
1034     } else {
1035       br(Assembler::NE, search);
1036       // (invert the test to fall through to found_method...)
1037     }
1038 
1039     if (!peel)  break;
1040 
1041     bind(search);
1042 
1043     // Check that the previous entry is non-null.  A null entry means that
1044     // the receiver class doesn't implement the interface, and wasn't the
1045     // same as when the caller was compiled.
1046     cbz(method_result, L_no_such_interface);
1047     add(scan_temp, scan_temp, scan_step);
1048   }
1049 
1050   bind(found_method);
1051 
1052   // Got a hit.
1053   if (return_method) {
1054     ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
1055     ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
1056   }
1057 }
1058 
1059 // virtual method calling
1060 void MacroAssembler::lookup_virtual_method(Register recv_klass,
1061                                            RegisterOrConstant vtable_index,
1062                                            Register method_result) {
1063   const int base = in_bytes(Klass::vtable_start_offset());
1064   assert(vtableEntry::size() * wordSize == 8,
1065          "adjust the scaling in the code below");
1066   int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();
1067 
1068   if (vtable_index.is_register()) {
1069     lea(method_result, Address(recv_klass,
1070                                vtable_index.as_register(),
1071                                Address::lsl(LogBytesPerWord)));
1072     ldr(method_result, Address(method_result, vtable_offset_in_bytes));
1073   } else {
1074     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
1075     ldr(method_result,
1076         form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
1077   }
1078 }
1079 
1080 void MacroAssembler::check_klass_subtype(Register sub_klass,
1081                            Register super_klass,
1082                            Register temp_reg,
1083                            Label& L_success) {
1084   Label L_failure;
1085   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
1086   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
1087   bind(L_failure);
1088 }
1089 
1090 
1091 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1092                                                    Register super_klass,
1093                                                    Register temp_reg,
1094                                                    Label* L_success,
1095                                                    Label* L_failure,
1096                                                    Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
1098   assert_different_registers(sub_klass, super_klass, temp_reg);
1099   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1100   if (super_check_offset.is_register()) {
1101     assert_different_registers(sub_klass, super_klass,
1102                                super_check_offset.as_register());
1103   } else if (must_load_sco) {
1104     assert(temp_reg != noreg, "supply either a temp or a register offset");
1105   }
1106 
1107   Label L_fallthrough;
1108   int label_nulls = 0;
1109   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1110   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1111   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1112   assert(label_nulls <= 1, "at most one NULL in the batch");
1113 
1114   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1115   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1116   Address super_check_offset_addr(super_klass, sco_offset);
1117 
1118   // Hacked jmp, which may only be used just before L_fallthrough.
1119 #define final_jmp(label)                                                \
1120   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
1121   else                            b(label)                /*omit semi*/
1122 
1123   // If the pointers are equal, we are done (e.g., String[] elements).
1124   // This self-check enables sharing of secondary supertype arrays among
1125   // non-primary types such as array-of-interface.  Otherwise, each such
1126   // type would need its own customized SSA.
1127   // We move this check to the front of the fast path because many
1128   // type checks are in fact trivially successful in this manner,
1129   // so we get a nicely predicted branch right at the start of the check.
1130   cmp(sub_klass, super_klass);
1131   br(Assembler::EQ, *L_success);
1132 
1133   // Check the supertype display:
1134   if (must_load_sco) {
1135     ldrw(temp_reg, super_check_offset_addr);
1136     super_check_offset = RegisterOrConstant(temp_reg);
1137   }
1138   Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr); // load displayed supertype
  cmp(super_klass, rscratch1);
1141 
1142   // This check has worked decisively for primary supers.
1143   // Secondary supers are sought in the super_cache ('super_cache_addr').
1144   // (Secondary supers are interfaces and very deeply nested subtypes.)
1145   // This works in the same check above because of a tricky aliasing
1146   // between the super_cache and the primary super display elements.
1147   // (The 'super_check_addr' can address either, as the case requires.)
1148   // Note that the cache is updated below if it does not help us find
1149   // what we need immediately.
1150   // So if it was a primary super, we can just fail immediately.
1151   // Otherwise, it's the slow path for us (no success at this point).
1152 
1153   if (super_check_offset.is_register()) {
1154     br(Assembler::EQ, *L_success);
1155     subs(zr, super_check_offset.as_register(), sc_offset);
1156     if (L_failure == &L_fallthrough) {
1157       br(Assembler::EQ, *L_slow_path);
1158     } else {
1159       br(Assembler::NE, *L_failure);
1160       final_jmp(*L_slow_path);
1161     }
1162   } else if (super_check_offset.as_constant() == sc_offset) {
1163     // Need a slow path; fast failure is impossible.
1164     if (L_slow_path == &L_fallthrough) {
1165       br(Assembler::EQ, *L_success);
1166     } else {
1167       br(Assembler::NE, *L_slow_path);
1168       final_jmp(*L_success);
1169     }
1170   } else {
1171     // No slow path; it's a fast decision.
1172     if (L_failure == &L_fallthrough) {
1173       br(Assembler::EQ, *L_success);
1174     } else {
1175       br(Assembler::NE, *L_failure);
1176       final_jmp(*L_success);
1177     }
1178   }
1179 
1180   bind(L_fallthrough);
1181 
1182 #undef final_jmp
1183 }
1184 
1185 // These two are taken from x86, but they look generally useful
1186 
// scans count pointer-sized words at [addr] for an occurrence of value,
1188 // generic
1189 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
1190                                 Register scratch) {
1191   Label Lloop, Lexit;
1192   cbz(count, Lexit);
1193   bind(Lloop);
1194   ldr(scratch, post(addr, wordSize));
1195   cmp(value, scratch);
1196   br(EQ, Lexit);
1197   sub(count, count, 1);
1198   cbnz(count, Lloop);
1199   bind(Lexit);
1200 }
1201 
// scans count 4-byte words at [addr] for an occurrence of value,
1203 // generic
1204 void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
1205                                 Register scratch) {
1206   Label Lloop, Lexit;
1207   cbz(count, Lexit);
1208   bind(Lloop);
  ldrw(scratch, post(addr, BytesPerInt)); // stride matches the 4-byte element size
1210   cmpw(value, scratch);
1211   br(EQ, Lexit);
1212   sub(count, count, 1);
1213   cbnz(count, Lloop);
1214   bind(Lexit);
1215 }
1216 
1217 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1218                                                    Register super_klass,
1219                                                    Register temp_reg,
1220                                                    Register temp2_reg,
1221                                                    Label* L_success,
1222                                                    Label* L_failure,
1223                                                    bool set_cond_codes) {
1224   assert_different_registers(sub_klass, super_klass, temp_reg);
1225   if (temp2_reg != noreg)
1226     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1227 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1228 
1229   Label L_fallthrough;
1230   int label_nulls = 0;
1231   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1232   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1233   assert(label_nulls <= 1, "at most one NULL in the batch");
1234 
1235   // a couple of useful fields in sub_klass:
1236   int ss_offset = in_bytes(Klass::secondary_supers_offset());
1237   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1238   Address secondary_supers_addr(sub_klass, ss_offset);
1239   Address super_cache_addr(     sub_klass, sc_offset);
1240 
1241   BLOCK_COMMENT("check_klass_subtype_slow_path");
1242 
1243   // Do a linear scan of the secondary super-klass chain.
1244   // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan loop below uses fixed registers (r0, r2, r5), which we must spill.
1246   // Don't worry too much about pre-existing connections with the input regs.
1247 
1248   assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by ldrw(r2, array length)
1250 
1251   RegSet pushed_registers;
1252   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1253   if (!IS_A_TEMP(r5))    pushed_registers += r5;
1254 
1255   if (super_klass != r0 || UseCompressedOops) {
1256     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1257   }
1258 
1259   push(pushed_registers, sp);
1260 
1261   // Get super_klass value into r0 (even if it was in r5 or r2).
1262   if (super_klass != r0) {
1263     mov(r0, super_klass);
1264   }
1265 
1266 #ifndef PRODUCT
1267   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1268   Address pst_counter_addr(rscratch2);
1269   ldr(rscratch1, pst_counter_addr);
1270   add(rscratch1, rscratch1, 1);
1271   str(rscratch1, pst_counter_addr);
1272 #endif //PRODUCT
1273 
1274   // We will consult the secondary-super array.
1275   ldr(r5, secondary_supers_addr);
1276   // Load the array length.
1277   ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1278   // Skip to start of data.
1279   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1280 
1281   cmp(sp, zr); // Clear Z flag; SP is never zero
1282   // Scan R2 words at [R5] for an occurrence of R0.
1283   // Set NZ/Z based on last compare.
1284   repne_scan(r5, r0, r2, rscratch1);
1285 
1286   // Unspill the temp. registers:
1287   pop(pushed_registers, sp);
1288 
1289   br(Assembler::NE, *L_failure);
1290 
1291   // Success.  Cache the super we found and proceed in triumph.
1292   str(super_klass, super_cache_addr);
1293 
1294   if (L_success != &L_fallthrough) {
1295     b(*L_success);
1296   }
1297 
1298 #undef IS_A_TEMP
1299 
1300   bind(L_fallthrough);
1301 }
1302 
1303 
1304 void MacroAssembler::verify_oop(Register reg, const char* s) {
1305   if (!VerifyOops) return;
1306 
1307   // Build a message identifying the register, for verify_oop_subroutine
1308   const char* b = NULL;
1309   {
1310     ResourceMark rm;
1311     stringStream ss;
1312     ss.print("verify_oop: %s: %s", reg->name(), s);
1313     b = code_string(ss.as_string());
1314   }
1315   BLOCK_COMMENT("verify_oop {");
1316 
1317   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1318   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1319 
1320   mov(r0, reg);
1321   mov(rscratch1, (address)b);
1322 
1323   // call indirectly to solve generation ordering problem
1324   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1325   ldr(rscratch2, Address(rscratch2));
1326   blr(rscratch2);
1327 
1328   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1329   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1330 
1331   BLOCK_COMMENT("} verify_oop");
1332 }
1333 
1334 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1335   if (!VerifyOops) return;
1336 
1337   const char* b = NULL;
1338   {
1339     ResourceMark rm;
1340     stringStream ss;
1341     ss.print("verify_oop_addr: %s", s);
1342     b = code_string(ss.as_string());
1343   }
1344   BLOCK_COMMENT("verify_oop_addr {");
1345 
1346   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1347   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1348 
1349   // addr may contain sp so we will have to adjust it based on the
1350   // pushes that we just did.
1351   if (addr.uses(sp)) {
1352     lea(r0, addr);
1353     ldr(r0, Address(r0, 4 * wordSize));
1354   } else {
1355     ldr(r0, addr);
1356   }
1357   mov(rscratch1, (address)b);
1358 
1359   // call indirectly to solve generation ordering problem
1360   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1361   ldr(rscratch2, Address(rscratch2));
1362   blr(rscratch2);
1363 
1364   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1365   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1366 
1367   BLOCK_COMMENT("} verify_oop_addr");
1368 }
1369 
1370 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1371                                          int extra_slot_offset) {
1372   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1373   int stackElementSize = Interpreter::stackElementSize;
1374   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1375 #ifdef ASSERT
1376   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1377   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1378 #endif
1379   if (arg_slot.is_constant()) {
1380     return Address(esp, arg_slot.as_constant() * stackElementSize
1381                    + offset);
1382   } else {
1383     add(rscratch1, esp, arg_slot.as_register(),
1384         ext::uxtx, exact_log2(stackElementSize));
1385     return Address(rscratch1, offset);
1386   }
1387 }
1388 
1389 void MacroAssembler::call_VM_leaf_base(address entry_point,
1390                                        int number_of_arguments,
1391                                        Label *retaddr) {
1392   call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
1393 }
1394 
1395 void MacroAssembler::call_VM_leaf_base1(address entry_point,
1396                                         int number_of_gp_arguments,
1397                                         int number_of_fp_arguments,
1398                                         ret_type type,
1399                                         Label *retaddr) {
1400   Label E, L;
1401 
1402   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1403 
1404   // We add 1 to number_of_gp_arguments because the thread in arg0 is
1405   // not counted
1406   mov(rscratch1, entry_point);
1407   blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1408   if (retaddr)
1409     bind(*retaddr);
1410 
1411   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1412   maybe_isb();
1413 }
1414 
1415 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1416   call_VM_leaf_base(entry_point, number_of_arguments);
1417 }
1418 
1419 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1420   pass_arg0(this, arg_0);
1421   call_VM_leaf_base(entry_point, 1);
1422 }
1423 
1424 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1425   pass_arg0(this, arg_0);
1426   pass_arg1(this, arg_1);
1427   call_VM_leaf_base(entry_point, 2);
1428 }
1429 
1430 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1431                                   Register arg_1, Register arg_2) {
1432   pass_arg0(this, arg_0);
1433   pass_arg1(this, arg_1);
1434   pass_arg2(this, arg_2);
1435   call_VM_leaf_base(entry_point, 3);
1436 }
1437 
1438 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1439   pass_arg0(this, arg_0);
1440   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1441 }
1442 
1443 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1444 
1445   assert(arg_0 != c_rarg1, "smashed arg");
1446   pass_arg1(this, arg_1);
1447   pass_arg0(this, arg_0);
1448   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1449 }
1450 
1451 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1452   assert(arg_0 != c_rarg2, "smashed arg");
1453   assert(arg_1 != c_rarg2, "smashed arg");
1454   pass_arg2(this, arg_2);
1455   assert(arg_0 != c_rarg1, "smashed arg");
1456   pass_arg1(this, arg_1);
1457   pass_arg0(this, arg_0);
1458   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1459 }
1460 
1461 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1462   assert(arg_0 != c_rarg3, "smashed arg");
1463   assert(arg_1 != c_rarg3, "smashed arg");
1464   assert(arg_2 != c_rarg3, "smashed arg");
1465   pass_arg3(this, arg_3);
1466   assert(arg_0 != c_rarg2, "smashed arg");
1467   assert(arg_1 != c_rarg2, "smashed arg");
1468   pass_arg2(this, arg_2);
1469   assert(arg_0 != c_rarg1, "smashed arg");
1470   pass_arg1(this, arg_1);
1471   pass_arg0(this, arg_0);
1472   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1473 }
1474 
1475 void MacroAssembler::null_check(Register reg, int offset) {
1476   if (needs_explicit_null_check(offset)) {
1477     // provoke an OS NULL exception if reg is NULL by
1478     // accessing M[reg] w/o changing any registers
1479     // NOTE: this is plenty to provoke a segv
1480     ldr(zr, Address(reg));
1481   } else {
1482     // nothing to do; a (later) access of M[reg + offset]
1483     // will provoke an OS NULL exception if reg is NULL
1484   }
1485 }
1486 
1487 // MacroAssembler protected routines needed to implement
1488 // public methods
1489 
1490 void MacroAssembler::mov(Register r, Address dest) {
1491   code_section()->relocate(pc(), dest.rspec());
1492   u_int64_t imm64 = (u_int64_t)dest.target();
1493   movptr(r, imm64);
1494 }
1495 
1496 // Move a constant pointer into r.  In AArch64 mode the virtual
1497 // address space is 48 bits in size, so we only need three
1498 // instructions to create a patchable instruction sequence that can
1499 // reach anywhere.
1500 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1501 #ifndef PRODUCT
1502   {
1503     char buffer[64];
1504     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1505     block_comment(buffer);
1506   }
1507 #endif
1508   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
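       // illustrative: imm64 == 0x123456789abc emits
       //   movz r, #0x9abc
       //   movk r, #0x5678, lsl #16
       //   movk r, #0x1234, lsl #32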
1509   movz(r, imm64 & 0xffff);
1510   imm64 >>= 16;
1511   movk(r, imm64 & 0xffff, 16);
1512   imm64 >>= 16;
1513   movk(r, imm64 & 0xffff, 32);
1514 }
1515 
1516 // Macro to mov replicated immediate to vector register.
1517 //  Vd will get the following values for different arrangements in T
1518 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1519 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1520 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1521 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1522 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1523 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1524 //   T1D/T2D: invalid
1525 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1526   assert(T != T1D && T != T2D, "invalid arrangement");
1527   if (T == T8B || T == T16B) {
1528     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1529     movi(Vd, T, imm32 & 0xff, 0);
1530     return;
1531   }
1532   u_int32_t nimm32 = ~imm32;
1533   if (T == T4H || T == T8H) {
1534     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1535     imm32 &= 0xffff;
1536     nimm32 &= 0xffff;
1537   }
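       // Count the non-zero bytes of imm32 and of its complement; build
       // the value from whichever needs fewer instructions: a MOVI
       // followed by ORRs, or an MVNI followed by BICs.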
1538   u_int32_t x = imm32;
1539   int movi_cnt = 0;
1540   int movn_cnt = 0;
1541   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1542   x = nimm32;
1543   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1544   if (movn_cnt < movi_cnt) imm32 = nimm32;
1545   unsigned lsl = 0;
1546   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1547   if (movn_cnt < movi_cnt)
1548     mvni(Vd, T, imm32 & 0xff, lsl);
1549   else
1550     movi(Vd, T, imm32 & 0xff, lsl);
1551   imm32 >>= 8; lsl += 8;
1552   while (imm32) {
1553     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1554     if (movn_cnt < movi_cnt)
1555       bici(Vd, T, imm32 & 0xff, lsl);
1556     else
1557       orri(Vd, T, imm32 & 0xff, lsl);
1558     lsl += 8; imm32 >>= 8;
1559   }
1560 }
1561 
1562 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1563 {
1564 #ifndef PRODUCT
1565   {
1566     char buffer[64];
1567     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1568     block_comment(buffer);
1569   }
1570 #endif
1571   if (operand_valid_for_logical_immediate(false, imm64)) {
1572     orr(dst, zr, imm64);
1573   } else {
1574     // we can use a combination of MOVZ or MOVN with
1575     // MOVK to build up the constant
1576     u_int64_t imm_h[4];
1577     int zero_count = 0;
1578     int neg_count = 0;
1579     int i;
1580     for (i = 0; i < 4; i++) {
1581       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1582       if (imm_h[i] == 0) {
1583         zero_count++;
1584       } else if (imm_h[i] == 0xffffL) {
1585         neg_count++;
1586       }
1587     }
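         // illustrative: imm64 == 0x00000000dead0000 gives
         // imm_h = { 0x0000, 0xdead, 0x0000, 0x0000 }, so zero_count == 3
         // and a single MOVZ with a 16-bit shift suffices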
1588     if (zero_count == 4) {
1589       // one MOVZ will do
1590       movz(dst, 0);
1591     } else if (neg_count == 4) {
1592       // one MOVN will do
1593       movn(dst, 0);
1594     } else if (zero_count == 3) {
1595       for (i = 0; i < 4; i++) {
1596         if (imm_h[i] != 0L) {
1597           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1598           break;
1599         }
1600       }
1601     } else if (neg_count == 3) {
1602       // one MOVN will do
1603       for (int i = 0; i < 4; i++) {
1604         if (imm_h[i] != 0xffffL) {
1605           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1606           break;
1607         }
1608       }
1609     } else if (zero_count == 2) {
1610       // one MOVZ and one MOVK will do
1611       for (i = 0; i < 3; i++) {
1612         if (imm_h[i] != 0L) {
1613           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1614           i++;
1615           break;
1616         }
1617       }
1618       for (;i < 4; i++) {
1619         if (imm_h[i] != 0L) {
1620           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1621         }
1622       }
1623     } else if (neg_count == 2) {
1624       // one MOVN and one MOVK will do
1625       for (i = 0; i < 4; i++) {
1626         if (imm_h[i] != 0xffffL) {
1627           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1628           i++;
1629           break;
1630         }
1631       }
1632       for (;i < 4; i++) {
1633         if (imm_h[i] != 0xffffL) {
1634           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1635         }
1636       }
1637     } else if (zero_count == 1) {
1638       // one MOVZ and two MOVKs will do
1639       for (i = 0; i < 4; i++) {
1640         if (imm_h[i] != 0L) {
1641           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1642           i++;
1643           break;
1644         }
1645       }
1646       for (;i < 4; i++) {
1647         if (imm_h[i] != 0x0L) {
1648           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1649         }
1650       }
1651     } else if (neg_count == 1) {
1652       // one MOVN and two MOVKs will do
1653       for (i = 0; i < 4; i++) {
1654         if (imm_h[i] != 0xffffL) {
1655           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1656           i++;
1657           break;
1658         }
1659       }
1660       for (;i < 4; i++) {
1661         if (imm_h[i] != 0xffffL) {
1662           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1663         }
1664       }
1665     } else {
1666       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1667       movz(dst, (u_int32_t)imm_h[0], 0);
1668       for (i = 1; i < 4; i++) {
1669         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1670       }
1671     }
1672   }
1673 }
1674 
1675 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1676 {
1677 #ifndef PRODUCT
1678     {
1679       char buffer[64];
1680       snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1681       block_comment(buffer);
1682     }
1683 #endif
1684   if (operand_valid_for_logical_immediate(true, imm32)) {
1685     orrw(dst, zr, imm32);
1686   } else {
1687     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1688     // constant
1689     u_int32_t imm_h[2];
1690     imm_h[0] = imm32 & 0xffff;
1691     imm_h[1] = ((imm32 >> 16) & 0xffff);
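         // illustrative: imm32 == 0xffff1234 has imm_h[1] == 0xffff, so a
         // single MOVNW of 0x1234 ^ 0xffff builds it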
1692     if (imm_h[0] == 0) {
1693       movzw(dst, imm_h[1], 16);
1694     } else if (imm_h[0] == 0xffff) {
1695       movnw(dst, imm_h[1] ^ 0xffff, 16);
1696     } else if (imm_h[1] == 0) {
1697       movzw(dst, imm_h[0], 0);
1698     } else if (imm_h[1] == 0xffff) {
1699       movnw(dst, imm_h[0] ^ 0xffff, 0);
1700     } else {
1701       // use a MOVZ and MOVK (makes it easier to debug)
1702       movzw(dst, imm_h[0], 0);
1703       movkw(dst, imm_h[1], 16);
1704     }
1705   }
1706 }
1707 
1708 // Form an address from base + offset in Rd.  Rd may or may
1709 // not actually be used: you must use the Address that is returned.
1710 // It is up to you to ensure that the shift provided matches the size
1711 // of your data.
1712 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1713   if (Address::offset_ok_for_immed(byte_offset, shift))
1714     // It fits; no need for any heroics
1715     return Address(base, byte_offset);
1716 
1717   // Don't do anything clever with negative or misaligned offsets
1718   unsigned mask = (1 << shift) - 1;
1719   if (byte_offset < 0 || byte_offset & mask) {
1720     mov(Rd, byte_offset);
1721     add(Rd, base, Rd);
1722     return Address(Rd);
1723   }
1724 
1725   // See if we can do this with two 12-bit offsets
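       // (illustrative: for an 8-byte access, byte_offset 0x32008 becomes
       // an add of 0x30000 plus a load/store immediate offset of 0x2008)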
1726   {
1727     unsigned long word_offset = byte_offset >> shift;
1728     unsigned long masked_offset = word_offset & 0xfff000;
1729     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1730         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1731       add(Rd, base, masked_offset << shift);
1732       word_offset -= masked_offset;
1733       return Address(Rd, word_offset << shift);
1734     }
1735   }
1736 
1737   // Do it the hard way
1738   mov(Rd, byte_offset);
1739   add(Rd, base, Rd);
1740   return Address(Rd);
1741 }
1742 
1743 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1744   if (UseLSE) {
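         // A single atomic LDADD does the increment; the old value is
         // discarded into zr.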
1745     mov(tmp, 1);
1746     ldadd(Assembler::word, tmp, zr, counter_addr);
1747     return;
1748   }
1749   Label retry_load;
1750   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
1751     prfm(Address(counter_addr), PSTL1STRM);
1752   bind(retry_load);
1753   // flush and load exclusive from the memory location
1754   ldxrw(tmp, counter_addr);
1755   addw(tmp, tmp, 1);
1756   // if we store+flush with no intervening write tmp2 will be zero
1757   stxrw(tmp2, tmp, counter_addr);
1758   cbnzw(tmp2, retry_load);
1759 }
1760 
1761 
1762 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1763                                     bool want_remainder, Register scratch)
1764 {
1765   // Full implementation of Java idiv and irem.  The function
1766   // returns the (pc) offset of the div instruction - may be needed
1767   // for implicit exceptions.
1768   //
1769   // constraint : ra/rb =/= scratch
1770   //         normal case
1771   //
1772   // input : ra: dividend
1773   //         rb: divisor
1774   //
1775   // result: either
1776   //         quotient  (= ra idiv rb)
1777   //         remainder (= ra irem rb)
1778 
1779   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1780 
1781   int idivl_offset = offset();
1782   if (! want_remainder) {
1783     sdivw(result, ra, rb);
1784   } else {
1785     sdivw(scratch, ra, rb);
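         // remainder = dividend - quotient * divisor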
1786     Assembler::msubw(result, scratch, rb, ra);
1787   }
1788 
1789   return idivl_offset;
1790 }
1791 
1792 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1793                                     bool want_remainder, Register scratch)
1794 {
1795   // Full implementation of Java ldiv and lrem.  The function
1796   // returns the (pc) offset of the div instruction - may be needed
1797   // for implicit exceptions.
1798   //
1799   // constraint : ra/rb =/= scratch
1800   //         normal case
1801   //
1802   // input : ra: dividend
1803   //         rb: divisor
1804   //
1805   // result: either
1806   //         quotient  (= ra idiv rb)
1807   //         remainder (= ra irem rb)
1808 
1809   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1810 
1811   int idivq_offset = offset();
1812   if (! want_remainder) {
1813     sdiv(result, ra, rb);
1814   } else {
1815     sdiv(scratch, ra, rb);
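         // remainder = dividend - quotient * divisor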
1816     Assembler::msub(result, scratch, rb, ra);
1817   }
1818 
1819   return idivq_offset;
1820 }
1821 
1822 void MacroAssembler::membar(Membar_mask_bits order_constraint) {
1823   address prev = pc() - NativeMembar::instruction_size;
1824   address last = code()->last_insn();
1825   if (last != NULL && nativeInstruction_at(last)->is_Membar() && prev == last) {
1826     NativeMembar *bar = NativeMembar_at(prev);
1827     // We are merging two memory barrier instructions.  On AArch64 we
1828     // can do this simply by ORing them together.
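         // (illustrative: a LoadLoad barrier immediately followed by a
         // StoreStore barrier collapses into one barrier with both
         // constraints set)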
1829     bar->set_kind(bar->get_kind() | order_constraint);
1830     BLOCK_COMMENT("merged membar");
1831   } else {
1832     code()->set_last_insn(pc());
1833     dmb(Assembler::barrier(order_constraint));
1834   }
1835 }
1836 
1837 bool MacroAssembler::try_merge_ldst(Register rt, const Address &adr, size_t size_in_bytes, bool is_store) {
1838   if (ldst_can_merge(rt, adr, size_in_bytes, is_store)) {
1839     merge_ldst(rt, adr, size_in_bytes, is_store);
1840     code()->clear_last_insn();
1841     return true;
1842   } else {
1843     assert(size_in_bytes == 8 || size_in_bytes == 4, "only 8-byte or 4-byte loads/stores are supported.");
1844     const unsigned mask = size_in_bytes - 1;
1845     if (adr.getMode() == Address::base_plus_offset &&
1846         (adr.offset() & mask) == 0) { // only supports base_plus_offset.
1847       code()->set_last_insn(pc());
1848     }
1849     return false;
1850   }
1851 }
1852 
1853 void MacroAssembler::ldr(Register Rx, const Address &adr) {
1854   // We always try to merge two adjacent loads into one ldp.
1855   if (!try_merge_ldst(Rx, adr, 8, false)) {
1856     Assembler::ldr(Rx, adr);
1857   }
1858 }
1859 
1860 void MacroAssembler::ldrw(Register Rw, const Address &adr) {
1861   // We always try to merge two adjacent loads into one ldp.
1862   if (!try_merge_ldst(Rw, adr, 4, false)) {
1863     Assembler::ldrw(Rw, adr);
1864   }
1865 }
1866 
1867 void MacroAssembler::str(Register Rx, const Address &adr) {
1868   // We always try to merge two adjacent stores into one stp.
1869   if (!try_merge_ldst(Rx, adr, 8, true)) {
1870     Assembler::str(Rx, adr);
1871   }
1872 }
1873 
1874 void MacroAssembler::strw(Register Rw, const Address &adr) {
1875   // We always try to merge two adjacent stores into one stp.
1876   if (!try_merge_ldst(Rw, adr, 4, true)) {
1877     Assembler::strw(Rw, adr);
1878   }
1879 }
1880 
1881 // MacroAssembler routines found actually to be needed
1882 
1883 void MacroAssembler::push(Register src)
1884 {
1885   str(src, Address(pre(esp, -1 * wordSize)));
1886 }
1887 
1888 void MacroAssembler::pop(Register dst)
1889 {
1890   ldr(dst, Address(post(esp, 1 * wordSize)));
1891 }
1892 
1893 // Note: load_unsigned_short used to be called load_unsigned_word.
1894 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1895   int off = offset();
1896   ldrh(dst, src);
1897   return off;
1898 }
1899 
1900 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1901   int off = offset();
1902   ldrb(dst, src);
1903   return off;
1904 }
1905 
1906 int MacroAssembler::load_signed_short(Register dst, Address src) {
1907   int off = offset();
1908   ldrsh(dst, src);
1909   return off;
1910 }
1911 
1912 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1913   int off = offset();
1914   ldrsb(dst, src);
1915   return off;
1916 }
1917 
1918 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1919   int off = offset();
1920   ldrshw(dst, src);
1921   return off;
1922 }
1923 
1924 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1925   int off = offset();
1926   ldrsbw(dst, src);
1927   return off;
1928 }
1929 
1930 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1931   switch (size_in_bytes) {
1932   case  8:  ldr(dst, src); break;
1933   case  4:  ldrw(dst, src); break;
1934   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1935   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1936   default:  ShouldNotReachHere();
1937   }
1938 }
1939 
1940 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1941   switch (size_in_bytes) {
1942   case  8:  str(src, dst); break;
1943   case  4:  strw(src, dst); break;
1944   case  2:  strh(src, dst); break;
1945   case  1:  strb(src, dst); break;
1946   default:  ShouldNotReachHere();
1947   }
1948 }
1949 
1950 void MacroAssembler::decrementw(Register reg, int value)
1951 {
1952   if (value < 0)  { incrementw(reg, -value);      return; }
1953   if (value == 0) {                               return; }
1954   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1955   /* else */ {
1956     guarantee(reg != rscratch2, "invalid dst for register decrement");
1957     movw(rscratch2, (unsigned)value);
1958     subw(reg, reg, rscratch2);
1959   }
1960 }
1961 
1962 void MacroAssembler::decrement(Register reg, int value)
1963 {
1964   if (value < 0)  { increment(reg, -value);      return; }
1965   if (value == 0) {                              return; }
1966   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1967   /* else */ {
1968     assert(reg != rscratch2, "invalid dst for register decrement");
1969     mov(rscratch2, (unsigned long)value);
1970     sub(reg, reg, rscratch2);
1971   }
1972 }
1973 
1974 void MacroAssembler::decrementw(Address dst, int value)
1975 {
1976   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1977   if (dst.getMode() == Address::literal) {
1978     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1979     lea(rscratch2, dst);
1980     dst = Address(rscratch2);
1981   }
1982   ldrw(rscratch1, dst);
1983   decrementw(rscratch1, value);
1984   strw(rscratch1, dst);
1985 }
1986 
1987 void MacroAssembler::decrement(Address dst, int value)
1988 {
1989   assert(!dst.uses(rscratch1), "invalid address for decrement");
1990   if (dst.getMode() == Address::literal) {
1991     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
1992     lea(rscratch2, dst);
1993     dst = Address(rscratch2);
1994   }
1995   ldr(rscratch1, dst);
1996   decrement(rscratch1, value);
1997   str(rscratch1, dst);
1998 }
1999 
2000 void MacroAssembler::incrementw(Register reg, int value)
2001 {
2002   if (value < 0)  { decrementw(reg, -value);      return; }
2003   if (value == 0) {                               return; }
2004   if (value < (1 << 12)) { addw(reg, reg, value); return; }
2005   /* else */ {
2006     assert(reg != rscratch2, "invalid dst for register increment");
2007     movw(rscratch2, (unsigned)value);
2008     addw(reg, reg, rscratch2);
2009   }
2010 }
2011 
2012 void MacroAssembler::increment(Register reg, int value)
2013 {
2014   if (value < 0)  { decrement(reg, -value);      return; }
2015   if (value == 0) {                              return; }
2016   if (value < (1 << 12)) { add(reg, reg, value); return; }
2017   /* else */ {
2018     assert(reg != rscratch2, "invalid dst for register increment");
2019     movw(rscratch2, (unsigned)value);
2020     add(reg, reg, rscratch2);
2021   }
2022 }
2023 
2024 void MacroAssembler::incrementw(Address dst, int value)
2025 {
2026   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2027   if (dst.getMode() == Address::literal) {
2028     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2029     lea(rscratch2, dst);
2030     dst = Address(rscratch2);
2031   }
2032   ldrw(rscratch1, dst);
2033   incrementw(rscratch1, value);
2034   strw(rscratch1, dst);
2035 }
2036 
2037 void MacroAssembler::increment(Address dst, int value)
2038 {
2039   assert(!dst.uses(rscratch1), "invalid dst for address increment");
2040   if (dst.getMode() == Address::literal) {
2041     assert(abs(value) < (1 << 12), "invalid value and address mode combination");
2042     lea(rscratch2, dst);
2043     dst = Address(rscratch2);
2044   }
2045   ldr(rscratch1, dst);
2046   increment(rscratch1, value);
2047   str(rscratch1, dst);
2048 }
2049 
2050 
2051 void MacroAssembler::pusha() {
2052   push(0x7fffffff, sp);
2053 }
2054 
2055 void MacroAssembler::popa() {
2056   pop(0x7fffffff, sp);
2057 }
2058 
2059 // Push lots of registers in the bit set supplied.  Don't push sp.
2060 // Return the number of words pushed
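     // (illustrative: bitset 0b0111 pushes r0, r1, r2 padded with zr; the
     // first stp pre-decrements the stack by 4 words and stores r0/r1, a
     // second stp stores r2 and the zr padding above them)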
2061 int MacroAssembler::push(unsigned int bitset, Register stack) {
2062   int words_pushed = 0;
2063 
2064   // Scan bitset to accumulate register pairs
2065   unsigned char regs[32];
2066   int count = 0;
2067   for (int reg = 0; reg <= 30; reg++) {
2068     if (1 & bitset)
2069       regs[count++] = reg;
2070     bitset >>= 1;
2071   }
2072   regs[count++] = zr->encoding_nocheck();
2073   count &= ~1;  // Only push an even number of regs
2074 
2075   if (count) {
2076     stp(as_Register(regs[0]), as_Register(regs[1]),
2077        Address(pre(stack, -count * wordSize)));
2078     words_pushed += 2;
2079   }
2080   for (int i = 2; i < count; i += 2) {
2081     stp(as_Register(regs[i]), as_Register(regs[i+1]),
2082        Address(stack, i * wordSize));
2083     words_pushed += 2;
2084   }
2085 
2086   assert(words_pushed == count, "oops, pushed != count");
2087 
2088   return count;
2089 }
2090 
2091 int MacroAssembler::pop(unsigned int bitset, Register stack) {
2092   int words_pushed = 0;
2093 
2094   // Scan bitset to accumulate register pairs
2095   unsigned char regs[32];
2096   int count = 0;
2097   for (int reg = 0; reg <= 30; reg++) {
2098     if (1 & bitset)
2099       regs[count++] = reg;
2100     bitset >>= 1;
2101   }
2102   regs[count++] = zr->encoding_nocheck();
2103   count &= ~1;
2104 
2105   for (int i = 2; i < count; i += 2) {
2106     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
2107        Address(stack, i * wordSize));
2108     words_pushed += 2;
2109   }
2110   if (count) {
2111     ldp(as_Register(regs[0]), as_Register(regs[1]),
2112        Address(post(stack, count * wordSize)));
2113     words_pushed += 2;
2114   }
2115 
2116   assert(words_pushed == count, "oops, pushed != count");
2117 
2118   return count;
2119 }
2120 #ifdef ASSERT
2121 void MacroAssembler::verify_heapbase(const char* msg) {
2122 #if 0
2123   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2124   assert (Universe::heap() != NULL, "java heap should be initialized");
2125   if (CheckCompressedOops) {
2126     Label ok;
2127     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2128     cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2129     br(Assembler::EQ, ok);
2130     stop(msg);
2131     bind(ok);
2132     pop(1 << rscratch1->encoding(), sp);
2133   }
2134 #endif
2135 }
2136 #endif
2137 
2138 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
2139   Label done, not_weak;
2140   cbz(value, done);           // Use NULL as-is.
2141 
2142   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
2143   tbz(value, 0, not_weak); // Test for jweak tag.
2144 
2145   // Resolve jweak.
2146   access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, value,
2147                  Address(value, -JNIHandles::weak_tag_value), tmp, thread);
2148   verify_oop(value);
2149   b(done);
2150 
2151   bind(not_weak);
2152   // Resolve (untagged) jobject.
2153   access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread);
2154   verify_oop(value);
2155   bind(done);
2156 }
2157 
2158 void MacroAssembler::stop(const char* msg) {
2159   address ip = pc();
2160   pusha();
2161   mov(c_rarg0, (address)msg);
2162   mov(c_rarg1, (address)ip);
2163   mov(c_rarg2, sp);
2164   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2165   // call(c_rarg3);
2166   blrt(c_rarg3, 3, 0, 1);
2167   hlt(0);
2168 }
2169 
2170 void MacroAssembler::warn(const char* msg) {
2171   pusha();
2172   mov(c_rarg0, (address)msg);
2173   mov(lr, CAST_FROM_FN_PTR(address, warning));
2174   blrt(lr, 1, 0, MacroAssembler::ret_type_void);
2175   popa();
2176 }
2177 
2178 void MacroAssembler::unimplemented(const char* what) {
2179   const char* buf = NULL;
2180   {
2181     ResourceMark rm;
2182     stringStream ss;
2183     ss.print("unimplemented: %s", what);
2184     buf = code_string(ss.as_string());
2185   }
2186   stop(buf);
2187 }
2188 
2189 // If a constant does not fit in an immediate field, generate some
2190 // number of MOV instructions and then perform the operation.
2191 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2192                                            add_sub_imm_insn insn1,
2193                                            add_sub_reg_insn insn2) {
2194   assert(Rd != zr, "Rd = zr and not setting flags?");
2195   if (operand_valid_for_add_sub_immediate((int)imm)) {
2196     (this->*insn1)(Rd, Rn, imm);
2197   } else {
2198     if (uabs(imm) < (1 << 24)) {
2199        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2200        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2201     } else {
2202        assert_different_registers(Rd, Rn);
2203        mov(Rd, (uint64_t)imm);
2204        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2205     }
2206   }
2207 }
2208 
2209 // Separate version which sets the flags. Optimisations are more restricted
2210 // because we must set the flags correctly.
2211 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2212                                            add_sub_imm_insn insn1,
2213                                            add_sub_reg_insn insn2) {
2214   if (operand_valid_for_add_sub_immediate((int)imm)) {
2215     (this->*insn1)(Rd, Rn, imm);
2216   } else {
2217     assert_different_registers(Rd, Rn);
2218     assert(Rd != zr, "overflow in immediate operand");
2219     mov(Rd, (uint64_t)imm);
2220     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2221   }
2222 }
2223 
2224 
2225 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2226   if (increment.is_register()) {
2227     add(Rd, Rn, increment.as_register());
2228   } else {
2229     add(Rd, Rn, increment.as_constant());
2230   }
2231 }
2232 
2233 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2234   if (increment.is_register()) {
2235     addw(Rd, Rn, increment.as_register());
2236   } else {
2237     addw(Rd, Rn, increment.as_constant());
2238   }
2239 }
2240 
2241 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2242   if (decrement.is_register()) {
2243     sub(Rd, Rn, decrement.as_register());
2244   } else {
2245     sub(Rd, Rn, decrement.as_constant());
2246   }
2247 }
2248 
2249 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2250   if (decrement.is_register()) {
2251     subw(Rd, Rn, decrement.as_register());
2252   } else {
2253     subw(Rd, Rn, decrement.as_constant());
2254   }
2255 }
2256 
2257 void MacroAssembler::reinit_heapbase()
2258 {
2259   if (UseCompressedOops) {
2260     if (Universe::is_fully_initialized()) {
2261       mov(rheapbase, Universe::narrow_ptrs_base());
2262     } else {
2263       lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2264       ldr(rheapbase, Address(rheapbase));
2265     }
2266   }
2267 }
2268 
2269 // this simulates the behaviour of the x86 cmpxchg instruction using a
2270 // load linked/store conditional pair. we use the acquire/release
2271 // versions of these instructions so that we flush pending writes as
2272 // per Java semantics.
2273 
2274 // n.b. the x86 version assumes the old value to be compared against is
2275 // in rax and updates rax with the value located in memory if the
2276 // cmpxchg fails. we supply a register for the old value explicitly
2277 
2278 // the aarch64 load linked/store conditional instructions do not
2279 // accept an offset. so, unlike x86, we must provide a plain register
2280 // to identify the memory word to be compared/exchanged rather than a
2281 // register+offset Address.
2282 
2283 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2284                                 Label &succeed, Label *fail) {
2285   // oldv holds comparison value
2286   // newv holds value to write in exchange
2287   // addr identifies memory word to compare against/update
2288   if (UseLSE) {
2289     mov(tmp, oldv);
2290     casal(Assembler::xword, oldv, newv, addr);
2291     cmp(tmp, oldv);
2292     br(Assembler::EQ, succeed);
2293     membar(AnyAny);
2294   } else {
2295     Label retry_load, nope;
2296     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2297       prfm(Address(addr), PSTL1STRM);
2298     bind(retry_load);
2299     // flush and load exclusive from the memory location
2300     // and fail if it is not what we expect
2301     ldaxr(tmp, addr);
2302     cmp(tmp, oldv);
2303     br(Assembler::NE, nope);
2304     // if we store+flush with no intervening write tmp will be zero
2305     stlxr(tmp, newv, addr);
2306     cbzw(tmp, succeed);
2307     // retry so we only ever return after a load fails to compare;
2308     // this ensures we don't return a stale value after a failed write.
2309     b(retry_load);
2310     // if the memory word differs we return it in oldv and signal a fail
2311     bind(nope);
2312     membar(AnyAny);
2313     mov(oldv, tmp);
2314   }
2315   if (fail)
2316     b(*fail);
2317 }
2318 
2319 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
2320                                         Label &succeed, Label *fail) {
2321   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
2322   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
2323 }
2324 
2325 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2326                                 Label &succeed, Label *fail) {
2327   // oldv holds comparison value
2328   // newv holds value to write in exchange
2329   // addr identifies memory word to compare against/update
2330   // tmp returns 0/1 for success/failure
2331   if (UseLSE) {
2332     mov(tmp, oldv);
2333     casal(Assembler::word, oldv, newv, addr);
2334     cmp(tmp, oldv);
2335     br(Assembler::EQ, succeed);
2336     membar(AnyAny);
2337   } else {
2338     Label retry_load, nope;
2339     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2340       prfm(Address(addr), PSTL1STRM);
2341     bind(retry_load);
2342     // flush and load exclusive from the memory location
2343     // and fail if it is not what we expect
2344     ldaxrw(tmp, addr);
2345     cmp(tmp, oldv);
2346     br(Assembler::NE, nope);
2347     // if we store+flush with no intervening write tmp will be zero
2348     stlxrw(tmp, newv, addr);
2349     cbzw(tmp, succeed);
2350     // retry so we only ever return after a load fails to compare;
2351     // this ensures we don't return a stale value after a failed write.
2352     b(retry_load);
2353     // if the memory word differs we return it in oldv and signal a fail
2354     bind(nope);
2355     membar(AnyAny);
2356     mov(oldv, tmp);
2357   }
2358   if (fail)
2359     b(*fail);
2360 }
2361 
2362 // A generic CAS; success or failure is in the EQ flag.  A weak CAS
2363 // doesn't retry and may fail spuriously.  If the oldval is wanted,
2364 // pass a register for the result, otherwise pass noreg.
2365 
2366 // Clobbers rscratch1
2367 void MacroAssembler::cmpxchg(Register addr, Register expected,
2368                              Register new_val,
2369                              enum operand_size size,
2370                              bool acquire, bool release,
2371                              bool weak,
2372                              Register result) {
2373   if (result == noreg)  result = rscratch1;
2374   BLOCK_COMMENT("cmpxchg {");
2375   if (UseLSE) {
2376     mov(result, expected);
2377     lse_cas(result, new_val, addr, size, acquire, release, /*not_pair*/ true);
2378     compare_eq(result, expected, size);
2379   } else {
2380     Label retry_load, done;
2381     if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))
2382       prfm(Address(addr), PSTL1STRM);
2383     bind(retry_load);
2384     load_exclusive(result, addr, size, acquire);
2385     compare_eq(result, expected, size);
2386     br(Assembler::NE, done);
2387     store_exclusive(rscratch1, new_val, addr, size, release);
2388     if (weak) {
2389       cmpw(rscratch1, 0u);  // If the store fails, return NE to our caller.
2390     } else {
2391       cbnzw(rscratch1, retry_load);
2392     }
2393     bind(done);
2394   }
2395   BLOCK_COMMENT("} cmpxchg");
2396 }
2397 
2398 // A generic comparison. Only compares for equality, clobbers rscratch1.
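     // There is no 16- or 8-bit compare instruction, so for halfword and
     // byte we XOR the operands and test only the low bits: Z is then set
     // iff the relevant bits are equal.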
2399 void MacroAssembler::compare_eq(Register rm, Register rn, enum operand_size size) {
2400   if (size == xword) {
2401     cmp(rm, rn);
2402   } else if (size == word) {
2403     cmpw(rm, rn);
2404   } else if (size == halfword) {
2405     eorw(rscratch1, rm, rn);
2406     ands(zr, rscratch1, 0xffff);
2407   } else if (size == byte) {
2408     eorw(rscratch1, rm, rn);
2409     ands(zr, rscratch1, 0xff);
2410   } else {
2411     ShouldNotReachHere();
2412   }
2413 }
2414 
2415 
2416 static bool different(Register a, RegisterOrConstant b, Register c) {
2417   if (b.is_constant())
2418     return a != c;
2419   else
2420     return a != b.as_register() && a != c && b.as_register() != c;
2421 }
2422 
2423 #define ATOMIC_OP(NAME, LDXR, OP, IOP, AOP, STXR, sz)                   \
2424 void MacroAssembler::atomic_##NAME(Register prev, RegisterOrConstant incr, Register addr) { \
2425   if (UseLSE) {                                                         \
2426     prev = prev->is_valid() ? prev : zr;                                \
2427     if (incr.is_register()) {                                           \
2428       AOP(sz, incr.as_register(), prev, addr);                          \
2429     } else {                                                            \
2430       mov(rscratch2, incr.as_constant());                               \
2431       AOP(sz, rscratch2, prev, addr);                                   \
2432     }                                                                   \
2433     return;                                                             \
2434   }                                                                     \
2435   Register result = rscratch2;                                          \
2436   if (prev->is_valid())                                                 \
2437     result = different(prev, incr, addr) ? prev : rscratch2;            \
2438                                                                         \
2439   Label retry_load;                                                     \
2440   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2441     prfm(Address(addr), PSTL1STRM);                                     \
2442   bind(retry_load);                                                     \
2443   LDXR(result, addr);                                                   \
2444   OP(rscratch1, result, incr);                                          \
2445   STXR(rscratch2, rscratch1, addr);                                     \
2446   cbnzw(rscratch2, retry_load);                                         \
2447   if (prev->is_valid() && prev != result) {                             \
2448     IOP(prev, rscratch1, incr);                                         \
2449   }                                                                     \
2450 }
2451 
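     // Instantiate relaxed (ldxr/stxr) and acquire/release (ldaxr/stlxr)
     // variants; with LSE each becomes a single LDADD/LDADDAL.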
2452 ATOMIC_OP(add, ldxr, add, sub, ldadd, stxr, Assembler::xword)
2453 ATOMIC_OP(addw, ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2454 ATOMIC_OP(addal, ldaxr, add, sub, ldaddal, stlxr, Assembler::xword)
2455 ATOMIC_OP(addalw, ldaxrw, addw, subw, ldaddal, stlxrw, Assembler::word)
2456 
2457 #undef ATOMIC_OP
2458 
2459 #define ATOMIC_XCHG(OP, AOP, LDXR, STXR, sz)                            \
2460 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2461   if (UseLSE) {                                                         \
2462     prev = prev->is_valid() ? prev : zr;                                \
2463     AOP(sz, newv, prev, addr);                                          \
2464     return;                                                             \
2465   }                                                                     \
2466   Register result = rscratch2;                                          \
2467   if (prev->is_valid())                                                 \
2468     result = different(prev, newv, addr) ? prev : rscratch2;            \
2469                                                                         \
2470   Label retry_load;                                                     \
2471   if ((VM_Version::features() & VM_Version::CPU_STXR_PREFETCH))         \
2472     prfm(Address(addr), PSTL1STRM);                                     \
2473   bind(retry_load);                                                     \
2474   LDXR(result, addr);                                                   \
2475   STXR(rscratch1, newv, addr);                                          \
2476   cbnzw(rscratch1, retry_load);                                         \
2477   if (prev->is_valid() && prev != result)                               \
2478     mov(prev, result);                                                  \
2479 }
2480 
2481 ATOMIC_XCHG(xchg, swp, ldxr, stxr, Assembler::xword)
2482 ATOMIC_XCHG(xchgw, swp, ldxrw, stxrw, Assembler::word)
2483 ATOMIC_XCHG(xchgal, swpal, ldaxr, stlxr, Assembler::xword)
2484 ATOMIC_XCHG(xchgalw, swpal, ldaxrw, stlxrw, Assembler::word)
2485 
2486 #undef ATOMIC_XCHG
2487 
2488 #ifndef PRODUCT
2489 extern "C" void findpc(intptr_t x);
2490 #endif
2491 
2492 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2493 {
2494   // In order to get locks to work, we need to fake an in_VM state
2495   if (ShowMessageBoxOnError) {
2496     JavaThread* thread = JavaThread::current();
2497     JavaThreadState saved_state = thread->thread_state();
2498     thread->set_thread_state(_thread_in_vm);
2499 #ifndef PRODUCT
2500     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2501       ttyLocker ttyl;
2502       BytecodeCounter::print();
2503     }
2504 #endif
2505     if (os::message_box(msg, "Execution stopped, print registers?")) {
2506       ttyLocker ttyl;
2507       tty->print_cr(" pc = 0x%016lx", pc);
2508 #ifndef PRODUCT
2509       tty->cr();
2510       findpc(pc);
2511       tty->cr();
2512 #endif
2513       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2514       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2515       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2516       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2517       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2518       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2519       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2520       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2521       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2522       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2523       tty->print_cr("r10 = 0x%016lx", regs[10]);
2524       tty->print_cr("r11 = 0x%016lx", regs[11]);
2525       tty->print_cr("r12 = 0x%016lx", regs[12]);
2526       tty->print_cr("r13 = 0x%016lx", regs[13]);
2527       tty->print_cr("r14 = 0x%016lx", regs[14]);
2528       tty->print_cr("r15 = 0x%016lx", regs[15]);
2529       tty->print_cr("r16 = 0x%016lx", regs[16]);
2530       tty->print_cr("r17 = 0x%016lx", regs[17]);
2531       tty->print_cr("r18 = 0x%016lx", regs[18]);
2532       tty->print_cr("r19 = 0x%016lx", regs[19]);
2533       tty->print_cr("r20 = 0x%016lx", regs[20]);
2534       tty->print_cr("r21 = 0x%016lx", regs[21]);
2535       tty->print_cr("r22 = 0x%016lx", regs[22]);
2536       tty->print_cr("r23 = 0x%016lx", regs[23]);
2537       tty->print_cr("r24 = 0x%016lx", regs[24]);
2538       tty->print_cr("r25 = 0x%016lx", regs[25]);
2539       tty->print_cr("r26 = 0x%016lx", regs[26]);
2540       tty->print_cr("r27 = 0x%016lx", regs[27]);
2541       tty->print_cr("r28 = 0x%016lx", regs[28]);
2542       tty->print_cr("r30 = 0x%016lx", regs[30]);
2543       tty->print_cr("r31 = 0x%016lx", regs[31]);
2544       BREAKPOINT;
2545     }
2546     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2547   } else {
2548     ttyLocker ttyl;
2549     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2550                     msg);
2551     assert(false, "DEBUG MESSAGE: %s", msg);
2552   }
2553 }
2554 
2555 #ifdef BUILTIN_SIM
2556 // routine to generate an x86 prolog for a stub function which
2557 // bootstraps into the generated ARM code which directly follows the
2558 // stub
2559 //
2560 // the argument encodes the number of general and fp registers
2561 // passed by the caller and the calling convention (currently just
2562 // the number of general registers and assumes C argument passing)
2563 
2564 extern "C" {
2565 int aarch64_stub_prolog_size();
2566 void aarch64_stub_prolog();
2567 void aarch64_prolog();
2568 }
2569 
2570 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
2571                                    address *prolog_ptr)
2572 {
2573   int calltype = (((ret_type & 0x3) << 8) |
2574                   ((fp_arg_count & 0xf) << 4) |
2575                   (gp_arg_count & 0xf));
2576 
2577   // the addresses for the x86 to ARM entry code we need to use
2578   address start = pc();
2579   // printf("start = %lx\n", start);
2580   int byteCount =  aarch64_stub_prolog_size();
2581   // printf("byteCount = %x\n", byteCount);
2582   int instructionCount = (byteCount + 3)/ 4;
2583   // printf("instructionCount = %x\n", instructionCount);
2584   for (int i = 0; i < instructionCount; i++) {
2585     nop();
2586   }
2587 
2588   memcpy(start, (void*)aarch64_stub_prolog, byteCount);
2589 
2590   // write the address of the setup routine and the call format at the
2591   // end of the copied code
2592   u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
2593   if (prolog_ptr)
2594     patch_end[-2] = (u_int64_t)prolog_ptr;
2595   patch_end[-1] = calltype;
2596 }
2597 #endif
2598 
2599 void MacroAssembler::push_call_clobbered_registers() {
2600   int step = 4 * wordSize;
2601   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2602   sub(sp, sp, step);
2603   mov(rscratch1, -step);
2604   // Push v0-v7, v16-v31.
2605   for (int i = 31; i>= 4; i -= 4) {
2606     if (i <= v7->encoding() || i >= v16->encoding())
2607       st1(as_FloatRegister(i-3), as_FloatRegister(i-2), as_FloatRegister(i-1),
2608           as_FloatRegister(i), T1D, Address(post(sp, rscratch1)));
2609   }
2610   st1(as_FloatRegister(0), as_FloatRegister(1), as_FloatRegister(2),
2611       as_FloatRegister(3), T1D, Address(sp));
2612 }
2613 
2614 void MacroAssembler::pop_call_clobbered_registers() {
2615   for (int i = 0; i < 32; i += 4) {
2616     if (i <= v7->encoding() || i >= v16->encoding())
2617       ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2618           as_FloatRegister(i+3), T1D, Address(post(sp, 4 * wordSize)));
2619   }
2620 
2621   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2622 }
2623 
2624 void MacroAssembler::push_CPU_state(bool save_vectors) {
2625   int step = (save_vectors ? 8 : 4) * wordSize;
2626   push(0x3fffffff, sp);         // integer registers except lr & sp
2627   mov(rscratch1, -step);
2628   sub(sp, sp, step);
2629   for (int i = 28; i >= 4; i -= 4) {
2630     st1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2631         as_FloatRegister(i+3), save_vectors ? T2D : T1D, Address(post(sp, rscratch1)));
2632   }
2633   st1(v0, v1, v2, v3, save_vectors ? T2D : T1D, sp);
2634 }
2635 
2636 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2637   int step = (restore_vectors ? 8 : 4) * wordSize;
2638   for (int i = 0; i <= 28; i += 4)
2639     ld1(as_FloatRegister(i), as_FloatRegister(i+1), as_FloatRegister(i+2),
2640         as_FloatRegister(i+3), restore_vectors ? T2D : T1D, Address(post(sp, step)));
2641   pop(0x3fffffff, sp);         // integer registers except lr & sp
2642 }
2643 
2644 /**
2645  * Helpers for multiply_to_len().
2646  */
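     // Adds src1 and src2 into the 128-bit pair (dest_hi:dest_lo),
     // propagating carries; the final high word is written to
     // final_dest_hi.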
2647 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2648                                      Register src1, Register src2) {
2649   adds(dest_lo, dest_lo, src1);
2650   adc(dest_hi, dest_hi, zr);
2651   adds(dest_lo, dest_lo, src2);
2652   adc(final_dest_hi, dest_hi, zr);
2653 }
2654 
2655 // Generate an address from (r + r1 extend offset).  "size" is the
2656 // size of the operand.  The result may be in rscratch2.
2657 Address MacroAssembler::offsetted_address(Register r, Register r1,
2658                                           Address::extend ext, int offset, int size) {
2659   if (offset || (ext.shift() % size != 0)) {
2660     lea(rscratch2, Address(r, r1, ext));
2661     return Address(rscratch2, offset);
2662   } else {
2663     return Address(r, r1, ext);
2664   }
2665 }
2666 
2667 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2668 {
2669   assert(offset >= 0, "spill to negative address?");
2670   // Offset reachable ?
2671   //   Not aligned - 9 bits signed offset
2672   //   Aligned - 12 bits unsigned offset shifted
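       // If the offset is not directly reachable, peel off the low 12 bits
       // (for unaligned offsets) and/or bits 12..23 into an add so that the
       // remainder fits one of the immediate forms above.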
2673   Register base = sp;
2674   if ((offset & (size-1)) && offset >= (1<<8)) {
2675     add(tmp, base, offset & ((1<<12)-1));
2676     base = tmp;
2677     offset &= -1<<12;
2678   }
2679 
2680   if (offset >= (1<<12) * size) {
2681     add(tmp, base, offset & (((1<<12)-1)<<12));
2682     base = tmp;
2683     offset &= ~(((1<<12)-1)<<12);
2684   }
2685 
2686   return Address(base, offset);
2687 }
2688 
2689 // Checks whether offset is aligned.
2690 // Returns true if it is, else false.
2691 bool MacroAssembler::merge_alignment_check(Register base,
2692                                            size_t size,
2693                                            long cur_offset,
2694                                            long prev_offset) const {
2695   if (AvoidUnalignedAccesses) {
2696     if (base == sp) {
2697       // Checks whether the low offset is aligned for a register-pair access.
2698       long pair_mask = size * 2 - 1;
2699       long offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2700       return (offset & pair_mask) == 0;
2701     } else { // If base is not sp, we can't guarantee the access is aligned.
2702       return false;
2703     }
2704   } else {
2705     long mask = size - 1;
2706     // Load/store pair instructions only support element-size-aligned offsets.
2707     return (cur_offset & mask) == 0 && (prev_offset & mask) == 0;
2708   }
2709 }
2710 
2711 // Checks whether current and previous loads/stores can be merged.
2712 // Returns true if it can be merged, else false.
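     // For example (illustrative):
     //   ldr x1, [sp, #16]
     //   ldr x2, [sp, #24]
     // can be rewritten as
     //   ldp x1, x2, [sp, #16]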
2713 bool MacroAssembler::ldst_can_merge(Register rt,
2714                                     const Address &adr,
2715                                     size_t cur_size_in_bytes,
2716                                     bool is_store) const {
2717   address prev = pc() - NativeInstruction::instruction_size;
2718   address last = code()->last_insn();
2719 
2720   if (last == NULL || !nativeInstruction_at(last)->is_Imm_LdSt()) {
2721     return false;
2722   }
2723 
2724   if (adr.getMode() != Address::base_plus_offset || prev != last) {
2725     return false;
2726   }
2727 
2728   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2729   size_t prev_size_in_bytes = prev_ldst->size_in_bytes();
2730 
2731   assert(prev_size_in_bytes == 4 || prev_size_in_bytes == 8, "only supports 64/32bit merging.");
2732   assert(cur_size_in_bytes == 4 || cur_size_in_bytes == 8, "only supports 64/32bit merging.");
2733 
2734   if (cur_size_in_bytes != prev_size_in_bytes || is_store != prev_ldst->is_store()) {
2735     return false;
2736   }
2737 
2738   long max_offset = 63 * prev_size_in_bytes;
2739   long min_offset = -64 * prev_size_in_bytes;
2740 
2741   assert(prev_ldst->is_not_pre_post_index(), "merging pre-index or post-index accesses is not supported.");
2742 
2743   // Only same base can be merged.
2744   if (adr.base() != prev_ldst->base()) {
2745     return false;
2746   }
2747 
2748   long cur_offset = adr.offset();
2749   long prev_offset = prev_ldst->offset();
2750   size_t diff = abs(cur_offset - prev_offset);
2751   if (diff != prev_size_in_bytes) {
2752     return false;
2753   }
2754 
2755   // The following cases cannot be merged:
2756   // ldr x2, [x2, #8]
2757   // ldr x3, [x2, #16]
2758   // or:
2759   // ldr x2, [x3, #8]
2760   // ldr x2, [x3, #16]
  // If t1 and t2 are the same in "ldp t1, t2, [xn, #imm]", we'll get SIGILL.
2762   if (!is_store && (adr.base() == prev_ldst->target() || rt == prev_ldst->target())) {
2763     return false;
2764   }
2765 
2766   long low_offset = prev_offset > cur_offset ? cur_offset : prev_offset;
2767   // Offset range must be in ldp/stp instruction's range.
2768   if (low_offset > max_offset || low_offset < min_offset) {
2769     return false;
2770   }
2771 
2772   if (merge_alignment_check(adr.base(), prev_size_in_bytes, cur_offset, prev_offset)) {
2773     return true;
2774   }
2775 
2776   return false;
2777 }
2778 
2779 // Merge current load/store with previous load/store into ldp/stp.
2780 void MacroAssembler::merge_ldst(Register rt,
2781                                 const Address &adr,
2782                                 size_t cur_size_in_bytes,
2783                                 bool is_store) {
2784 
  assert(ldst_can_merge(rt, adr, cur_size_in_bytes, is_store), "cur and prev must be able to be merged.");
2786 
2787   Register rt_low, rt_high;
2788   address prev = pc() - NativeInstruction::instruction_size;
2789   NativeLdSt* prev_ldst = NativeLdSt_at(prev);
2790 
2791   long offset;
2792 
2793   if (adr.offset() < prev_ldst->offset()) {
2794     offset = adr.offset();
2795     rt_low = rt;
2796     rt_high = prev_ldst->target();
2797   } else {
2798     offset = prev_ldst->offset();
2799     rt_low = prev_ldst->target();
2800     rt_high = rt;
2801   }
2802 
2803   Address adr_p = Address(prev_ldst->base(), offset);
  // Overwrite the previously generated instruction.
2805   code_section()->set_end(prev);
2806 
2807   const int sz = prev_ldst->size_in_bytes();
2808   assert(sz == 8 || sz == 4, "only supports 64/32bit merging.");
2809   if (!is_store) {
2810     BLOCK_COMMENT("merged ldr pair");
2811     if (sz == 8) {
2812       ldp(rt_low, rt_high, adr_p);
2813     } else {
2814       ldpw(rt_low, rt_high, adr_p);
2815     }
2816   } else {
2817     BLOCK_COMMENT("merged str pair");
2818     if (sz == 8) {
2819       stp(rt_low, rt_high, adr_p);
2820     } else {
2821       stpw(rt_low, rt_high, adr_p);
2822     }
2823   }
2824 }
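
// For example, if the previous instruction was "ldr x1, [sp, #16]" and the
// current request is "ldr x2, [sp, #24]", the code above truncates the code
// section by one instruction and emits "ldp x1, x2, [sp, #16]" instead.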
2825 
/**
 * Multiply 64-bit by 64-bit: first loop.
 */
2829 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2830                                            Register y, Register y_idx, Register z,
2831                                            Register carry, Register product,
2832                                            Register idx, Register kdx) {
2833   //
2834   //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2836   //    huge_128 product = y[idx] * x[xstart] + carry;
2837   //    z[kdx] = (jlong)product;
2838   //    carry  = (jlong)(product >>> 64);
2839   //  }
2840   //  z[xstart] = carry;
2841   //
2842 
2843   Label L_first_loop, L_first_loop_exit;
2844   Label L_one_x, L_one_y, L_multiply;
2845 
2846   subsw(xstart, xstart, 1);
2847   br(Assembler::MI, L_one_x);
2848 
2849   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2850   ldr(x_xstart, Address(rscratch1));
2851   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2852 
2853   bind(L_first_loop);
2854   subsw(idx, idx, 1);
2855   br(Assembler::MI, L_first_loop_exit);
2856   subsw(idx, idx, 1);
2857   br(Assembler::MI, L_one_y);
2858   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2859   ldr(y_idx, Address(rscratch1));
2860   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2861   bind(L_multiply);
2862 
2863   // AArch64 has a multiply-accumulate instruction that we can't use
2864   // here because it has no way to process carries, so we have to use
2865   // separate add and adc instructions.  Bah.
2866   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2867   mul(product, x_xstart, y_idx);
2868   adds(product, product, carry);
2869   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2870 
2871   subw(kdx, kdx, 2);
2872   ror(product, product, 32); // back to big-endian
2873   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2874 
2875   b(L_first_loop);
2876 
2877   bind(L_one_y);
2878   ldrw(y_idx, Address(y,  0));
2879   b(L_multiply);
2880 
2881   bind(L_one_x);
2882   ldrw(x_xstart, Address(x,  0));
2883   b(L_first_loop);
2884 
2885   bind(L_first_loop_exit);
2886 }
2887 
/**
 * Multiply 128-bit by 128-bit. Unrolled inner loop.
 */
2892 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2893                                              Register carry, Register carry2,
2894                                              Register idx, Register jdx,
2895                                              Register yz_idx1, Register yz_idx2,
2896                                              Register tmp, Register tmp3, Register tmp4,
2897                                              Register tmp6, Register product_hi) {
2898 
2899   //   jlong carry, x[], y[], z[];
2900   //   int kdx = ystart+1;
2901   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2902   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2903   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2904   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2905   //     carry  = (jlong)(tmp4 >>> 64);
2906   //     z[kdx+idx+1] = (jlong)tmp3;
2907   //     z[kdx+idx] = (jlong)tmp4;
2908   //   }
2909   //   idx += 2;
2910   //   if (idx > 0) {
2911   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2912   //     z[kdx+idx] = (jlong)yz_idx1;
2913   //     carry  = (jlong)(yz_idx1 >>> 64);
2914   //   }
2915   //
2916 
2917   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2918 
2919   lsrw(jdx, idx, 2);
2920 
2921   bind(L_third_loop);
2922 
2923   subsw(jdx, jdx, 1);
2924   br(Assembler::MI, L_third_loop_exit);
2925   subw(idx, idx, 4);
2926 
2927   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2928 
2929   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2930 
2931   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2932 
2933   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2934   ror(yz_idx2, yz_idx2, 32);
2935 
2936   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2937 
2938   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2939   umulh(tmp4, product_hi, yz_idx1);
2940 
2941   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2942   ror(rscratch2, rscratch2, 32);
2943 
2944   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2945   umulh(carry2, product_hi, yz_idx2);
2946 
2947   // propagate sum of both multiplications into carry:tmp4:tmp3
2948   adds(tmp3, tmp3, carry);
2949   adc(tmp4, tmp4, zr);
2950   adds(tmp3, tmp3, rscratch1);
2951   adcs(tmp4, tmp4, tmp);
2952   adc(carry, carry2, zr);
2953   adds(tmp4, tmp4, rscratch2);
2954   adc(carry, carry, zr);
2955 
2956   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2957   ror(tmp4, tmp4, 32);
2958   stp(tmp4, tmp3, Address(tmp6, 0));
2959 
2960   b(L_third_loop);
2961   bind (L_third_loop_exit);
2962 
2963   andw (idx, idx, 0x3);
2964   cbz(idx, L_post_third_loop_done);
2965 
2966   Label L_check_1;
2967   subsw(idx, idx, 2);
2968   br(Assembler::MI, L_check_1);
2969 
2970   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2971   ldr(yz_idx1, Address(rscratch1, 0));
2972   ror(yz_idx1, yz_idx1, 32);
2973   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2974   umulh(tmp4, product_hi, yz_idx1);
2975   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2976   ldr(yz_idx2, Address(rscratch1, 0));
2977   ror(yz_idx2, yz_idx2, 32);
2978 
2979   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2980 
2981   ror(tmp3, tmp3, 32);
2982   str(tmp3, Address(rscratch1, 0));
2983 
2984   bind (L_check_1);
2985 
2986   andw (idx, idx, 0x1);
2987   subsw(idx, idx, 1);
2988   br(Assembler::MI, L_post_third_loop_done);
2989   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2990   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
2991   umulh(carry2, tmp4, product_hi);
2992   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2993 
2994   add2_with_carry(carry2, tmp3, tmp4, carry);
2995 
2996   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2997   extr(carry, carry2, tmp3, 32);
2998 
2999   bind(L_post_third_loop_done);
3000 }
3001 
3002 /**
 * Code for BigInteger::multiplyToLen() intrinsic.
3004  *
3005  * r0: x
3006  * r1: xlen
3007  * r2: y
3008  * r3: ylen
 * r4: z
3010  * r5: zlen
3011  * r10: tmp1
3012  * r11: tmp2
3013  * r12: tmp3
3014  * r13: tmp4
3015  * r14: tmp5
3016  * r15: tmp6
3017  * r16: tmp7
3018  *
3019  */
3020 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
3021                                      Register z, Register zlen,
3022                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
3023                                      Register tmp5, Register tmp6, Register product_hi) {
3024 
3025   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
3026 
3027   const Register idx = tmp1;
3028   const Register kdx = tmp2;
3029   const Register xstart = tmp3;
3030 
3031   const Register y_idx = tmp4;
3032   const Register carry = tmp5;
3033   const Register product  = xlen;
3034   const Register x_xstart = zlen;  // reuse register
3035 
3036   // First Loop.
3037   //
3038   //  final static long LONG_MASK = 0xffffffffL;
3039   //  int xstart = xlen - 1;
3040   //  int ystart = ylen - 1;
3041   //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
3043   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
3044   //    z[kdx] = (int)product;
3045   //    carry = product >>> 32;
3046   //  }
3047   //  z[xstart] = (int)carry;
3048   //
3049 
3050   movw(idx, ylen);      // idx = ylen;
3051   movw(kdx, zlen);      // kdx = xlen+ylen;
3052   mov(carry, zr);       // carry = 0;
3053 
3054   Label L_done;
3055 
3056   movw(xstart, xlen);
3057   subsw(xstart, xstart, 1);
3058   br(Assembler::MI, L_done);
3059 
3060   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
3061 
3062   Label L_second_loop;
3063   cbzw(kdx, L_second_loop);
3064 
3065   Label L_carry;
3066   subw(kdx, kdx, 1);
3067   cbzw(kdx, L_carry);
3068 
3069   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3070   lsr(carry, carry, 32);
3071   subw(kdx, kdx, 1);
3072 
3073   bind(L_carry);
3074   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
3075 
3076   // Second and third (nested) loops.
3077   //
3078   // for (int i = xstart-1; i >= 0; i--) { // Second loop
3079   //   carry = 0;
3080   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
3081   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
3082   //                    (z[k] & LONG_MASK) + carry;
3083   //     z[k] = (int)product;
3084   //     carry = product >>> 32;
3085   //   }
3086   //   z[i] = (int)carry;
3087   // }
3088   //
3089   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
3090 
3091   const Register jdx = tmp1;
3092 
3093   bind(L_second_loop);
3094   mov(carry, zr);                // carry = 0;
3095   movw(jdx, ylen);               // j = ystart+1
3096 
3097   subsw(xstart, xstart, 1);      // i = xstart-1;
3098   br(Assembler::MI, L_done);
3099 
3100   str(z, Address(pre(sp, -4 * wordSize)));
3101 
3102   Label L_last_x;
3103   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
3104   subsw(xstart, xstart, 1);       // i = xstart-1;
3105   br(Assembler::MI, L_last_x);
3106 
3107   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
3108   ldr(product_hi, Address(rscratch1));
3109   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
3110 
3111   Label L_third_loop_prologue;
3112   bind(L_third_loop_prologue);
3113 
3114   str(ylen, Address(sp, wordSize));
3115   stp(x, xstart, Address(sp, 2 * wordSize));
3116   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
3117                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
3118   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
3119   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
3120 
3121   addw(tmp3, xlen, 1);
3122   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3123   subsw(tmp3, tmp3, 1);
3124   br(Assembler::MI, L_done);
3125 
3126   lsr(carry, carry, 32);
3127   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
3128   b(L_second_loop);
3129 
  // The following infrequently executed code is moved outside the loops.
3131   bind(L_last_x);
3132   ldrw(product_hi, Address(x,  0));
3133   b(L_third_loop_prologue);
3134 
3135   bind(L_done);
3136 }
3137 
// Code for BigInteger::mulAdd intrinsic
3139 // out     = r0
3140 // in      = r1
3141 // offset  = r2  (already out.length-offset)
3142 // len     = r3
3143 // k       = r4
3144 //
// Pseudocode from the Java implementation:
3146 // carry = 0;
3147 // offset = out.length-offset - 1;
3148 // for (int j=len-1; j >= 0; j--) {
3149 //     product = (in[j] & LONG_MASK) * kLong + (out[offset] & LONG_MASK) + carry;
3150 //     out[offset--] = (int)product;
3151 //     carry = product >>> 32;
3152 // }
3153 // return (int)carry;
3154 void MacroAssembler::mul_add(Register out, Register in, Register offset,
3155       Register len, Register k) {
3156     Label LOOP, END;
3157     // pre-loop
    cmp(len, zr); // cmp, not cbz/cbnz: we use the condition twice => fewer branches
3159     csel(out, zr, out, Assembler::EQ);
3160     br(Assembler::EQ, END);
3161     add(in, in, len, LSL, 2); // in[j+1] address
3162     add(offset, out, offset, LSL, 2); // out[offset + 1] address
3163     mov(out, zr); // used to keep carry now
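    // Note: in[j]*k + out[offset] + carry cannot overflow 64 bits: all
    // three inputs are 32-bit values and (2^32-1)^2 + 2*(2^32-1) == 2^64-1,
    // so the madd below plus one 64-bit add capture the result exactly.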
3164     BIND(LOOP);
3165     ldrw(rscratch1, Address(pre(in, -4)));
3166     madd(rscratch1, rscratch1, k, out);
3167     ldrw(rscratch2, Address(pre(offset, -4)));
3168     add(rscratch1, rscratch1, rscratch2);
3169     strw(rscratch1, Address(offset));
3170     lsr(out, rscratch1, 32);
3171     subs(len, len, 1);
3172     br(Assembler::NE, LOOP);
3173     BIND(END);
3174 }
3175 
3176 /**
3177  * Emits code to update CRC-32 with a byte value according to constants in table
3178  *
3179  * @param [in,out]crc   Register containing the crc.
3180  * @param [in]val       Register containing the byte to fold into the CRC.
3181  * @param [in]table     Register containing the table of crc constants.
3182  *
3183  * uint32_t crc;
3184  * val = crc_table[(val ^ crc) & 0xFF];
3185  * crc = val ^ (crc >> 8);
3186  *
3187  */
3188 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3189   eor(val, val, crc);
3190   andr(val, val, 0xff);
3191   ldrw(val, Address(table, val, Address::lsl(2)));
3192   eor(crc, val, crc, Assembler::LSR, 8);
3193 }
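
// A C sketch of the update emitted above (table being one 256-entry slice
// of the usual zlib-style CRC-32 table):
//
//   uint32_t update_byte(uint32_t crc, uint8_t b, const uint32_t table[256]) {
//     return table[(b ^ crc) & 0xff] ^ (crc >> 8);
//   }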
3194 
3195 /**
3196  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3197  *
3198  * @param [in,out]crc   Register containing the crc.
 * @param [in]v         Register containing the 32-bit value to fold into the CRC.
3200  * @param [in]table0    Register containing table 0 of crc constants.
3201  * @param [in]table1    Register containing table 1 of crc constants.
3202  * @param [in]table2    Register containing table 2 of crc constants.
3203  * @param [in]table3    Register containing table 3 of crc constants.
3204  *
3205  * uint32_t crc;
3206  *   v = crc ^ v
3207  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3208  *
3209  */
3210 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3211         Register table0, Register table1, Register table2, Register table3,
3212         bool upper) {
3213   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
3214   uxtb(tmp, v);
3215   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
3216   ubfx(tmp, v, 8, 8);
3217   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
3218   eor(crc, crc, tmp);
3219   ubfx(tmp, v, 16, 8);
3220   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
3221   eor(crc, crc, tmp);
3222   ubfx(tmp, v, 24, 8);
3223   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
3224   eor(crc, crc, tmp);
3225 }
3226 
3227 void MacroAssembler::kernel_crc32_using_crc32(Register crc, Register buf,
3228         Register len, Register tmp0, Register tmp1, Register tmp2,
3229         Register tmp3) {
3230     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3231     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3232 
3233     mvnw(crc, crc);
3234 
3235     subs(len, len, 128);
3236     br(Assembler::GE, CRC_by64_pre);
3237   BIND(CRC_less64);
3238     adds(len, len, 128-32);
3239     br(Assembler::GE, CRC_by32_loop);
3240   BIND(CRC_less32);
3241     adds(len, len, 32-4);
3242     br(Assembler::GE, CRC_by4_loop);
3243     adds(len, len, 4);
3244     br(Assembler::GT, CRC_by1_loop);
3245     b(L_exit);
3246 
3247   BIND(CRC_by32_loop);
3248     ldp(tmp0, tmp1, Address(post(buf, 16)));
3249     subs(len, len, 32);
3250     crc32x(crc, crc, tmp0);
3251     ldr(tmp2, Address(post(buf, 8)));
3252     crc32x(crc, crc, tmp1);
3253     ldr(tmp3, Address(post(buf, 8)));
3254     crc32x(crc, crc, tmp2);
3255     crc32x(crc, crc, tmp3);
3256     br(Assembler::GE, CRC_by32_loop);
3257     cmn(len, 32);
3258     br(Assembler::NE, CRC_less32);
3259     b(L_exit);
3260 
3261   BIND(CRC_by4_loop);
3262     ldrw(tmp0, Address(post(buf, 4)));
3263     subs(len, len, 4);
3264     crc32w(crc, crc, tmp0);
3265     br(Assembler::GE, CRC_by4_loop);
3266     adds(len, len, 4);
3267     br(Assembler::LE, L_exit);
3268   BIND(CRC_by1_loop);
3269     ldrb(tmp0, Address(post(buf, 1)));
3270     subs(len, len, 1);
3271     crc32b(crc, crc, tmp0);
3272     br(Assembler::GT, CRC_by1_loop);
3273     b(L_exit);
3274 
3275   BIND(CRC_by64_pre);
3276     sub(buf, buf, 8);
3277     ldp(tmp0, tmp1, Address(buf, 8));
3278     crc32x(crc, crc, tmp0);
3279     ldr(tmp2, Address(buf, 24));
3280     crc32x(crc, crc, tmp1);
3281     ldr(tmp3, Address(buf, 32));
3282     crc32x(crc, crc, tmp2);
3283     ldr(tmp0, Address(buf, 40));
3284     crc32x(crc, crc, tmp3);
3285     ldr(tmp1, Address(buf, 48));
3286     crc32x(crc, crc, tmp0);
3287     ldr(tmp2, Address(buf, 56));
3288     crc32x(crc, crc, tmp1);
3289     ldr(tmp3, Address(pre(buf, 64)));
3290 
3291     b(CRC_by64_loop);
3292 
3293     align(CodeEntryAlignment);
3294   BIND(CRC_by64_loop);
3295     subs(len, len, 64);
3296     crc32x(crc, crc, tmp2);
3297     ldr(tmp0, Address(buf, 8));
3298     crc32x(crc, crc, tmp3);
3299     ldr(tmp1, Address(buf, 16));
3300     crc32x(crc, crc, tmp0);
3301     ldr(tmp2, Address(buf, 24));
3302     crc32x(crc, crc, tmp1);
3303     ldr(tmp3, Address(buf, 32));
3304     crc32x(crc, crc, tmp2);
3305     ldr(tmp0, Address(buf, 40));
3306     crc32x(crc, crc, tmp3);
3307     ldr(tmp1, Address(buf, 48));
3308     crc32x(crc, crc, tmp0);
3309     ldr(tmp2, Address(buf, 56));
3310     crc32x(crc, crc, tmp1);
3311     ldr(tmp3, Address(pre(buf, 64)));
3312     br(Assembler::GE, CRC_by64_loop);
3313 
3314     // post-loop
3315     crc32x(crc, crc, tmp2);
3316     crc32x(crc, crc, tmp3);
3317 
3318     sub(len, len, 64);
3319     add(buf, buf, 8);
3320     cmn(len, 128);
3321     br(Assembler::NE, CRC_less64);
3322   BIND(L_exit);
3323     mvnw(crc, crc);
3324 }
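
// Roughly, the code above is an unrolled, software-pipelined equivalent of
// the following sketch written with the ACLE intrinsics from <arm_acle.h>
// (crc is bit-inverted on entry and exit, as in the emitted code):
//
//   while (len >= 8) { crc = __crc32d(crc, *(uint64_t*)buf); buf += 8; len -= 8; }
//   while (len >= 4) { crc = __crc32w(crc, *(uint32_t*)buf); buf += 4; len -= 4; }
//   while (len-- > 0) { crc = __crc32b(crc, *buf++); }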
3325 
/**
 * @param crc       register containing existing CRC (32-bit)
 * @param buf       register pointing to input byte buffer (byte*)
 * @param len       register containing number of bytes
 * @param table0..table3  registers that will hold the addresses of the CRC tables
 * @param tmp, tmp2, tmp3 scratch registers
 */
3333 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3334         Register table0, Register table1, Register table2, Register table3,
3335         Register tmp, Register tmp2, Register tmp3) {
3336   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
3337   unsigned long offset;
3338 
3339   if (UseCRC32) {
3340       kernel_crc32_using_crc32(crc, buf, len, table0, table1, table2, table3);
3341       return;
3342   }
3343 
3344     mvnw(crc, crc);
3345 
3346     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3347     if (offset) add(table0, table0, offset);
3348     add(table1, table0, 1*256*sizeof(juint));
3349     add(table2, table0, 2*256*sizeof(juint));
3350     add(table3, table0, 3*256*sizeof(juint));
3351 
3352   if (UseNeon) {
3353       cmp(len, (u1)64);
3354       br(Assembler::LT, L_by16);
3355       eor(v16, T16B, v16, v16);
3356 
3357     Label L_fold;
3358 
3359       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3360 
3361       ld1(v0, v1, T2D, post(buf, 32));
3362       ld1r(v4, T2D, post(tmp, 8));
3363       ld1r(v5, T2D, post(tmp, 8));
3364       ld1r(v6, T2D, post(tmp, 8));
3365       ld1r(v7, T2D, post(tmp, 8));
3366       mov(v16, T4S, 0, crc);
3367 
3368       eor(v0, T16B, v0, v16);
3369       sub(len, len, 64);
3370 
3371     BIND(L_fold);
3372       pmull(v22, T8H, v0, v5, T8B);
3373       pmull(v20, T8H, v0, v7, T8B);
3374       pmull(v23, T8H, v0, v4, T8B);
3375       pmull(v21, T8H, v0, v6, T8B);
3376 
3377       pmull2(v18, T8H, v0, v5, T16B);
3378       pmull2(v16, T8H, v0, v7, T16B);
3379       pmull2(v19, T8H, v0, v4, T16B);
3380       pmull2(v17, T8H, v0, v6, T16B);
3381 
3382       uzp1(v24, T8H, v20, v22);
3383       uzp2(v25, T8H, v20, v22);
3384       eor(v20, T16B, v24, v25);
3385 
3386       uzp1(v26, T8H, v16, v18);
3387       uzp2(v27, T8H, v16, v18);
3388       eor(v16, T16B, v26, v27);
3389 
3390       ushll2(v22, T4S, v20, T8H, 8);
3391       ushll(v20, T4S, v20, T4H, 8);
3392 
3393       ushll2(v18, T4S, v16, T8H, 8);
3394       ushll(v16, T4S, v16, T4H, 8);
3395 
3396       eor(v22, T16B, v23, v22);
3397       eor(v18, T16B, v19, v18);
3398       eor(v20, T16B, v21, v20);
3399       eor(v16, T16B, v17, v16);
3400 
3401       uzp1(v17, T2D, v16, v20);
3402       uzp2(v21, T2D, v16, v20);
3403       eor(v17, T16B, v17, v21);
3404 
3405       ushll2(v20, T2D, v17, T4S, 16);
3406       ushll(v16, T2D, v17, T2S, 16);
3407 
3408       eor(v20, T16B, v20, v22);
3409       eor(v16, T16B, v16, v18);
3410 
3411       uzp1(v17, T2D, v20, v16);
3412       uzp2(v21, T2D, v20, v16);
3413       eor(v28, T16B, v17, v21);
3414 
3415       pmull(v22, T8H, v1, v5, T8B);
3416       pmull(v20, T8H, v1, v7, T8B);
3417       pmull(v23, T8H, v1, v4, T8B);
3418       pmull(v21, T8H, v1, v6, T8B);
3419 
3420       pmull2(v18, T8H, v1, v5, T16B);
3421       pmull2(v16, T8H, v1, v7, T16B);
3422       pmull2(v19, T8H, v1, v4, T16B);
3423       pmull2(v17, T8H, v1, v6, T16B);
3424 
3425       ld1(v0, v1, T2D, post(buf, 32));
3426 
3427       uzp1(v24, T8H, v20, v22);
3428       uzp2(v25, T8H, v20, v22);
3429       eor(v20, T16B, v24, v25);
3430 
3431       uzp1(v26, T8H, v16, v18);
3432       uzp2(v27, T8H, v16, v18);
3433       eor(v16, T16B, v26, v27);
3434 
3435       ushll2(v22, T4S, v20, T8H, 8);
3436       ushll(v20, T4S, v20, T4H, 8);
3437 
3438       ushll2(v18, T4S, v16, T8H, 8);
3439       ushll(v16, T4S, v16, T4H, 8);
3440 
3441       eor(v22, T16B, v23, v22);
3442       eor(v18, T16B, v19, v18);
3443       eor(v20, T16B, v21, v20);
3444       eor(v16, T16B, v17, v16);
3445 
3446       uzp1(v17, T2D, v16, v20);
3447       uzp2(v21, T2D, v16, v20);
3448       eor(v16, T16B, v17, v21);
3449 
3450       ushll2(v20, T2D, v16, T4S, 16);
3451       ushll(v16, T2D, v16, T2S, 16);
3452 
3453       eor(v20, T16B, v22, v20);
3454       eor(v16, T16B, v16, v18);
3455 
3456       uzp1(v17, T2D, v20, v16);
3457       uzp2(v21, T2D, v20, v16);
3458       eor(v20, T16B, v17, v21);
3459 
3460       shl(v16, T2D, v28, 1);
3461       shl(v17, T2D, v20, 1);
3462 
3463       eor(v0, T16B, v0, v16);
3464       eor(v1, T16B, v1, v17);
3465 
3466       subs(len, len, 32);
3467       br(Assembler::GE, L_fold);
3468 
3469       mov(crc, 0);
3470       mov(tmp, v0, T1D, 0);
3471       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3472       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3473       mov(tmp, v0, T1D, 1);
3474       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3475       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3476       mov(tmp, v1, T1D, 0);
3477       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3478       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3479       mov(tmp, v1, T1D, 1);
3480       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3481       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3482 
3483       add(len, len, 32);
3484   }
3485 
3486   BIND(L_by16);
3487     subs(len, len, 16);
3488     br(Assembler::GE, L_by16_loop);
3489     adds(len, len, 16-4);
3490     br(Assembler::GE, L_by4_loop);
3491     adds(len, len, 4);
3492     br(Assembler::GT, L_by1_loop);
3493     b(L_exit);
3494 
3495   BIND(L_by4_loop);
3496     ldrw(tmp, Address(post(buf, 4)));
3497     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3498     subs(len, len, 4);
3499     br(Assembler::GE, L_by4_loop);
3500     adds(len, len, 4);
3501     br(Assembler::LE, L_exit);
3502   BIND(L_by1_loop);
3503     subs(len, len, 1);
3504     ldrb(tmp, Address(post(buf, 1)));
3505     update_byte_crc32(crc, tmp, table0);
3506     br(Assembler::GT, L_by1_loop);
3507     b(L_exit);
3508 
3509     align(CodeEntryAlignment);
3510   BIND(L_by16_loop);
3511     subs(len, len, 16);
3512     ldp(tmp, tmp3, Address(post(buf, 16)));
3513     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3514     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3515     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3516     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3517     br(Assembler::GE, L_by16_loop);
3518     adds(len, len, 16-4);
3519     br(Assembler::GE, L_by4_loop);
3520     adds(len, len, 4);
3521     br(Assembler::GT, L_by1_loop);
3522   BIND(L_exit);
3523     mvnw(crc, crc);
3524 }
3525 
3526 void MacroAssembler::kernel_crc32c_using_crc32c(Register crc, Register buf,
3527         Register len, Register tmp0, Register tmp1, Register tmp2,
3528         Register tmp3) {
3529     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop, CRC_less64, CRC_by64_pre, CRC_by32_loop, CRC_less32, L_exit;
3530     assert_different_registers(crc, buf, len, tmp0, tmp1, tmp2, tmp3);
3531 
3532     subs(len, len, 128);
3533     br(Assembler::GE, CRC_by64_pre);
3534   BIND(CRC_less64);
3535     adds(len, len, 128-32);
3536     br(Assembler::GE, CRC_by32_loop);
3537   BIND(CRC_less32);
3538     adds(len, len, 32-4);
3539     br(Assembler::GE, CRC_by4_loop);
3540     adds(len, len, 4);
3541     br(Assembler::GT, CRC_by1_loop);
3542     b(L_exit);
3543 
3544   BIND(CRC_by32_loop);
3545     ldp(tmp0, tmp1, Address(post(buf, 16)));
3546     subs(len, len, 32);
3547     crc32cx(crc, crc, tmp0);
3548     ldr(tmp2, Address(post(buf, 8)));
3549     crc32cx(crc, crc, tmp1);
3550     ldr(tmp3, Address(post(buf, 8)));
3551     crc32cx(crc, crc, tmp2);
3552     crc32cx(crc, crc, tmp3);
3553     br(Assembler::GE, CRC_by32_loop);
3554     cmn(len, 32);
3555     br(Assembler::NE, CRC_less32);
3556     b(L_exit);
3557 
3558   BIND(CRC_by4_loop);
3559     ldrw(tmp0, Address(post(buf, 4)));
3560     subs(len, len, 4);
3561     crc32cw(crc, crc, tmp0);
3562     br(Assembler::GE, CRC_by4_loop);
3563     adds(len, len, 4);
3564     br(Assembler::LE, L_exit);
3565   BIND(CRC_by1_loop);
3566     ldrb(tmp0, Address(post(buf, 1)));
3567     subs(len, len, 1);
3568     crc32cb(crc, crc, tmp0);
3569     br(Assembler::GT, CRC_by1_loop);
3570     b(L_exit);
3571 
3572   BIND(CRC_by64_pre);
3573     sub(buf, buf, 8);
3574     ldp(tmp0, tmp1, Address(buf, 8));
3575     crc32cx(crc, crc, tmp0);
3576     ldr(tmp2, Address(buf, 24));
3577     crc32cx(crc, crc, tmp1);
3578     ldr(tmp3, Address(buf, 32));
3579     crc32cx(crc, crc, tmp2);
3580     ldr(tmp0, Address(buf, 40));
3581     crc32cx(crc, crc, tmp3);
3582     ldr(tmp1, Address(buf, 48));
3583     crc32cx(crc, crc, tmp0);
3584     ldr(tmp2, Address(buf, 56));
3585     crc32cx(crc, crc, tmp1);
3586     ldr(tmp3, Address(pre(buf, 64)));
3587 
3588     b(CRC_by64_loop);
3589 
3590     align(CodeEntryAlignment);
3591   BIND(CRC_by64_loop);
3592     subs(len, len, 64);
3593     crc32cx(crc, crc, tmp2);
3594     ldr(tmp0, Address(buf, 8));
3595     crc32cx(crc, crc, tmp3);
3596     ldr(tmp1, Address(buf, 16));
3597     crc32cx(crc, crc, tmp0);
3598     ldr(tmp2, Address(buf, 24));
3599     crc32cx(crc, crc, tmp1);
3600     ldr(tmp3, Address(buf, 32));
3601     crc32cx(crc, crc, tmp2);
3602     ldr(tmp0, Address(buf, 40));
3603     crc32cx(crc, crc, tmp3);
3604     ldr(tmp1, Address(buf, 48));
3605     crc32cx(crc, crc, tmp0);
3606     ldr(tmp2, Address(buf, 56));
3607     crc32cx(crc, crc, tmp1);
3608     ldr(tmp3, Address(pre(buf, 64)));
3609     br(Assembler::GE, CRC_by64_loop);
3610 
3611     // post-loop
3612     crc32cx(crc, crc, tmp2);
3613     crc32cx(crc, crc, tmp3);
3614 
3615     sub(len, len, 64);
3616     add(buf, buf, 8);
3617     cmn(len, 128);
3618     br(Assembler::NE, CRC_less64);
3619   BIND(L_exit);
3620 }
3621 
/**
 * @param crc       register containing existing CRC (32-bit)
 * @param buf       register pointing to input byte buffer (byte*)
 * @param len       register containing number of bytes
 * @param table0..table3  registers that will hold the addresses of the CRC tables
 * @param tmp, tmp2, tmp3 scratch registers
 */
3629 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
3630         Register table0, Register table1, Register table2, Register table3,
3631         Register tmp, Register tmp2, Register tmp3) {
3632   kernel_crc32c_using_crc32c(crc, buf, len, table0, table1, table2, table3);
3633 }
3634 
3635 
3636 SkipIfEqual::SkipIfEqual(
3637     MacroAssembler* masm, const bool* flag_addr, bool value) {
3638   _masm = masm;
3639   unsigned long offset;
3640   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3641   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3642   _masm->cbzw(rscratch1, _label);
3643 }
3644 
3645 SkipIfEqual::~SkipIfEqual() {
3646   _masm->bind(_label);
3647 }
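
// A usage sketch (SomeFlag is hypothetical): code emitted inside the scope
// is only reached at run time when the flag byte is non-zero; otherwise
// execution resumes at the label bound by the destructor.  Note that, as
// emitted above, the test is always against zero regardless of `value`.
//
//   {
//     SkipIfEqual skip(masm, &SomeFlag, false);
//     // ... code emitted here runs only when SomeFlag is true ...
//   }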
3648 
3649 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3650   Address adr;
3651   switch(dst.getMode()) {
3652   case Address::base_plus_offset:
3653     // This is the expected mode, although we allow all the other
3654     // forms below.
3655     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3656     break;
3657   default:
3658     lea(rscratch2, dst);
3659     adr = Address(rscratch2);
3660     break;
3661   }
3662   ldr(rscratch1, adr);
3663   add(rscratch1, rscratch1, src);
3664   str(rscratch1, adr);
3665 }
3666 
3667 void MacroAssembler::cmpptr(Register src1, Address src2) {
3668   unsigned long offset;
3669   adrp(rscratch1, src2, offset);
3670   ldr(rscratch1, Address(rscratch1, offset));
3671   cmp(src1, rscratch1);
3672 }
3673 
3674 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
3675   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
3676   bs->obj_equals(this, obj1, obj2);
3677 }
3678 
3679 void MacroAssembler::load_klass(Register dst, Register src) {
3680   if (UseCompressedClassPointers) {
3681     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3682     decode_klass_not_null(dst);
3683   } else {
3684     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3685   }
3686 }
3687 
3688 // ((OopHandle)result).resolve();
3689 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
3690   // OopHandle::resolve is an indirection.
3691   access_load_at(T_OBJECT, IN_NATIVE, result, Address(result, 0), tmp, noreg);
3692 }
3693 
3694 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
3695   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
3696   ldr(dst, Address(rmethod, Method::const_offset()));
3697   ldr(dst, Address(dst, ConstMethod::constants_offset()));
3698   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
3699   ldr(dst, Address(dst, mirror_offset));
3700   resolve_oop_handle(dst, tmp);
3701 }
3702 
3703 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3704   if (UseCompressedClassPointers) {
3705     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3706     if (Universe::narrow_klass_base() == NULL) {
3707       cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
3708       return;
3709     } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3710                && Universe::narrow_klass_shift() == 0) {
3711       // Only the bottom 32 bits matter
3712       cmpw(trial_klass, tmp);
3713       return;
3714     }
3715     decode_klass_not_null(tmp);
3716   } else {
3717     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3718   }
3719   cmp(trial_klass, tmp);
3720 }
3721 
3722 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3723   load_klass(dst, src);
3724   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3725 }
3726 
3727 void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store release?  Concurrent GCs assume the
  // klass length is valid if the klass field is not null.
3730   if (UseCompressedClassPointers) {
3731     encode_klass_not_null(src);
3732     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3733   } else {
3734     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3735   }
3736 }
3737 
3738 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3739   if (UseCompressedClassPointers) {
3740     // Store to klass gap in destination
3741     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3742   }
3743 }
3744 
3745 // Algorithm must match CompressedOops::encode.
3746 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3747 #ifdef ASSERT
3748   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3749 #endif
3750   verify_oop(s, "broken oop in encode_heap_oop");
3751   if (Universe::narrow_oop_base() == NULL) {
3752     if (Universe::narrow_oop_shift() != 0) {
3753       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3754       lsr(d, s, LogMinObjAlignmentInBytes);
3755     } else {
3756       mov(d, s);
3757     }
3758   } else {
3759     subs(d, s, rheapbase);
3760     csel(d, d, zr, Assembler::HS);
3761     lsr(d, d, LogMinObjAlignmentInBytes);
3762 
3763     /*  Old algorithm: is this any worse?
3764     Label nonnull;
3765     cbnz(r, nonnull);
3766     sub(r, r, rheapbase);
3767     bind(nonnull);
3768     lsr(r, r, LogMinObjAlignmentInBytes);
3769     */
3770   }
3771 }
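
// Worked example (base and shift are illustrative, not fixed by the VM):
// with narrow_oop_base == 0x0000000800000000 and shift == 3, an oop at
// 0x0000000812345678 encodes as (0x812345678 - 0x800000000) >> 3
// == 0x2468acf.  NULL stays NULL because the subtraction underflows and
// the csel above substitutes zr.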
3772 
3773 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3774 #ifdef ASSERT
3775   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3776   if (CheckCompressedOops) {
3777     Label ok;
3778     cbnz(r, ok);
3779     stop("null oop passed to encode_heap_oop_not_null");
3780     bind(ok);
3781   }
3782 #endif
3783   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3784   if (Universe::narrow_oop_base() != NULL) {
3785     sub(r, r, rheapbase);
3786   }
3787   if (Universe::narrow_oop_shift() != 0) {
3788     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3789     lsr(r, r, LogMinObjAlignmentInBytes);
3790   }
3791 }
3792 
3793 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3794 #ifdef ASSERT
3795   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3796   if (CheckCompressedOops) {
3797     Label ok;
3798     cbnz(src, ok);
3799     stop("null oop passed to encode_heap_oop_not_null2");
3800     bind(ok);
3801   }
3802 #endif
3803   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3804 
3805   Register data = src;
3806   if (Universe::narrow_oop_base() != NULL) {
3807     sub(dst, src, rheapbase);
3808     data = dst;
3809   }
3810   if (Universe::narrow_oop_shift() != 0) {
3811     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3812     lsr(dst, data, LogMinObjAlignmentInBytes);
3813     data = dst;
3814   }
3815   if (data == src)
3816     mov(dst, src);
3817 }
3818 
3819 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3820 #ifdef ASSERT
3821   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3822 #endif
3823   if (Universe::narrow_oop_base() == NULL) {
3824     if (Universe::narrow_oop_shift() != 0 || d != s) {
3825       lsl(d, s, Universe::narrow_oop_shift());
3826     }
3827   } else {
3828     Label done;
3829     if (d != s)
3830       mov(d, s);
3831     cbz(s, done);
3832     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3833     bind(done);
3834   }
3835   verify_oop(d, "broken oop in decode_heap_oop");
3836 }
3837 
3838 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3839   assert (UseCompressedOops, "should only be used for compressed headers");
3840   assert (Universe::heap() != NULL, "java heap should be initialized");
3841   // Cannot assert, unverified entry point counts instructions (see .ad file)
3842   // vtableStubs also counts instructions in pd_code_size_limit.
3843   // Also do not verify_oop as this is called by verify_oop.
3844   if (Universe::narrow_oop_shift() != 0) {
3845     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3846     if (Universe::narrow_oop_base() != NULL) {
3847       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3848     } else {
3849       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3850     }
3851   } else {
3852     assert (Universe::narrow_oop_base() == NULL, "sanity");
3853   }
3854 }
3855 
3856 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3857   assert (UseCompressedOops, "should only be used for compressed headers");
3858   assert (Universe::heap() != NULL, "java heap should be initialized");
3859   // Cannot assert, unverified entry point counts instructions (see .ad file)
3860   // vtableStubs also counts instructions in pd_code_size_limit.
3861   // Also do not verify_oop as this is called by verify_oop.
3862   if (Universe::narrow_oop_shift() != 0) {
3863     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3864     if (Universe::narrow_oop_base() != NULL) {
3865       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3866     } else {
3867       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3868     }
3869   } else {
3870     assert (Universe::narrow_oop_base() == NULL, "sanity");
3871     if (dst != src) {
3872       mov(dst, src);
3873     }
3874   }
3875 }
3876 
3877 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3878   if (Universe::narrow_klass_base() == NULL) {
3879     if (Universe::narrow_klass_shift() != 0) {
3880       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3881       lsr(dst, src, LogKlassAlignmentInBytes);
3882     } else {
3883       if (dst != src) mov(dst, src);
3884     }
3885     return;
3886   }
3887 
3888   if (use_XOR_for_compressed_class_base) {
3889     if (Universe::narrow_klass_shift() != 0) {
3890       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3891       lsr(dst, dst, LogKlassAlignmentInBytes);
3892     } else {
3893       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3894     }
3895     return;
3896   }
3897 
3898   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3899       && Universe::narrow_klass_shift() == 0) {
3900     movw(dst, src);
3901     return;
3902   }
3903 
3904 #ifdef ASSERT
3905   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3906 #endif
3907 
3908   Register rbase = dst;
3909   if (dst == src) rbase = rheapbase;
3910   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3911   sub(dst, src, rbase);
3912   if (Universe::narrow_klass_shift() != 0) {
3913     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3914     lsr(dst, dst, LogKlassAlignmentInBytes);
3915   }
3916   if (dst == src) reinit_heapbase();
3917 }
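
// Note on the XOR encoding above: use_XOR_for_compressed_class_base is only
// set when the narrow klass base shares no bits with any shifted klass
// offset, so eor with the base behaves exactly like adding or subtracting
// it.  Illustrative example: with base 0x800000000 and all (shifted)
// offsets below 2^35, the eor simply toggles bit 35.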
3918 
3919 void MacroAssembler::encode_klass_not_null(Register r) {
3920   encode_klass_not_null(r, r);
3921 }
3922 
3923 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3924   Register rbase = dst;
3925   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3926 
3927   if (Universe::narrow_klass_base() == NULL) {
3928     if (Universe::narrow_klass_shift() != 0) {
3929       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3930       lsl(dst, src, LogKlassAlignmentInBytes);
3931     } else {
3932       if (dst != src) mov(dst, src);
3933     }
3934     return;
3935   }
3936 
3937   if (use_XOR_for_compressed_class_base) {
3938     if (Universe::narrow_klass_shift() != 0) {
3939       lsl(dst, src, LogKlassAlignmentInBytes);
3940       eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
3941     } else {
3942       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3943     }
3944     return;
3945   }
3946 
3947   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3948       && Universe::narrow_klass_shift() == 0) {
3949     if (dst != src)
3950       movw(dst, src);
3951     movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
3952     return;
3953   }
3954 
3955   // Cannot assert, unverified entry point counts instructions (see .ad file)
3956   // vtableStubs also counts instructions in pd_code_size_limit.
3957   // Also do not verify_oop as this is called by verify_oop.
3958   if (dst == src) rbase = rheapbase;
3959   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3960   if (Universe::narrow_klass_shift() != 0) {
3961     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3962     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
3963   } else {
3964     add(dst, rbase, src);
3965   }
3966   if (dst == src) reinit_heapbase();
3967 }
3968 
3969 void  MacroAssembler::decode_klass_not_null(Register r) {
3970   decode_klass_not_null(r, r);
3971 }
3972 
3973 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3974 #ifdef ASSERT
3975   {
3976     ThreadInVMfromUnknown tiv;
3977     assert (UseCompressedOops, "should only be used for compressed oops");
3978     assert (Universe::heap() != NULL, "java heap should be initialized");
3979     assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3980     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3981   }
3982 #endif
3983   int oop_index = oop_recorder()->find_index(obj);
3984   InstructionMark im(this);
3985   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3986   code_section()->relocate(inst_mark(), rspec);
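  // 0xDEADBEEF is only a placeholder: the oop relocation recorded above
  // allows the movz/movk pair to be patched with the real narrow oop when
  // the code is installed.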
3987   movz(dst, 0xDEAD, 16);
3988   movk(dst, 0xBEEF);
3989 }
3990 
3991 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3992   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3993   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3994   int index = oop_recorder()->find_index(k);
3995   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
3996 
3997   InstructionMark im(this);
3998   RelocationHolder rspec = metadata_Relocation::spec(index);
3999   code_section()->relocate(inst_mark(), rspec);
4000   narrowKlass nk = Klass::encode_klass(k);
4001   movz(dst, (nk >> 16), 16);
4002   movk(dst, nk & 0xffff);
4003 }
4004 
4005 void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators,
4006                                     Register dst, Address src,
4007                                     Register tmp1, Register thread_tmp) {
4008   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4009   decorators = AccessInternal::decorator_fixup(decorators);
4010   bool as_raw = (decorators & AS_RAW) != 0;
4011   if (as_raw) {
4012     bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4013   } else {
4014     bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4015   }
4016 }
4017 
4018 void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators,
4019                                      Address dst, Register src,
4020                                      Register tmp1, Register thread_tmp) {
4021   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4022   decorators = AccessInternal::decorator_fixup(decorators);
4023   bool as_raw = (decorators & AS_RAW) != 0;
4024   if (as_raw) {
4025     bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4026   } else {
4027     bs->store_at(this, decorators, type, dst, src, tmp1, thread_tmp);
4028   }
4029 }
4030 
4031 void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
4032   // Use stronger ACCESS_WRITE|ACCESS_READ by default.
4033   if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
4034     decorators |= ACCESS_READ | ACCESS_WRITE;
4035   }
4036   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
4037   return bs->resolve(this, decorators, obj);
4038 }
4039 
4040 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
4041                                    Register thread_tmp, DecoratorSet decorators) {
4042   access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4043 }
4044 
4045 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
4046                                             Register thread_tmp, DecoratorSet decorators) {
4047   access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
4048 }
4049 
4050 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
4051                                     Register thread_tmp, DecoratorSet decorators) {
4052   access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
4053 }
4054 
4055 // Used for storing NULLs.
4056 void MacroAssembler::store_heap_oop_null(Address dst) {
4057   access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
4058 }
4059 
4060 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
4061   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
4062   int index = oop_recorder()->allocate_metadata_index(obj);
4063   RelocationHolder rspec = metadata_Relocation::spec(index);
4064   return Address((address)obj, rspec);
4065 }
4066 
// Move an oop into a register.  immediate is true if we want immediate
// instructions, i.e. we are not going to patch this instruction while the
// code is being executed by another thread.  In that case we can use move
// immediates rather than the constant pool.
4071 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
4072   int oop_index;
4073   if (obj == NULL) {
4074     oop_index = oop_recorder()->allocate_oop_index(obj);
4075   } else {
4076 #ifdef ASSERT
4077     {
4078       ThreadInVMfromUnknown tiv;
4079       assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
4080     }
4081 #endif
4082     oop_index = oop_recorder()->find_index(obj);
4083   }
4084   RelocationHolder rspec = oop_Relocation::spec(oop_index);
4085   if (! immediate) {
4086     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
4087     ldr_constant(dst, Address(dummy, rspec));
4088   } else
4089     mov(dst, Address((address)obj, rspec));
4090 }
4091 
4092 // Move a metadata address into a register.
4093 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
4094   int oop_index;
4095   if (obj == NULL) {
4096     oop_index = oop_recorder()->allocate_metadata_index(obj);
4097   } else {
4098     oop_index = oop_recorder()->find_index(obj);
4099   }
4100   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
4101   mov(dst, Address((address)obj, rspec));
4102 }
4103 
4104 Address MacroAssembler::constant_oop_address(jobject obj) {
4105 #ifdef ASSERT
4106   {
4107     ThreadInVMfromUnknown tiv;
4108     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
4109     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
4110   }
4111 #endif
4112   int oop_index = oop_recorder()->find_index(obj);
4113   return Address((address)obj, oop_Relocation::spec(oop_index));
4114 }
4115 
4116 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4117 void MacroAssembler::tlab_allocate(Register obj,
4118                                    Register var_size_in_bytes,
4119                                    int con_size_in_bytes,
4120                                    Register t1,
4121                                    Register t2,
4122                                    Label& slow_case) {
4123   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4124   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
4125 }
4126 
4127 // Defines obj, preserves var_size_in_bytes
4128 void MacroAssembler::eden_allocate(Register obj,
4129                                    Register var_size_in_bytes,
4130                                    int con_size_in_bytes,
4131                                    Register t1,
4132                                    Label& slow_case) {
4133   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
4134   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
4135 }
4136 
4137 // Zero words; len is in bytes
4138 // Destroys all registers except addr
4139 // len must be a nonzero multiple of wordSize
4140 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
4141   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
4142 
4143 #ifdef ASSERT
4144   { Label L;
4145     tst(len, BytesPerWord - 1);
4146     br(Assembler::EQ, L);
4147     stop("len is not a multiple of BytesPerWord");
4148     bind(L);
4149   }
4150 #endif
4151 
4152 #ifndef PRODUCT
4153   block_comment("zero memory");
4154 #endif
4155 
4156   Label loop;
4157   Label entry;
4158 
4159 //  Algorithm:
4160 //
4161 //    scratch1 = cnt & 7;
4162 //    cnt -= scratch1;
4163 //    p += scratch1;
4164 //    switch (scratch1) {
4165 //      do {
4166 //        cnt -= 8;
4167 //          p[-8] = 0;
4168 //        case 7:
4169 //          p[-7] = 0;
4170 //        case 6:
4171 //          p[-6] = 0;
4172 //          // ...
4173 //        case 1:
4174 //          p[-1] = 0;
4175 //        case 0:
4176 //          p += 8;
4177 //      } while (cnt);
4178 //    }
4179 
4180   const int unroll = 8; // Number of str(zr) instructions we'll unroll
4181 
4182   lsr(len, len, LogBytesPerWord);
  andr(rscratch1, len, unroll - 1);  // rscratch1 = cnt % unroll
  sub(len, len, rscratch1);          // cnt -= (cnt % unroll)
4185   // t1 always points to the end of the region we're about to zero
4186   add(t1, addr, rscratch1, Assembler::LSL, LogBytesPerWord);
4187   adr(rscratch2, entry);
4188   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 2);
4189   br(rscratch2);
4190   bind(loop);
4191   sub(len, len, unroll);
4192   for (int i = -unroll; i < 0; i++)
4193     Assembler::str(zr, Address(t1, i * wordSize));
4194   bind(entry);
4195   add(t1, t1, unroll * wordSize);
4196   cbnz(len, loop);
4197 }
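
// Worked example of the computed branch above (illustrative): for
// cnt == 11 words, rscratch1 == 3 and t1 == addr + 3 words, so we branch
// three instructions before `entry` and execute only the last three
// str(zr) slots, zeroing words 0..2; the remaining eight words are then
// cleared by one full pass of the unrolled loop.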
4198 
4199 void MacroAssembler::verify_tlab() {
4200 #ifdef ASSERT
4201   if (UseTLAB && VerifyOops) {
4202     Label next, ok;
4203 
4204     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4205 
4206     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4207     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4208     cmp(rscratch2, rscratch1);
4209     br(Assembler::HS, next);
4210     STOP("assert(top >= start)");
4211     should_not_reach_here();
4212 
4213     bind(next);
4214     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4215     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4216     cmp(rscratch2, rscratch1);
4217     br(Assembler::HS, ok);
4218     STOP("assert(top <= end)");
4219     should_not_reach_here();
4220 
4221     bind(ok);
4222     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4223   }
4224 #endif
4225 }
4226 
// Writes to successive stack pages until the given offset is reached, to
// check for stack overflow plus shadow pages.  Clobbers tmp.
4229 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4230   assert_different_registers(tmp, size, rscratch1);
4231   mov(tmp, sp);
4232   // Bang stack for total size given plus shadow page size.
4233   // Bang one page at a time because large size can bang beyond yellow and
4234   // red zones.
4235   Label loop;
4236   mov(rscratch1, os::vm_page_size());
4237   bind(loop);
4238   lea(tmp, Address(tmp, -os::vm_page_size()));
4239   subsw(size, size, rscratch1);
4240   str(size, Address(tmp));
4241   br(Assembler::GT, loop);
4242 
4243   // Bang down shadow pages too.
4244   // At this point, (tmp-0) is the last address touched, so don't
4245   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
4246   // was post-decremented.)  Skip this address by starting at i=1, and
4247   // touch a few more pages below.  N.B.  It is important to touch all
4248   // the way down to and including i=StackShadowPages.
4249   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
    // This could be any sized move, but since it can serve as a debugging
    // crumb the bigger the better.
4252     lea(tmp, Address(tmp, -os::vm_page_size()));
4253     str(size, Address(tmp));
4254   }
4255 }
4256 
4257 
4258 // Move the address of the polling page into dest.
4259 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
4260   if (SafepointMechanism::uses_thread_local_poll()) {
4261     ldr(dest, Address(rthread, Thread::polling_page_offset()));
4262   } else {
4263     unsigned long off;
4264     adrp(dest, Address(page, rtype), off);
4265     assert(off == 0, "polling page must be page aligned");
4266   }
4267 }
4268 
4269 // Move the address of the polling page into r, then read the polling
4270 // page.
4271 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4272   get_polling_page(r, page, rtype);
4273   return read_polling_page(r, rtype);
4274 }
4275 
4276 // Read the polling page.  The address of the polling page must
4277 // already be in r.
4278 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4279   InstructionMark im(this);
4280   code_section()->relocate(inst_mark(), rtype);
4281   ldrw(zr, Address(r, 0));
4282   return inst_mark();
4283 }
4284 
4285 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4286   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4287   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4288   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4289   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4290   long offset_low = dest_page - low_page;
4291   long offset_high = dest_page - high_page;
4292 
4293   assert(is_valid_AArch64_address(dest.target()), "bad address");
4294   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4295 
4296   InstructionMark im(this);
4297   code_section()->relocate(inst_mark(), dest.rspec());
  // 8143067: Ensure that the adrp can reach the dest from anywhere within
  // the code cache so that if it is relocated we know it will still reach it.
4300   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4301     _adrp(reg1, dest.target());
4302   } else {
4303     unsigned long target = (unsigned long)dest.target();
4304     unsigned long adrp_target
4305       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
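    // Worked example with assumed values: if target == 0x00007f1234567000 and
    // pc() == 0x0000558000000000, then adrp_target == 0x0000558034567000.
    // Bits 47:32 of adrp_target match pc(), so adrp_target is always within
    // +/- 4G of pc() and the _adrp below can encode it; the movk that follows
    // patches bits 47:32 of reg1 back to the real value (0x7f12 here).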
4306 
4307     _adrp(reg1, (address)adrp_target);
4308     movk(reg1, target >> 32, 32);
4309   }
4310   byte_offset = (unsigned long)dest.target() & 0xfff;
4311 }
4312 
4313 void MacroAssembler::load_byte_map_base(Register reg) {
4314   jbyte *byte_map_base =
4315     ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base();
4316 
4317   if (is_valid_AArch64_address((address)byte_map_base)) {
4318     // Strictly speaking the byte_map_base isn't an address at all,
4319     // and it might even be negative.
4320     unsigned long offset;
4321     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4322     // We expect offset to be zero with most collectors.
4323     if (offset != 0) {
4324       add(reg, reg, offset);
4325     }
4326   } else {
4327     mov(reg, (uint64_t)byte_map_base);
4328   }
4329 }
4330 
4331 void MacroAssembler::build_frame(int framesize) {
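  // Frame layout sketch after this prologue (offsets in bytes; framesize is
  // assumed 16-byte aligned):
  //
  //   sp + framesize - 8  : saved lr
  //   sp + framesize - 16 : saved rfp
  //   sp .. sp + framesize - 16 : spill area for the compiled frame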
4332   assert(framesize > 0, "framesize must be > 0");
4333   if (framesize < ((1 << 9) + 2 * wordSize)) {
4334     sub(sp, sp, framesize);
4335     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4336     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
4337   } else {
4338     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4339     if (PreserveFramePointer) mov(rfp, sp);
4340     if (framesize < ((1 << 12) + 2 * wordSize))
4341       sub(sp, sp, framesize - 2 * wordSize);
4342     else {
4343       mov(rscratch1, framesize - 2 * wordSize);
4344       sub(sp, sp, rscratch1);
4345     }
4346   }
4347 }
4348 
4349 void MacroAssembler::remove_frame(int framesize) {
4350   assert(framesize > 0, "framesize must be > 0");
4351   if (framesize < ((1 << 9) + 2 * wordSize)) {
4352     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4353     add(sp, sp, framesize);
4354   } else {
4355     if (framesize < ((1 << 12) + 2 * wordSize))
4356       add(sp, sp, framesize - 2 * wordSize);
4357     else {
4358       mov(rscratch1, framesize - 2 * wordSize);
4359       add(sp, sp, rscratch1);
4360     }
4361     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4362   }
4363 }
4364 
4365 #ifdef COMPILER2
4366 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4367 
4368 // Search for str1 in str2 and return index or -1
4369 void MacroAssembler::string_indexof(Register str2, Register str1,
4370                                     Register cnt2, Register cnt1,
4371                                     Register tmp1, Register tmp2,
4372                                     Register tmp3, Register tmp4,
4373                                     Register tmp5, Register tmp6,
4374                                     int icnt1, Register result, int ae) {
4375   // NOTE: tmp5, tmp6 can be zr depending on specific method version
4376   Label LINEARSEARCH, LINEARSTUB, LINEAR_MEDIUM, DONE, NOMATCH, MATCH;
4377 
4378   Register ch1 = rscratch1;
4379   Register ch2 = rscratch2;
4380   Register cnt1tmp = tmp1;
4381   Register cnt2tmp = tmp2;
4382   Register cnt1_neg = cnt1;
4383   Register cnt2_neg = cnt2;
4384   Register result_tmp = tmp4;
4385 
4386   bool isL = ae == StrIntrinsicNode::LL;
4387 
4388   bool str1_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL;
4389   bool str2_isL = ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::LU;
4390   int str1_chr_shift = str1_isL ? 0:1;
4391   int str2_chr_shift = str2_isL ? 0:1;
4392   int str1_chr_size = str1_isL ? 1:2;
4393   int str2_chr_size = str2_isL ? 1:2;
4394   chr_insn str1_load_1chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
4395                                       (chr_insn)&MacroAssembler::ldrh;
4396   chr_insn str2_load_1chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
4397                                       (chr_insn)&MacroAssembler::ldrh;
4398   chr_insn load_2chr = isL ? (chr_insn)&MacroAssembler::ldrh : (chr_insn)&MacroAssembler::ldrw;
4399   chr_insn load_4chr = isL ? (chr_insn)&MacroAssembler::ldrw : (chr_insn)&MacroAssembler::ldr;
4400 
4401   // Note, inline_string_indexOf() generates checks:
4402   // if (substr.count > string.count) return -1;
4403   // if (substr.count == 0) return 0;
4404 
  // We have two strings, a source string in str2, cnt2 and a pattern string
  // in str1, cnt1. Find the first occurrence of the pattern in the source or return -1.
4407 
  // For a larger pattern and source we use a simplified Boyer-Moore algorithm.
  // With a small pattern and source we use a linear scan.
4410 
4411   if (icnt1 == -1) {
4412     sub(result_tmp, cnt2, cnt1);
4413     cmp(cnt1, (u1)8);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4414     br(LT, LINEARSEARCH);
4415     dup(v0, T16B, cnt1); // done in separate FPU pipeline. Almost no penalty
4416     subs(zr, cnt1, 256);
4417     lsr(tmp1, cnt2, 2);
4418     ccmp(cnt1, tmp1, 0b0000, LT); // Source must be 4 * pattern for BM
4419     br(GE, LINEARSTUB);
4420   }
4421 
// The Boyer-Moore algorithm is based on the description here:-
4423 //
4424 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4425 //
// This describes an algorithm with two shift rules: the 'Bad Character' rule
// and the 'Good Suffix' rule.
4428 //
4429 // These rules are essentially heuristics for how far we can shift the
4430 // pattern along the search string.
4431 //
4432 // The implementation here uses the 'Bad Character' rule only because of the
4433 // complexity of initialisation for the 'Good Suffix' rule.
4434 //
4435 // This is also known as the Boyer-Moore-Horspool algorithm:-
4436 //
4437 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4438 //
// This particular implementation has a few Java-specific optimizations.
4440 //
4441 // #define ASIZE 256
4442 //
4443 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4444 //       int i, j;
4445 //       unsigned c;
4446 //       unsigned char bc[ASIZE];
4447 //
4448 //       /* Preprocessing */
4449 //       for (i = 0; i < ASIZE; ++i)
4450 //          bc[i] = m;
4451 //       for (i = 0; i < m - 1; ) {
4452 //          c = x[i];
4453 //          ++i;
//          // c < 256 for a Latin1 string, so no need for a branch
4455 //          #ifdef PATTERN_STRING_IS_LATIN1
4456 //          bc[c] = m - i;
4457 //          #else
4458 //          if (c < ASIZE) bc[c] = m - i;
4459 //          #endif
4460 //       }
4461 //
4462 //       /* Searching */
4463 //       j = 0;
4464 //       while (j <= n - m) {
//          c = y[j+m-1];
//          if (x[m-1] == c) {
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//            if (i < 0) return j;
//          }
//          // c < 256 for a Latin1 string, so no need for a branch
4470 //          #ifdef SOURCE_STRING_IS_LATIN1
//          // LL case: (c < 256) is always true. Remove branch
4472 //          j += bc[y[j+m-1]];
4473 //          #endif
4474 //          #ifndef PATTERN_STRING_IS_UTF
4475 //          // UU case: need if (c<ASIZE) check. Skip 1 character if not.
4476 //          if (c < ASIZE)
4477 //            j += bc[y[j+m-1]];
4478 //          else
//            j += 1;
4480 //          #endif
4481 //          #ifdef PATTERN_IS_LATIN1_AND_SOURCE_IS_UTF
4482 //          // UL case: need if (c<ASIZE) check. Skip <pattern length> if not.
4483 //          if (c < ASIZE)
4484 //            j += bc[y[j+m-1]];
4485 //          else
//            j += m;
4487 //          #endif
4488 //       }
4489 //    }
4490 
4491   if (icnt1 == -1) {
4492     Label BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP, BMADV, BMMATCH,
4493         BMLOOPSTR1_LASTCMP, BMLOOPSTR1_CMP, BMLOOPSTR1_AFTER_LOAD, BM_INIT_LOOP;
4494     Register cnt1end = tmp2;
4495     Register str2end = cnt2;
4496     Register skipch = tmp2;
4497 
    // str1 length is >= 8, so we can read at least 1 register for cases when
    // UTF->Latin1 conversion is not needed (8 LL or 4 UU symbols) and half a
    // register for the UL case. We'll re-read the last character in the inner
    // pre-loop code to have a single outer pre-loop load
4502     const int firstStep = isL ? 7 : 3;
4503 
4504     const int ASIZE = 256;
    const int STORED_BYTES = 32; // number of bytes stored per instruction
4506     sub(sp, sp, ASIZE);
4507     mov(tmp5, ASIZE/STORED_BYTES); // loop iterations
4508     mov(ch1, sp);
4509     BIND(BM_INIT_LOOP);
4510       stpq(v0, v0, Address(post(ch1, STORED_BYTES)));
4511       subs(tmp5, tmp5, 1);
4512       br(GT, BM_INIT_LOOP);
4513 
4514       sub(cnt1tmp, cnt1, 1);
4515       mov(tmp5, str2);
4516       add(str2end, str2, result_tmp, LSL, str2_chr_shift);
4517       sub(ch2, cnt1, 1);
4518       mov(tmp3, str1);
4519     BIND(BCLOOP);
4520       (this->*str1_load_1chr)(ch1, Address(post(tmp3, str1_chr_size)));
4521       if (!str1_isL) {
4522         subs(zr, ch1, ASIZE);
4523         br(HS, BCSKIP);
4524       }
4525       strb(ch2, Address(sp, ch1));
4526     BIND(BCSKIP);
4527       subs(ch2, ch2, 1);
4528       br(GT, BCLOOP);
4529 
4530       add(tmp6, str1, cnt1, LSL, str1_chr_shift); // address after str1
4531       if (str1_isL == str2_isL) {
4532         // load last 8 bytes (8LL/4UU symbols)
4533         ldr(tmp6, Address(tmp6, -wordSize));
4534       } else {
4535         ldrw(tmp6, Address(tmp6, -wordSize/2)); // load last 4 bytes(4 symbols)
        // convert Latin1 to UTF. We'll have to wait until the load completes,
        // but it's still faster than per-character loads+checks
4538         lsr(tmp3, tmp6, BitsPerByte * (wordSize/2 - str1_chr_size)); // str1[N-1]
4539         ubfx(ch1, tmp6, 8, 8); // str1[N-2]
4540         ubfx(ch2, tmp6, 16, 8); // str1[N-3]
4541         andr(tmp6, tmp6, 0xFF); // str1[N-4]
4542         orr(ch2, ch1, ch2, LSL, 16);
4543         orr(tmp6, tmp6, tmp3, LSL, 48);
4544         orr(tmp6, tmp6, ch2, LSL, 16);
4545       }
4546     BIND(BMLOOPSTR2);
4547       (this->*str2_load_1chr)(skipch, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4548       sub(cnt1tmp, cnt1tmp, firstStep); // cnt1tmp is positive here, because cnt1 >= 8
4549       if (str1_isL == str2_isL) {
        // re-init tmp3. It's free because it executes in parallel with the
        // load above. The alternative is to initialize it before the loop, but
        // that would hurt performance on in-order systems with 2 or more ld/st pipelines
4553         lsr(tmp3, tmp6, BitsPerByte * (wordSize - str1_chr_size));
4554       }
4555       if (!isL) { // UU/UL case
4556         lsl(ch2, cnt1tmp, 1); // offset in bytes
4557       }
4558       cmp(tmp3, skipch);
4559       br(NE, BMSKIP);
4560       ldr(ch2, Address(str2, isL ? cnt1tmp : ch2));
4561       mov(ch1, tmp6);
4562       if (isL) {
4563         b(BMLOOPSTR1_AFTER_LOAD);
4564       } else {
4565         sub(cnt1tmp, cnt1tmp, 1); // no need to branch for UU/UL case. cnt1 >= 8
4566         b(BMLOOPSTR1_CMP);
4567       }
4568     BIND(BMLOOPSTR1);
4569       (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp, Address::lsl(str1_chr_shift)));
4570       (this->*str2_load_1chr)(ch2, Address(str2, cnt1tmp, Address::lsl(str2_chr_shift)));
4571     BIND(BMLOOPSTR1_AFTER_LOAD);
4572       subs(cnt1tmp, cnt1tmp, 1);
4573       br(LT, BMLOOPSTR1_LASTCMP);
4574     BIND(BMLOOPSTR1_CMP);
4575       cmp(ch1, ch2);
4576       br(EQ, BMLOOPSTR1);
4577     BIND(BMSKIP);
4578       if (!isL) {
        // if we've met a UTF symbol while searching for a Latin1 pattern, then
        // we can skip cnt1 symbols
4581         if (str1_isL != str2_isL) {
4582           mov(result_tmp, cnt1);
4583         } else {
4584           mov(result_tmp, 1);
4585         }
4586         subs(zr, skipch, ASIZE);
4587         br(HS, BMADV);
4588       }
4589       ldrb(result_tmp, Address(sp, skipch)); // load skip distance
4590     BIND(BMADV);
4591       sub(cnt1tmp, cnt1, 1);
4592       add(str2, str2, result_tmp, LSL, str2_chr_shift);
4593       cmp(str2, str2end);
4594       br(LE, BMLOOPSTR2);
4595       add(sp, sp, ASIZE);
4596       b(NOMATCH);
4597     BIND(BMLOOPSTR1_LASTCMP);
4598       cmp(ch1, ch2);
4599       br(NE, BMSKIP);
4600     BIND(BMMATCH);
4601       sub(result, str2, tmp5);
4602       if (!str2_isL) lsr(result, result, 1);
4603       add(sp, sp, ASIZE);
4604       b(DONE);
4605 
4606     BIND(LINEARSTUB);
    cmp(cnt1, (u1)16); // small patterns should still be handled by the simple algorithm
4608     br(LT, LINEAR_MEDIUM);
4609     mov(result, zr);
4610     RuntimeAddress stub = NULL;
4611     if (isL) {
4612       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ll());
4613       assert(stub.target() != NULL, "string_indexof_linear_ll stub has not been generated");
4614     } else if (str1_isL) {
4615       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_ul());
      assert(stub.target() != NULL, "string_indexof_linear_ul stub has not been generated");
4617     } else {
4618       stub = RuntimeAddress(StubRoutines::aarch64::string_indexof_linear_uu());
4619       assert(stub.target() != NULL, "string_indexof_linear_uu stub has not been generated");
4620     }
4621     trampoline_call(stub);
4622     b(DONE);
4623   }
4624 
4625   BIND(LINEARSEARCH);
4626   {
4627     Label DO1, DO2, DO3;
4628 
4629     Register str2tmp = tmp2;
4630     Register first = tmp3;
4631 
4632     if (icnt1 == -1)
4633     {
4634         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT;
4635 
4636         cmp(cnt1, u1(str1_isL == str2_isL ? 4 : 2));
4637         br(LT, DOSHORT);
4638       BIND(LINEAR_MEDIUM);
4639         (this->*str1_load_1chr)(first, Address(str1));
4640         lea(str1, Address(str1, cnt1, Address::lsl(str1_chr_shift)));
4641         sub(cnt1_neg, zr, cnt1, LSL, str1_chr_shift);
4642         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4643         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4644 
4645       BIND(FIRST_LOOP);
4646         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4647         cmp(first, ch2);
4648         br(EQ, STR1_LOOP);
4649       BIND(STR2_NEXT);
4650         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4651         br(LE, FIRST_LOOP);
4652         b(NOMATCH);
4653 
4654       BIND(STR1_LOOP);
4655         adds(cnt1tmp, cnt1_neg, str1_chr_size);
4656         add(cnt2tmp, cnt2_neg, str2_chr_size);
4657         br(GE, MATCH);
4658 
4659       BIND(STR1_NEXT);
4660         (this->*str1_load_1chr)(ch1, Address(str1, cnt1tmp));
4661         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4662         cmp(ch1, ch2);
4663         br(NE, STR2_NEXT);
4664         adds(cnt1tmp, cnt1tmp, str1_chr_size);
4665         add(cnt2tmp, cnt2tmp, str2_chr_size);
4666         br(LT, STR1_NEXT);
4667         b(MATCH);
4668 
4669       BIND(DOSHORT);
4670       if (str1_isL == str2_isL) {
4671         cmp(cnt1, (u1)2);
4672         br(LT, DO1);
4673         br(GT, DO3);
4674       }
4675     }
4676 
4677     if (icnt1 == 4) {
4678       Label CH1_LOOP;
4679 
4680         (this->*load_4chr)(ch1, str1);
4681         sub(result_tmp, cnt2, 4);
4682         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4683         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4684 
4685       BIND(CH1_LOOP);
4686         (this->*load_4chr)(ch2, Address(str2, cnt2_neg));
4687         cmp(ch1, ch2);
4688         br(EQ, MATCH);
4689         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4690         br(LE, CH1_LOOP);
4691         b(NOMATCH);
4692       }
4693 
4694     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 2) {
4695       Label CH1_LOOP;
4696 
4697       BIND(DO2);
4698         (this->*load_2chr)(ch1, str1);
4699         if (icnt1 == 2) {
4700           sub(result_tmp, cnt2, 2);
4701         }
4702         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4703         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4704       BIND(CH1_LOOP);
4705         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4706         cmp(ch1, ch2);
4707         br(EQ, MATCH);
4708         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4709         br(LE, CH1_LOOP);
4710         b(NOMATCH);
4711     }
4712 
4713     if ((icnt1 == -1 && str1_isL == str2_isL) || icnt1 == 3) {
4714       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4715 
4716       BIND(DO3);
4717         (this->*load_2chr)(first, str1);
4718         (this->*str1_load_1chr)(ch1, Address(str1, 2*str1_chr_size));
4719         if (icnt1 == 3) {
4720           sub(result_tmp, cnt2, 3);
4721         }
4722         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4723         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4724       BIND(FIRST_LOOP);
4725         (this->*load_2chr)(ch2, Address(str2, cnt2_neg));
4726         cmpw(first, ch2);
4727         br(EQ, STR1_LOOP);
4728       BIND(STR2_NEXT);
4729         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4730         br(LE, FIRST_LOOP);
4731         b(NOMATCH);
4732 
4733       BIND(STR1_LOOP);
4734         add(cnt2tmp, cnt2_neg, 2*str2_chr_size);
4735         (this->*str2_load_1chr)(ch2, Address(str2, cnt2tmp));
4736         cmp(ch1, ch2);
4737         br(NE, STR2_NEXT);
4738         b(MATCH);
4739     }
4740 
4741     if (icnt1 == -1 || icnt1 == 1) {
4742       Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP;
4743 
4744       BIND(DO1);
4745         (this->*str1_load_1chr)(ch1, str1);
4746         cmp(cnt2, (u1)8);
4747         br(LT, DO1_SHORT);
4748 
4749         sub(result_tmp, cnt2, 8/str2_chr_size);
4750         sub(cnt2_neg, zr, result_tmp, LSL, str2_chr_shift);
4751         mov(tmp3, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4752         lea(str2, Address(str2, result_tmp, Address::lsl(str2_chr_shift)));
4753 
4754         if (str2_isL) {
4755           orr(ch1, ch1, ch1, LSL, 8);
4756         }
4757         orr(ch1, ch1, ch1, LSL, 16);
4758         orr(ch1, ch1, ch1, LSL, 32);
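      // The loop below uses the classic SWAR zero-lane test: with
      // v = loaded_word ^ replicated_char, the expression
      // (v - 0x01..01) & ~(v | 0x7f..7f) is nonzero iff some lane of v is
      // zero, i.e. some character matched. bics computes the and-not and
      // sets the flags in one step; the lowest matching lane is then located
      // with rev+clz at HAS_ZERO.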
4759       BIND(CH1_LOOP);
4760         ldr(ch2, Address(str2, cnt2_neg));
4761         eor(ch2, ch1, ch2);
4762         sub(tmp1, ch2, tmp3);
4763         orr(tmp2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4764         bics(tmp1, tmp1, tmp2);
4765         br(NE, HAS_ZERO);
4766         adds(cnt2_neg, cnt2_neg, 8);
4767         br(LT, CH1_LOOP);
4768 
4769         cmp(cnt2_neg, (u1)8);
4770         mov(cnt2_neg, 0);
4771         br(LT, CH1_LOOP);
4772         b(NOMATCH);
4773 
4774       BIND(HAS_ZERO);
4775         rev(tmp1, tmp1);
4776         clz(tmp1, tmp1);
4777         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4778         b(MATCH);
4779 
4780       BIND(DO1_SHORT);
4781         mov(result_tmp, cnt2);
4782         lea(str2, Address(str2, cnt2, Address::lsl(str2_chr_shift)));
4783         sub(cnt2_neg, zr, cnt2, LSL, str2_chr_shift);
4784       BIND(DO1_LOOP);
4785         (this->*str2_load_1chr)(ch2, Address(str2, cnt2_neg));
4786         cmpw(ch1, ch2);
4787         br(EQ, MATCH);
4788         adds(cnt2_neg, cnt2_neg, str2_chr_size);
4789         br(LT, DO1_LOOP);
4790     }
4791   }
4792   BIND(NOMATCH);
4793     mov(result, -1);
4794     b(DONE);
4795   BIND(MATCH);
4796     add(result, result_tmp, cnt2_neg, ASR, str2_chr_shift);
4797   BIND(DONE);
4798 }
4799 
4800 typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr);
4801 typedef void (MacroAssembler::* uxt_insn)(Register Rd, Register Rn);
4802 
4803 void MacroAssembler::string_indexof_char(Register str1, Register cnt1,
4804                                          Register ch, Register result,
4805                                          Register tmp1, Register tmp2, Register tmp3)
4806 {
4807   Label CH1_LOOP, HAS_ZERO, DO1_SHORT, DO1_LOOP, MATCH, NOMATCH, DONE;
4808   Register cnt1_neg = cnt1;
4809   Register ch1 = rscratch1;
4810   Register result_tmp = rscratch2;
4811 
4812   cmp(cnt1, (u1)4);
4813   br(LT, DO1_SHORT);
4814 
4815   orr(ch, ch, ch, LSL, 16);
4816   orr(ch, ch, ch, LSL, 32);
4817 
4818   sub(cnt1, cnt1, 4);
4819   mov(result_tmp, cnt1);
4820   lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4821   sub(cnt1_neg, zr, cnt1, LSL, 1);
4822 
4823   mov(tmp3, 0x0001000100010001);
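  // tmp3 and the 0x7fff... mask below implement the same SWAR zero-halfword
  // test described in string_indexof above, here always on UTF-16 lanes.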
4824 
4825   BIND(CH1_LOOP);
4826     ldr(ch1, Address(str1, cnt1_neg));
4827     eor(ch1, ch, ch1);
4828     sub(tmp1, ch1, tmp3);
4829     orr(tmp2, ch1, 0x7fff7fff7fff7fff);
4830     bics(tmp1, tmp1, tmp2);
4831     br(NE, HAS_ZERO);
4832     adds(cnt1_neg, cnt1_neg, 8);
4833     br(LT, CH1_LOOP);
4834 
4835     cmp(cnt1_neg, (u1)8);
4836     mov(cnt1_neg, 0);
4837     br(LT, CH1_LOOP);
4838     b(NOMATCH);
4839 
4840   BIND(HAS_ZERO);
4841     rev(tmp1, tmp1);
4842     clz(tmp1, tmp1);
4843     add(cnt1_neg, cnt1_neg, tmp1, LSR, 3);
4844     b(MATCH);
4845 
4846   BIND(DO1_SHORT);
4847     mov(result_tmp, cnt1);
4848     lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4849     sub(cnt1_neg, zr, cnt1, LSL, 1);
4850   BIND(DO1_LOOP);
4851     ldrh(ch1, Address(str1, cnt1_neg));
4852     cmpw(ch, ch1);
4853     br(EQ, MATCH);
4854     adds(cnt1_neg, cnt1_neg, 2);
4855     br(LT, DO1_LOOP);
4856   BIND(NOMATCH);
4857     mov(result, -1);
4858     b(DONE);
4859   BIND(MATCH);
4860     add(result, result_tmp, cnt1_neg, ASR, 1);
4861   BIND(DONE);
4862 }
4863 
4864 // Summary: Compare strings intrinsic implementation. All combinations of UTF-16
4865 //          and Latin1 encodings for both strings are considered. Comparison
4866 //          is performed in lexical order.
4867 //
4868 // Input:   str1: pointer to 1st string
4869 //          str2: pointer to 2nd string
4870 //          cnt1: number of bytes in 1st string
4871 //          cnt2: number of bytes in 2nd string
4872 //
4873 // Algorithm parameter:
4874 //          ae: encodings used in 1st and 2nd strings
4875 //
4876 // Temporary registers:
4877 //          tmp1, tmp2, rscratch1, rscratch2: always used
4878 //          vtmp1, vtmp2, vtmp3: used in case encodings are different
4879 //
4880 // Output:  result - return 0 if strings are equal. Returns positive value
4881 //          if 1st string > 2nd string in lexical order. Returns
4882 //          negative value if 1st string < 2nd string.
4883 //
4884 // Side effects: str1, str2, cnt1, cnt2, tmp1, tmp2, rscratch1, rscratch2: clobbered.
//               vtmp1, vtmp2, vtmp3: clobbered if encodings are different
4886 //
// Additional data: boolean values isLL, isLU, isUL, str1_isL, str2_isL and
// int minCharsInWord are derived from the ae parameter based on the encodings
// used in the strings. Different code is generated depending on these values:
4890 //
4891 // isLL = both strings are Latin1
4892 // isLU = 1st string is Latin1, 2nd string is UTF-16
4893 // isUL = 1st string is UTF-16, 2nd string is Latin1
4894 // str1_isL = 1st string is Latin1
4895 // str2_isL = 2nd string is Latin1
// str1_chr_shift = shift value to convert a character count to a byte count for 1st string
// str2_chr_shift = shift value to convert a character count to a byte count for 2nd string
// minCharsInWord = minimum number of characters that fit in a register (8 for LL case, 4 otherwise)
4899 //
4900 //
4901 // PSEUDO CODE:
4902 //
4903 // // N.B.: this pseudo-code doesn't strictly follow implementation details.
4904 // // It is here to help understand the basics. Detailed implementation
4905 // // description is listed after this code.
4906 //
4907 // <convert byte counters cnt1, cnt2 into character counters if UTF-16 encoding is used>;
// result = cnt1 - cnt2; // length difference. Used if all min(cnt1, cnt2) characters are the same
4909 // cnt2 = min(cnt1, cnt2); // amount of characters to check
// if (cnt2 <= minCharsInWord) { // <= wordSize bytes need to be loaded for comparison
4911 //   if (cnt2 == 0) return result;
4912 //   while (cnt2 != 0) {
4913 //     char str1char = str1[0];
4914 //     char str2char = str2[0];
4915 //     str1 += 1 << str1_chr_shift; // advance pointer by size of str1 character
4916 //     str2 += 1 << str2_chr_shift; // advance pointer by size of str2 character
//     if (str1char != str2char) return str1char - str2char;
4918 //     cnt2--;
4919 //   }
4920 // } else { // > wordSize bytes should be loaded for comparison
//     // This code checks strings in 8-byte blocks. If the encodings are
//     // different, the Latin1 string is loaded in 4-byte blocks and each
//     // block is converted to its 8-byte UTF-16 equivalent. Then the 8-byte
//     // blocks are compared. Each load is 8 characters for the LL case and 4
//     // characters for LU/UL/UU.
//     // This set of instructions (load 8 Latin1 characters OR load 4 Latin1
//     // characters and convert them to 4 UTF-16 characters OR load 4 UTF-16
//     // characters) is referred to as <load-and-probably-convert ...> below.
4929 //
4930 //     // First iteration in the loop is unrolled to add initialization.
4931 //
//     // The code below calculates the address of each string's last load:
//     // the address of the last 8 characters for the LL case and of the last
//     // 4 characters otherwise. Then the offsets from those addresses to the
//     // beginning of the strings are calculated. The offset is then used as
//     // the loop counter. When an offset is >= 0, only the last (possibly
//     // overlapping) loads are left to be checked.
//     // N.B.: in case of same encodings, offsets are the same for both strings.
//     // Then the offset for the 2nd string is used for both strings.
4939 //
4940 //     tmp1 = <load-and-probably-convert str1>;
4941 //     if (str1 == str2) return result;
4942 //     tmp2 = <load-and-probably-convert str2>;
4943 //
4944 //     // use special implementation optimized for large strings. See detailed code and stub comments.
4945 //     if (cnt2 >= 72) return compare_long_string_implementation(<args>);
4946 //
4947 //     cnt2 -= <amount of loaded characters>; // 8 for isLL case. 4 otherwise.
4948 //
4949 //     if (str1_isL == str2_isL) {
4950 //       // Optional optimization for same encoding cases. Can be applied for all
4951 //       // cases, but is faster in same encoding cases only. Without this branch
//       // the smallest string (8 characters for LL and 4 characters otherwise) would
//       // be checked twice.
4954 //       if (cnt2 == 0) goto TAIL_CHECK; // no more characters to be loaded. Just check already loaded data.
4955 //     }
4956 //
4957 //     // calculate addresses of last loads. use str1 and str2 pointers for that
4958 //     str1 = str1 + cnt2 << str1_chr_shift;
4959 //     str2 = str2 + cnt2 << str2_chr_shift;
4960 //
4961 //     // calculate offsets for both strings. cnt1 and cnt2 can be reused
4962 //     if (str1_isL != str2_isL) cnt1 = - (cnt2 << str1_chr_shift);
4963 //     cnt2 = - (cnt2 << str2_chr_shift);
4964 //
4965 //     // increment calculated offsets by the number of already loaded bytes
4966 //     if (isLU) cnt1 += 4;
4967 //     if (isUL) cnt1 += 8;
4968 //     cnt2 += isUL ? 4 : 8;
4969 //
//     if (cnt2 >= 0) goto TAIL; // only the last loads remain. Still need to check currently loaded data.
4971 //
4972 //     rscratch2 = tmp1 BIT_XOR tmp2;
4973 //     if (rscratch2 != 0) goto DIFFERENCE;
4974 //
4975 //     // main loop. Label = NEXT_WORD
4976 //     do {
4977 //       tmp1 = <load-and-probably-convert str1 at offset of (str1_isL == str2_isL ? cnt2 : cnt1)>;
4978 //       tmp2 = <load-and-probably-convert str2 at offset of cnt2>;
4979 //
4980 //       // update offsets by the number of loaded bytes
4981 //       cnt2 += isUL ? 4 : 8;
4982 //       if (isLU) cnt1 += 4;
4983 //       if (isUL) cnt1 += 8;
4984 //
4985 //       if (cnt2 >= 0) goto TAIL; // last block left to be loaded. Still need to check currently loaded block.
4986 //       rscratch2 = tmp1 BIT_XOR tmp2;
4987 //     } while (rscratch2 == 0);
//     goto DIFFERENCE;
4989 //
4990 //   TAIL: // last block left to be loaded. Still need to check currently loaded block.
4991 //     rscratch2 = tmp1 BIT_XOR tmp2;
4992 //     if (rscratch2 != 0) goto DIFFERENCE;
4993 //     tmp1 = <load-and-probably-convert str1>;
4994 //     tmp2 = <load-and-probably-convert str2>;
4995 //     // fallthrough to TAIL_CHECK
4996 //   TAIL_CHECK:
4997 //     rscratch2 = tmp1 BIT_XOR tmp2;
4998 //     if (rscratch2 == 0) return result;
4999 //   DIFFERENCE: // different character found. Find it and compute difference
5000 //     // tmp1 and tmp2 have current data with at least 1 different character.
5001 //     // Find index of first such character.
5002 //     rscratch2 = REVERSE_BITS(rscratch2);
5003 //     rscratch2 = COUNT_LEADING_ZEROES(rscratch2); // position of different bit in current 8 bytes
5004 //     rscratch2 = rscratch2 & (isLL ? -8 : -16); // number of bits until (possibly converted) different characters in tmp1 and tmp2
5005 //     tmp1 = tmp1 >> rscratch2; // now first character in tmp1 is the one sought for
5006 //     tmp1 = tmp1 & (isLL ? 0xFF : 0xFFFF); // only first different character left
5007 //     tmp2 = tmp2 >> rscratch2; // now first character in tmp2 is the one sought for
5008 //     tmp2 = tmp2 & (isLL ? 0xFF : 0xFFFF); // only first different character left
5009 //     result = tmp1 - tmp2;
5010 // }
5011 // return result;
5012 //
5013 //
5014 //
5015 // DETAILED CODE:
5016 //
5017 //  if (!str1_isL) cnt1 = cnt1 >> 1;                // counter for 1st string (in characters)
5018 //  if (!str2_isL) cnt2 = cnt2 >> 1;                // counter for 2nd string (in characters)
5019 //  result = cnt1 - cnt2;                           // keep in flags the result of operation
5020 //  cnt2 = min(cnt1, cnt2);                         // implemented as csel instruction using stored flag value above
//  bool shortStringsCase = cnt2 <= minCharsInWord; // kept in flags
5022 //  if (shortStringsCase) goto SHORT_STRING;        // separate code for short strings
5023 //  if (str1_isL == str2_isL) {                     // same encoding case
5024 //    tmp1 = LOAD8BYTES(str1);
5025 //    bool sameString = str1 == str2;               // kept in flags
5026 //    if (sameString) goto DONE;                    // the string is the same, return
5027 //    tmp2 = LOAD8BYTES(str2);
5028 //    bool largeStrings = cnt2 >= 72;               // kept in flags
5029 //    if (largeStrings) goto STUB;                  // handled in separate stub implementation for large strings
5030 //    cnt2 = cnt2 - minCharsInWord;                 // decrement counter by the number of loaded characters
5031 //    bool noMoreLoadsAvailable = cnt2 == 0;        // kept in flags
5032 //    if (noMoreLoadsAvailable) goto TAIL_CHECK;
5033 //    str2 = str2 + cnt2 << str2_chr_shift;         // address of str2 last load
5034 //    str1 = str1 + cnt2 << str1_chr_shift;         // address of str1 last load
5035 //    cnt2 = -(cnt2 << str2_chr_shift);             // byte offset to 1st character in each string
5036 //  } else if (isLU) {
5037 //    vtmp = LOAD4BYTES(str1);
5038 //    bool sameString = str1 == str2;               // kept in flags
5039 //    if (sameString) goto DONE;                    // return
5040 //    tmp2 = LOAD8BYTES(str2);
5041 //    bool largeStrings = cnt2 >= 72;               // kept in flags
5042 //    if (largeStrings) goto STUB;                  // handled in separate stub implementation for large strings
//    cnt2 = cnt2 - 4;                              // decrement counter by the number of loaded characters
5044 //    vtmpz = 0;                                    // implemented as eor
5045 //    str1 = str1 + cnt2 << str1_chr_shift;         // address of str1 last load
5046 //    str2 = str2 + cnt2 << str2_chr_shift;         // address of str2 last load
5047 //    vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz);  // convert Latin1 to UTF16 because it'll be compared with UTF16. Implemented as zip instruction
5048 //    cnt1 = -(cnt2 << str1_chr_shift);             // byte offset to 1st character in 1st string
5049 //    cnt2 = -(cnt2 << str2_chr_shift);             // byte offset to 1st character in 2nd string
5050 //    cnt1 = cnt1 + 4;                              // advance 1st string offset by the number of loaded bytes
5051 //    tmp1 = vtmp;                                  // move converted characters from FPU register to GPR
5052 //  } else { // UL
5053 //    tmp1 = LOAD8BYTES(str1);
5054 //    bool sameString = str1 == str2;               // kept in flags
5055 //    if (sameString) goto DONE;                    // return
5056 //    vtmp = LOAD4BYTES(str2);
5057 //    bool largeStrings = cnt2 >= 72;               // kept in flags
5058 //    if (largeStrings) goto STUB;                  // separate stub implementation for large strings
5059 //    cnt2 = cnt2 - 4;                              // update counter by the number of loaded characters
5060 //    str1 = str1 + cnt2 << str1_chr_shift;         // address of str1 last load
5061 //    vtmpz = 0;                                    // implemented as eor
5062 //    str2 = str2 + cnt2 << str2_chr_shift;         // address of str2 last load
5063 //    cnt1 = -(cnt2 << str1_chr_shift);             // byte offset to 1st character in 1st string
5064 //    vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz);  // convert Latin1 to UTF16 because it'll be compared with UTF16. implemented as zip instruction
5065 //    cnt2 = -(cnt2 << str2_chr_shift);             // byte offset to 1st character in 2nd string
5066 //    cnt1 = cnt1 + 8;                              // advance 1st string offset by the number of loaded bytes
5067 //    tmp2 = vtmp;                                  // move converted characters from FPU register to GPR
5068 //  }
5069 //  cnt2 = cnt2 + (isUL ? 4 : 8);                   // update offset by the number of loaded bytes
5070 //  bool onlyLastLoadRemains = cnt2 >= 0;           // kept in flags
5071 //  if (onlyLastLoadRemains) goto TAIL;
5072 //  rscratch2 = BIT_XOR(tmp1, tmp2);                // current block comparison result
5073 //  if (rscratch2 != 0) goto DIFFERENCE;            // found different characters in current block
5074 // NEXT_WORD:                                       // main loop
5075 //  // implementation for each encoding loads 4 or 8 characters at calculated
5076 //  // offsets from each string and convert encodings if necessary. Then offsets
5077 //  // are updated.
5078 //  if (str1_isL == str2_isL) {
5079 //    tmp1 = LOAD8BYTES(str1, cnt2);
5080 //    tmp2 = LOAD8BYTES(str2, cnt2);
5081 //    cnt2 = cnt2 + 8;                              // update counter by the number of loaded bytes
5082 //    onlyLastLoadRemains = cnt2 >= 0;              // kept in flags
5083 //  } else if (isLU) {
5084 //    vtmp = LOAD4BYTES(str1, cnt1);
5085 //    tmp2 = LOAD8BYTES(str2, cnt2);
5086 //    cnt1 = cnt1 + 4;
5087 //    vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz);
5088 //    tmp1 = vtmp;
5089 //    cnt2 = cnt2 + 8;
5090 //    onlyLastLoadRemains = cnt2 >= 0;              // kept in flags
5091 //  } else { // UL
5092 //    vtmp = LOAD4BYTES(str2, cnt2);
5093 //    tmp1 = LOAD8BYTES(str1, cnt1);
5094 //    vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz);
5095 //    cnt1 = cnt1 + 8;
5096 //    tmp2 = vtmp;
5097 //    cnt2 = cnt2 + 4;
5098 //    onlyLastLoadRemains = cnt2 >= 0;              // kept in flags
5099 //  }
5100 //  if (onlyLastLoadRemains) goto TAIL;
5101 //  rscratch2 = BIT_XOR(tmp1, tmp2);                // current block comparison result
5102 //  if (rscratch2 == 0) goto NEXT_WORD;
5103 //  goto DIFFERENCE;
5104 // TAIL: // check already loaded data and last load
5105 //  rscratch2 = BIT_XOR(tmp1, tmp2);                // current block comparison result
5106 //  if (rscratch2 != 0) goto DIFFERENCE;
5107 //
5108 //  // last load (and convert if needed) from each string
5109 //  if (str1_isL == str2_isL) {
5110 //    tmp1 = LOAD8BYTES(str1);
5111 //    tmp2 = LOAD8BYTES(str2);
5112 //  } else if (isLU) {
5113 //    vtmp = LOAD4BYTES(str1);
5114 //    tmp2 = LOAD8BYTES(str2);
5115 //    vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz);
5116 //    tmp1 = vtmp;
5117 //  } else { // UL
5118 //    vtmp = LOAD4BYTES(str2);
5119 //    tmp1 = LOAD8BYTES(str1);
5120 //    vtmp = CONVERT_LATIN1_TO_UTF16(vtmp, vtmpz);
5121 //    tmp2 = vtmp;
5122 //  }
5123 // TAIL_CHECK:                                      // last check
5124 //  rscratch2 = BIT_XOR(tmp1, tmp2);                // current block comparison result
5125 //  if (rscratch2 == 0) goto DONE;                  // return
5126 // DIFFERENCE:
//  rscratch2 = REVERSE_BITS(rscratch2);            // There is no count-trailing-zeroes instruction; reverse the bits and count leading zeroes instead.
5128 //  rscratch2 = COUNT_LEADING_ZEROES(rscratch2);    // position of different bit in current 8 bytes
5129 //  rscratch2 = rscratch2 & (isLL ? -8 : -16);      // number of bits until (possibly converted) different characters in tmp1 and tmp2
5130 //  tmp1 = tmp1 >> rscratch2;                       // first character in tmp1 is the one sought for
5131 //  tmp1 = isLL ? UNSIGNED_EXTEND_BYTE2INT(tmp1) : UNSIGNED_EXTEND_SHORT2INT(tmp1); // only first different character left
5132 //  tmp2 = tmp2 >> rscratch2;                       // first character in tmp2 is the one sought for
//  tmp2 = isLL ? UNSIGNED_EXTEND_BYTE2INT(tmp2) : UNSIGNED_EXTEND_SHORT2INT(tmp2); // only first different character left
5134 //  result = tmp1 - tmp2;
5135 //  goto DONE;
5136 // }
5137 //
5138 // STUB:
5139 //  <get address of compare_long_string_[LL|UU|LU|UL] stub routine and call it>
5140 //  goto DONE;
5141 //
// // Short strings comparison code. Instead of a simple per-character loop
// // with load-and-compare code, it uses a loop that issues 2 per-character
// // loads from each string per iteration. Different registers are used for
// // that to remove dependencies: the (tmp1, cnt1) and (tmp2, rscratch1) pairs.
// // The first character loads are issued in the pre-loop.
5147 // SHORT_STRING:
5148 //  if (cnt2 == 0) goto DONE;                       // no characters to compare. Length difference (already calculated) should be used as result
5149 //  tmp1 = LOAD_STR1_CHAR(str1);
//  str1 = str1 + str1_chr_size;                    // merged with load above as post-increment
5151 //  cnt2 = cnt2 - 1;                                // calculate remaining length after first character is loaded
5152 //  bool endReached = cnt2 == 0;                    // kept in flags
//  if (endReached) goto SHORT_LAST_INIT;           // load 1 character from 2nd string to complete init and compare it with 1st string character
5154 //  cnt1 = LOAD_STR2_CHAR(str2);
5155 //  str2 = str2 + str2_chr_size;                    // merged with load above as post-increment
5156 //  goto SHORT_LOOP_START;                          // per-character loop entry point
5157 // SHORT_LOOP:                                      // per-character loop
5158 //  cnt2 = cnt2 - 1;                                // calculate remaining length
5159 //  endReached = cnt2 == 0;
5160 //  if (endReached) goto SHORT_LAST_INIT;
5161 // SHORT_LOOP_START:                                // per-character loop entry point
5162 //  tmp2 = LOAD_STR1_CHAR(str1);
5163 //  rscratch1 = LOAD_STR2_CHAR(str2);
5164 //  bool differentResult = tmp1 != cnt1;            // check difference of previously loaded pair of registers while last pair is still loading. Kept in flags
5165 //  if (differentResult) goto SHORT_LOOP_TAIL;      // calculate character difference and return
5166 //  cnt2 = cnt2 - 1;                                // calculate remaining length
5167 //  endReached = cnt2 == 0;
5168 //  if (endReached) goto SHORT_LAST2;               // last comparison of second pair of registers (tmp2, rscratch1) is left
5169 //  tmp1 = LOAD_STR1_CHAR(str1);
5170 //  cnt1 = LOAD_STR2_CHAR(str2);
5171 //  bool sameResult = tmp2 == rscratch1;            // check difference of previously loaded pair of registers while last pair is still loading. Kept in flags
5172 //  if (sameResult) goto SHORT_LOOP;
5173 //  result = tmp2 - rscratch1;
5174 //  goto DONE;
5175 // SHORT_LAST2:                                     // last comparison is left: (tmp2, rscratch1)
5176 //  sameResult = tmp2 == rscratch1;
5177 //  if (sameResult) goto DONE;
5178 //  result = tmp2 - rscratch1;
5179 //  goto DONE;
5180 // SHORT_LAST_INIT:
5181 //  cnt1 = LOAD_STR2_CHAR(str2);
5182 // SHORT_LAST:                                      // last comparison of second pair of registers (tmp1, cnt1) is left
5183 //  sameResult = tmp1 == cnt1;
5184 //  if (sameResult) goto DONE;
5185 //  result = tmp1 - cnt1;
5186 // DONE:
5187 //  return;                                         // result
5188 
5189 void MacroAssembler::string_compare(Register str1, Register str2,
5190     Register cnt1, Register cnt2, Register result, Register tmp1, Register tmp2,
5191     FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, int ae) {
5192   Label DONE, SHORT_LOOP, SHORT_STRING, SHORT_LAST, TAIL, STUB,
5193       DIFFERENCE, NEXT_WORD, SHORT_LOOP_TAIL, SHORT_LAST2, SHORT_LAST_INIT,
5194       SHORT_LOOP_START, TAIL_CHECK;
5195 
5196   const u1 STUB_THRESHOLD = 64 + 8;
5197   bool isLL = ae == StrIntrinsicNode::LL;
5198   bool isLU = ae == StrIntrinsicNode::LU;
5199   bool isUL = ae == StrIntrinsicNode::UL;
5200 
5201   bool str1_isL = isLL || isLU;
5202   bool str2_isL = isLL || isUL;
5203 
5204   int str1_chr_shift = str1_isL ? 0 : 1;
5205   int str2_chr_shift = str2_isL ? 0 : 1;
5206   int str1_chr_size = str1_isL ? 1 : 2;
5207   int str2_chr_size = str2_isL ? 1 : 2;
5208   int minCharsInWord = isLL ? wordSize : wordSize/2;
5209 
5210   FloatRegister vtmpZ = vtmp1, vtmp = vtmp2;
5211   chr_insn str1_load_chr = str1_isL ? (chr_insn)&MacroAssembler::ldrb :
5212                                       (chr_insn)&MacroAssembler::ldrh;
5213   chr_insn str2_load_chr = str2_isL ? (chr_insn)&MacroAssembler::ldrb :
5214                                       (chr_insn)&MacroAssembler::ldrh;
5215   uxt_insn ext_chr = isLL ? (uxt_insn)&MacroAssembler::uxtbw :
5216                             (uxt_insn)&MacroAssembler::uxthw;
5217 
5218   BLOCK_COMMENT("string_compare {");
5219 
  // Bizarrely, the counts are passed in bytes, regardless of whether they
  // are L or U strings; however, the result is always in characters.
5222   if (!str1_isL) asrw(cnt1, cnt1, 1);
5223   if (!str2_isL) asrw(cnt2, cnt2, 1);
5224 
5225   // Compute the minimum of the string lengths and save the difference.
5226   subsw(result, cnt1, cnt2);
5227   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
5228 
5229   // A very short string
5230   cmpw(cnt2, minCharsInWord);
5231   br(Assembler::LE, SHORT_STRING);
5232 
5233   // Compare longwords
5234   // load first parts of strings and finish initialization while loading
5235   {
5236     if (str1_isL == str2_isL) { // LL or UU
5237       ldr(tmp1, Address(str1));
5238       cmp(str1, str2);
5239       br(Assembler::EQ, DONE);
5240       ldr(tmp2, Address(str2));
5241       cmp(cnt2, STUB_THRESHOLD);
5242       br(GE, STUB);
5243       subsw(cnt2, cnt2, minCharsInWord);
5244       br(EQ, TAIL_CHECK);
5245       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
5246       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
5247       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
5248     } else if (isLU) {
5249       ldrs(vtmp, Address(str1));
5250       cmp(str1, str2);
5251       br(Assembler::EQ, DONE);
5252       ldr(tmp2, Address(str2));
5253       cmp(cnt2, STUB_THRESHOLD);
5254       br(GE, STUB);
5255       subw(cnt2, cnt2, 4);
5256       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
5257       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
5258       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
5259       zip1(vtmp, T8B, vtmp, vtmpZ);
5260       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
5261       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
5262       add(cnt1, cnt1, 4);
5263       fmovd(tmp1, vtmp);
5264     } else { // UL case
5265       ldr(tmp1, Address(str1));
5266       cmp(str1, str2);
5267       br(Assembler::EQ, DONE);
5268       ldrs(vtmp, Address(str2));
5269       cmp(cnt2, STUB_THRESHOLD);
5270       br(GE, STUB);
5271       subw(cnt2, cnt2, 4);
5272       lea(str1, Address(str1, cnt2, Address::uxtw(str1_chr_shift)));
5273       eor(vtmpZ, T16B, vtmpZ, vtmpZ);
5274       lea(str2, Address(str2, cnt2, Address::uxtw(str2_chr_shift)));
5275       sub(cnt1, zr, cnt2, LSL, str1_chr_shift);
5276       zip1(vtmp, T8B, vtmp, vtmpZ);
5277       sub(cnt2, zr, cnt2, LSL, str2_chr_shift);
5278       add(cnt1, cnt1, 8);
5279       fmovd(tmp2, vtmp);
5280     }
5281     adds(cnt2, cnt2, isUL ? 4 : 8);
5282     br(GE, TAIL);
5283     eor(rscratch2, tmp1, tmp2);
5284     cbnz(rscratch2, DIFFERENCE);
5285     // main loop
5286     bind(NEXT_WORD);
5287     if (str1_isL == str2_isL) {
5288       ldr(tmp1, Address(str1, cnt2));
5289       ldr(tmp2, Address(str2, cnt2));
5290       adds(cnt2, cnt2, 8);
5291     } else if (isLU) {
5292       ldrs(vtmp, Address(str1, cnt1));
5293       ldr(tmp2, Address(str2, cnt2));
5294       add(cnt1, cnt1, 4);
5295       zip1(vtmp, T8B, vtmp, vtmpZ);
5296       fmovd(tmp1, vtmp);
5297       adds(cnt2, cnt2, 8);
5298     } else { // UL
5299       ldrs(vtmp, Address(str2, cnt2));
5300       ldr(tmp1, Address(str1, cnt1));
5301       zip1(vtmp, T8B, vtmp, vtmpZ);
5302       add(cnt1, cnt1, 8);
5303       fmovd(tmp2, vtmp);
5304       adds(cnt2, cnt2, 4);
5305     }
5306     br(GE, TAIL);
5307 
5308     eor(rscratch2, tmp1, tmp2);
5309     cbz(rscratch2, NEXT_WORD);
5310     b(DIFFERENCE);
5311     bind(TAIL);
5312     eor(rscratch2, tmp1, tmp2);
5313     cbnz(rscratch2, DIFFERENCE);
5314     // Last longword.  In the case where length == 4 we compare the
5315     // same longword twice, but that's still faster than another
5316     // conditional branch.
5317     if (str1_isL == str2_isL) {
5318       ldr(tmp1, Address(str1));
5319       ldr(tmp2, Address(str2));
5320     } else if (isLU) {
5321       ldrs(vtmp, Address(str1));
5322       ldr(tmp2, Address(str2));
5323       zip1(vtmp, T8B, vtmp, vtmpZ);
5324       fmovd(tmp1, vtmp);
5325     } else { // UL
5326       ldrs(vtmp, Address(str2));
5327       ldr(tmp1, Address(str1));
5328       zip1(vtmp, T8B, vtmp, vtmpZ);
5329       fmovd(tmp2, vtmp);
5330     }
5331     bind(TAIL_CHECK);
5332     eor(rscratch2, tmp1, tmp2);
5333     cbz(rscratch2, DONE);
5334 
5335     // Find the first different characters in the longwords and
5336     // compute their difference.
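    // (rev+clz locates the first differing bit counted from the low, i.e.
    // first-loaded, end; andr rounds that down to a character boundary so
    // that lsrv aligns the differing characters of tmp1 and tmp2 at bit 0.)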
5337     bind(DIFFERENCE);
5338     rev(rscratch2, rscratch2);
5339     clz(rscratch2, rscratch2);
5340     andr(rscratch2, rscratch2, isLL ? -8 : -16);
5341     lsrv(tmp1, tmp1, rscratch2);
5342     (this->*ext_chr)(tmp1, tmp1);
5343     lsrv(tmp2, tmp2, rscratch2);
5344     (this->*ext_chr)(tmp2, tmp2);
5345     subw(result, tmp1, tmp2);
5346     b(DONE);
5347   }
5348 
5349   bind(STUB);
5350     RuntimeAddress stub = NULL;
5351     switch(ae) {
5352       case StrIntrinsicNode::LL:
5353         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LL());
5354         break;
5355       case StrIntrinsicNode::UU:
5356         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UU());
5357         break;
5358       case StrIntrinsicNode::LU:
5359         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_LU());
5360         break;
5361       case StrIntrinsicNode::UL:
5362         stub = RuntimeAddress(StubRoutines::aarch64::compare_long_string_UL());
5363         break;
5364       default:
5365         ShouldNotReachHere();
5366      }
5367     assert(stub.target() != NULL, "compare_long_string stub has not been generated");
5368     trampoline_call(stub);
5369     b(DONE);
5370 
5371   bind(SHORT_STRING);
5372   // Is the minimum length zero?
5373   cbz(cnt2, DONE);
  // Arrange the code so that most branches are issued while loads are in
  // flight, and the next characters are loaded while the previous ones are compared.
5376   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5377   subs(cnt2, cnt2, 1);
5378   br(EQ, SHORT_LAST_INIT);
5379   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5380   b(SHORT_LOOP_START);
5381   bind(SHORT_LOOP);
5382   subs(cnt2, cnt2, 1);
5383   br(EQ, SHORT_LAST);
5384   bind(SHORT_LOOP_START);
5385   (this->*str1_load_chr)(tmp2, Address(post(str1, str1_chr_size)));
5386   (this->*str2_load_chr)(rscratch1, Address(post(str2, str2_chr_size)));
5387   cmp(tmp1, cnt1);
5388   br(NE, SHORT_LOOP_TAIL);
5389   subs(cnt2, cnt2, 1);
5390   br(EQ, SHORT_LAST2);
5391   (this->*str1_load_chr)(tmp1, Address(post(str1, str1_chr_size)));
5392   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5393   cmp(tmp2, rscratch1);
5394   br(EQ, SHORT_LOOP);
5395   sub(result, tmp2, rscratch1);
5396   b(DONE);
5397   bind(SHORT_LOOP_TAIL);
5398   sub(result, tmp1, cnt1);
5399   b(DONE);
5400   bind(SHORT_LAST2);
5401   cmp(tmp2, rscratch1);
5402   br(EQ, DONE);
5403   sub(result, tmp2, rscratch1);
5404 
5405   b(DONE);
5406   bind(SHORT_LAST_INIT);
5407   (this->*str2_load_chr)(cnt1, Address(post(str2, str2_chr_size)));
5408   bind(SHORT_LAST);
5409   cmp(tmp1, cnt1);
5410   br(EQ, DONE);
5411   sub(result, tmp1, cnt1);
5412 
5413   bind(DONE);
5414 
5415   BLOCK_COMMENT("} string_compare");
5416 }
5417 #endif // COMPILER2
5418 
// This method checks whether the provided byte array contains a byte with the
// highest bit set (i.e. a negative byte).
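//
// Conceptually, per 8-byte word, the test is simply (a sketch):
//
//   bool word_has_negative(uint64_t w) {
//     return (w & 0x8080808080808080ULL) != 0;  // any byte sign bit set?
//   }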
5420 void MacroAssembler::has_negatives(Register ary1, Register len, Register result) {
    // The simple and most common case, a small aligned array that is not at
    // the end of a memory page, is handled here. All other cases are in stubs.
5423     Label LOOP, END, STUB, STUB_LONG, SET_RESULT, DONE;
5424     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
5425     assert_different_registers(ary1, len, result);
5426 
5427     cmpw(len, 0);
5428     br(LE, SET_RESULT);
5429     cmpw(len, 4 * wordSize);
    br(GE, STUB_LONG); // if size >= 32, go to the long stub
5431 
5432     int shift = 64 - exact_log2(os::vm_page_size());
5433     lsl(rscratch1, ary1, shift);
5434     mov(rscratch2, (size_t)(4 * wordSize) << shift);
5435     adds(rscratch2, rscratch1, rscratch2);  // At end of page?
    br(CS, STUB); // if near the end of a page, go to the stub
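    // The shift/adds trick above works as a page-crossing check: shifting the
    // address left by (64 - log2(page_size)) leaves only the in-page offset
    // in the topmost bits, so the adds sets the carry iff offset + 32 >=
    // page_size, i.e. a 32-byte read starting here might touch the next page.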
5437     subs(len, len, wordSize);
5438     br(LT, END);
5439 
5440   BIND(LOOP);
5441     ldr(rscratch1, Address(post(ary1, wordSize)));
5442     tst(rscratch1, UPPER_BIT_MASK);
5443     br(NE, SET_RESULT);
5444     subs(len, len, wordSize);
5445     br(GE, LOOP);
5446     cmpw(len, -wordSize);
5447     br(EQ, SET_RESULT);
5448 
5449   BIND(END);
5450     ldr(result, Address(ary1));
5451     sub(len, zr, len, LSL, 3); // LSL 3 is to get bits from bytes
5452     lslv(result, result, len);
5453     tst(result, UPPER_BIT_MASK);
5454     b(SET_RESULT);
5455 
5456   BIND(STUB);
    RuntimeAddress has_neg = RuntimeAddress(StubRoutines::aarch64::has_negatives());
    assert(has_neg.target() != NULL, "has_negatives stub has not been generated");
5459     trampoline_call(has_neg);
5460     b(DONE);
5461 
5462   BIND(STUB_LONG);
    RuntimeAddress has_neg_long = RuntimeAddress(
            StubRoutines::aarch64::has_negatives_long());
    assert(has_neg_long.target() != NULL, "has_negatives_long stub has not been generated");
5466     trampoline_call(has_neg_long);
5467     b(DONE);
5468 
5469   BIND(SET_RESULT);
5470     cset(result, NE); // set true or false
5471 
5472   BIND(DONE);
5473 }
5474 
5475 void MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3,
5476                                    Register tmp4, Register tmp5, Register result,
5477                                    Register cnt1, int elem_size) {
5478   Label DONE, SAME;
5479   Register tmp1 = rscratch1;
5480   Register tmp2 = rscratch2;
5481   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5482   int elem_per_word = wordSize/elem_size;
5483   int log_elem_size = exact_log2(elem_size);
5484   int length_offset = arrayOopDesc::length_offset_in_bytes();
5485   int base_offset
5486     = arrayOopDesc::base_offset_in_bytes(elem_size == 2 ? T_CHAR : T_BYTE);
5487   int stubBytesThreshold = 3 * 64 + (UseSIMDForArrayEquals ? 0 : 16);
5488 
5489   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5490   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5491 
5492 #ifndef PRODUCT
5493   {
5494     const char kind = (elem_size == 2) ? 'U' : 'L';
5495     char comment[64];
5496     snprintf(comment, sizeof comment, "array_equals%c{", kind);
5497     BLOCK_COMMENT(comment);
5498   }
5499 #endif
5500 
5501   // if (a1 == a2)
5502   //     return true;
5503   cmpoop(a1, a2); // May have read barriers for a1 and a2.
5504   br(EQ, SAME);
5505 
5506   if (UseSimpleArrayEquals) {
5507     Label NEXT_WORD, SHORT, TAIL03, TAIL01, A_MIGHT_BE_NULL, A_IS_NOT_NULL;
5508     // if (a1 == null || a2 == null)
5509     //     return false;
    // a1 & a2 == 0 means (some pointer is null) or
    // (very rare or even probably impossible pointer values),
    // so we can save one branch in most cases
5513     tst(a1, a2);
5514     mov(result, false);
5515     br(EQ, A_MIGHT_BE_NULL);
5516     // if (a1.length != a2.length)
5517     //      return false;
5518     bind(A_IS_NOT_NULL);
5519     ldrw(cnt1, Address(a1, length_offset));
5520     ldrw(cnt2, Address(a2, length_offset));
5521     eorw(tmp5, cnt1, cnt2);
5522     cbnzw(tmp5, DONE);
5523     lea(a1, Address(a1, base_offset));
5524     lea(a2, Address(a2, base_offset));
5525     // Check for short strings, i.e. smaller than wordSize.
5526     subs(cnt1, cnt1, elem_per_word);
5527     br(Assembler::LT, SHORT);
5528     // Main 8 byte comparison loop.
5529     bind(NEXT_WORD); {
5530       ldr(tmp1, Address(post(a1, wordSize)));
5531       ldr(tmp2, Address(post(a2, wordSize)));
5532       subs(cnt1, cnt1, elem_per_word);
5533       eor(tmp5, tmp1, tmp2);
5534       cbnz(tmp5, DONE);
5535     } br(GT, NEXT_WORD);
5536     // Last longword.  In the case where length == 4 we compare the
5537     // same longword twice, but that's still faster than another
5538     // conditional branch.
5539     // cnt1 could be 0, -1, -2, -3, -4 for chars; -4 only happens when
5540     // length == 4.
5541     if (log_elem_size > 0)
5542       lsl(cnt1, cnt1, log_elem_size);
5543     ldr(tmp3, Address(a1, cnt1));
5544     ldr(tmp4, Address(a2, cnt1));
5545     eor(tmp5, tmp3, tmp4);
5546     cbnz(tmp5, DONE);
5547     b(SAME);
5548     bind(A_MIGHT_BE_NULL);
    // if both a1 and a2 are non-null, proceed with loads
5550     cbz(a1, DONE);
5551     cbz(a2, DONE);
5552     b(A_IS_NOT_NULL);
5553     bind(SHORT);
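    // 0-7 bytes remain. The low bits of cnt1 still equal those of the
    // original element count (the earlier subtraction only changed higher
    // bits), so the tbz tests below select 4-, 2- and (for byte arrays)
    // 1-byte tail compares as needed.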
5554 
5555     tbz(cnt1, 2 - log_elem_size, TAIL03); // 0-7 bytes left.
5556     {
5557       ldrw(tmp1, Address(post(a1, 4)));
5558       ldrw(tmp2, Address(post(a2, 4)));
5559       eorw(tmp5, tmp1, tmp2);
5560       cbnzw(tmp5, DONE);
5561     }
5562     bind(TAIL03);
5563     tbz(cnt1, 1 - log_elem_size, TAIL01); // 0-3 bytes left.
5564     {
5565       ldrh(tmp3, Address(post(a1, 2)));
5566       ldrh(tmp4, Address(post(a2, 2)));
5567       eorw(tmp5, tmp3, tmp4);
5568       cbnzw(tmp5, DONE);
5569     }
5570     bind(TAIL01);
5571     if (elem_size == 1) { // Only needed when comparing byte arrays.
5572       tbz(cnt1, 0, SAME); // 0-1 bytes left.
5573       {
5574         ldrb(tmp1, a1);
5575         ldrb(tmp2, a2);
5576         eorw(tmp5, tmp1, tmp2);
5577         cbnzw(tmp5, DONE);
5578       }
5579     }
5580   } else {
5581     Label NEXT_DWORD, SHORT, TAIL, TAIL2, STUB, EARLY_OUT,
5582         CSET_EQ, LAST_CHECK;
5583     mov(result, false);
5584     cbz(a1, DONE);
5585     ldrw(cnt1, Address(a1, length_offset));
5586     cbz(a2, DONE);
5587     ldrw(cnt2, Address(a2, length_offset));
5588     // on most CPUs the ldrw of a2's length is (surprisingly) still in
5589     // flight here, so it is faster to branch on cnt1 before comparing lengths
5590     cmp(cnt1, (u1)elem_per_word);
5591     br(LE, SHORT); // short or same
5592     ldr(tmp3, Address(pre(a1, base_offset)));
5593     subs(zr, cnt1, stubBytesThreshold);
5594     br(GE, STUB);
5595     ldr(tmp4, Address(pre(a2, base_offset)));
5596     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
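         // tmp5 = -(cnt1 << (3 + log_elem_size)), i.e. minus the array
         // length in bits.  AArch64 variable shifts use only the low 6
         // bits of the shift register, so the lslv below shifts left by
         // (64 - valid_bits) % 64, discarding the bytes of the last
         // loaded word that lie beyond the end of the data.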
5597     cmp(cnt2, cnt1);
5598     br(NE, DONE);
5599 
5600     // Main 16 byte comparison loop with 2 exits
5601     bind(NEXT_DWORD); {
5602       ldr(tmp1, Address(pre(a1, wordSize)));
5603       ldr(tmp2, Address(pre(a2, wordSize)));
5604       subs(cnt1, cnt1, 2 * elem_per_word);
5605       br(LE, TAIL);
5606       eor(tmp4, tmp3, tmp4);
5607       cbnz(tmp4, DONE);
5608       ldr(tmp3, Address(pre(a1, wordSize)));
5609       ldr(tmp4, Address(pre(a2, wordSize)));
5610       cmp(cnt1, (u1)elem_per_word);
5611       br(LE, TAIL2);
5612       cmp(tmp1, tmp2);
5613     } br(EQ, NEXT_DWORD);
5614     b(DONE);
5615 
5616     bind(TAIL);
5617     eor(tmp4, tmp3, tmp4);
5618     eor(tmp2, tmp1, tmp2);
5619     lslv(tmp2, tmp2, tmp5);
5620     orr(tmp5, tmp4, tmp2);
5621     cmp(tmp5, zr);
5622     b(CSET_EQ);
5623 
5624     bind(TAIL2);
5625     eor(tmp2, tmp1, tmp2);
5626     cbnz(tmp2, DONE);
5627     b(LAST_CHECK);
5628 
5629     bind(STUB);
5630     ldr(tmp4, Address(pre(a2, base_offset)));
5631     cmp(cnt2, cnt1);
5632     br(NE, DONE);
5633     if (elem_size == 2) { // convert to byte counter
5634       lsl(cnt1, cnt1, 1);
5635     }
5636     eor(tmp5, tmp3, tmp4);
5637     cbnz(tmp5, DONE);
5638     RuntimeAddress stub = RuntimeAddress(StubRoutines::aarch64::large_array_equals());
5639     assert(stub.target() != NULL, "array_equals_long stub has not been generated");
5640     trampoline_call(stub);
5641     b(DONE);
5642 
5643     bind(EARLY_OUT);
5644     // Here (a1 != null && a2 == null) || (a1 != null && a2 != null && a1 == a2):
5645     // a2 == null must yield false (0), anything else true, so just return a2.
5646     mov(result, a2);
5647     b(DONE);
5648     bind(SHORT);
5649     cmp(cnt2, cnt1);
5650     br(NE, DONE);
5651     cbz(cnt1, SAME);
5652     sub(tmp5, zr, cnt1, LSL, 3 + log_elem_size);
5653     ldr(tmp3, Address(a1, base_offset));
5654     ldr(tmp4, Address(a2, base_offset));
5655     bind(LAST_CHECK);
5656     eor(tmp4, tmp3, tmp4);
5657     lslv(tmp5, tmp4, tmp5);
5658     cmp(tmp5, zr);
5659     bind(CSET_EQ);
5660     cset(result, EQ);
5661     b(DONE);
5662   }
5663 
5664   bind(SAME);
5665   mov(result, true);
5666   // That's it.
5667   bind(DONE);
5668 
5669   BLOCK_COMMENT("} array_equals");
5670 }
5671 
5672 // Compare Strings
5673 
5674 // For Strings we're passed the address of the first characters in a1
5675 // and a2 and the length in bytes in cnt1.
5676 // elem_size is the element size in bytes: either 1 or 2.
5677 // There are two implementations.  For strings >= 8 bytes, all
5678 // comparisons (including the final one, which may overlap) are
5679 // performed 8 bytes at a time.  For strings < 8 bytes, we compare a
5680 // word, then a halfword, and then a byte.
5681 
5682 void MacroAssembler::string_equals(Register a1, Register a2,
5683                                    Register result, Register cnt1, int elem_size)
5684 {
5685   Label SAME, DONE, SHORT, NEXT_WORD;
5686   Register tmp1 = rscratch1;
5687   Register tmp2 = rscratch2;
5688   Register cnt2 = tmp2;  // cnt2 only used in array length compare
5689 
5690   assert(elem_size == 1 || elem_size == 2, "must be char or byte");
5691   assert_different_registers(a1, a2, result, cnt1, rscratch1, rscratch2);
5692 
5693 #ifndef PRODUCT
5694   {
5695     const char kind = (elem_size == 2) ? 'U' : 'L';
5696     char comment[64];
5697     snprintf(comment, sizeof comment, "{string_equals%c", kind);
5698     BLOCK_COMMENT(comment);
5699   }
5700 #endif
5701 
5702   mov(result, false);
5703 
5704   // Check for short strings, i.e. smaller than wordSize.
5705   subs(cnt1, cnt1, wordSize);
5706   br(Assembler::LT, SHORT);
5707   // Main 8 byte comparison loop.
5708   bind(NEXT_WORD); {
5709     ldr(tmp1, Address(post(a1, wordSize)));
5710     ldr(tmp2, Address(post(a2, wordSize)));
5711     subs(cnt1, cnt1, wordSize);
5712     eor(tmp1, tmp1, tmp2);
5713     cbnz(tmp1, DONE);
5714   } br(GT, NEXT_WORD);
5715   // Last longword.  In the case where length == 8 we compare the
5716   // same longword twice, but that's still faster than another
5717   // conditional branch.
5718   // cnt1 could be 0 through -7; -8 happens only when length == 8,
5719   // in which case the loop ran once and we re-compare its longword.
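       // Roughly (a sketch): a1 + cnt1 always points exactly 8 bytes
       // before the end of the data, so
       //   last1 = *(uint64_t*)(a1 + cnt1);
       //   last2 = *(uint64_t*)(a2 + cnt1);
       //   if (last1 != last2) goto DONE;
       // compares the final 8 bytes, possibly re-reading bytes the main
       // loop already compared.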
5720   ldr(tmp1, Address(a1, cnt1));
5721   ldr(tmp2, Address(a2, cnt1));
5722   eor(tmp2, tmp1, tmp2);
5723   cbnz(tmp2, DONE);
5724   b(SAME);
5725 
5726   bind(SHORT);
5727   Label TAIL03, TAIL01;
5728 
5729   tbz(cnt1, 2, TAIL03); // 0-7 bytes left.
5730   {
5731     ldrw(tmp1, Address(post(a1, 4)));
5732     ldrw(tmp2, Address(post(a2, 4)));
5733     eorw(tmp1, tmp1, tmp2);
5734     cbnzw(tmp1, DONE);
5735   }
5736   bind(TAIL03);
5737   tbz(cnt1, 1, TAIL01); // 0-3 bytes left.
5738   {
5739     ldrh(tmp1, Address(post(a1, 2)));
5740     ldrh(tmp2, Address(post(a2, 2)));
5741     eorw(tmp1, tmp1, tmp2);
5742     cbnzw(tmp1, DONE);
5743   }
5744   bind(TAIL01);
5745   if (elem_size == 1) { // Only needed when comparing 1-byte elements
5746     tbz(cnt1, 0, SAME); // 0-1 bytes left.
5747     {
5748       ldrb(tmp1, a1);
5749       ldrb(tmp2, a2);
5750       eorw(tmp1, tmp1, tmp2);
5751       cbnzw(tmp1, DONE);
5752     }
5753   }
5754   // Arrays are equal.
5755   bind(SAME);
5756   mov(result, true);
5757 
5758   // That's it.
5759   bind(DONE);
5760   BLOCK_COMMENT("} string_equals");
5761 }
5762 
5763 
5764 // The size of the blocks zeroed by the zero_blocks stub.  We must
5765 // handle anything smaller than this ourselves in zero_words().
5766 const int MacroAssembler::zero_words_block_size = 8;
5767 
5768 // zero_words() is used by C2 ClearArray patterns.  It is as small as
5769 // possible, handling small word counts locally and delegating
5770 // anything larger to the zero_blocks stub.  It is expanded many times
5771 // in compiled code, so it is important to keep it short.
5772 
5773 // ptr:   Address of a buffer to be zeroed.
5774 // cnt:   Count in HeapWords.
5775 //
5776 // ptr, cnt, rscratch1, and rscratch2 are clobbered.
5777 void MacroAssembler::zero_words(Register ptr, Register cnt)
5778 {
5779   assert(is_power_of_2(zero_words_block_size), "adjust this");
5780   assert(ptr == r10 && cnt == r11, "mismatch in register usage");
5781 
5782   BLOCK_COMMENT("zero_words {");
5783   cmp(cnt, (u1)zero_words_block_size);
5784   Label around;
5785   br(LO, around);
5786   {
5787     RuntimeAddress zero_blocks =  RuntimeAddress(StubRoutines::aarch64::zero_blocks());
5788     assert(zero_blocks.target() != NULL, "zero_blocks stub has not been generated");
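         // While the stubs themselves are still being generated, zero_blocks
         // is presumably near enough for a direct branch; once stub
         // generation is complete we use a trampoline call, which can reach
         // the stub from anywhere in the code cache.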
5789     if (StubRoutines::aarch64::complete()) {
5790       trampoline_call(zero_blocks);
5791     } else {
5792       bl(zero_blocks);
5793     }
5794   }
5795   bind(around);
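       // Zero the remaining 0..7 words by testing bits of cnt.
       // Roughly (a sketch):
       //   if (cnt & 4) { p[0] = p[1] = p[2] = p[3] = 0; p += 4; }
       //   if (cnt & 2) { p[0] = p[1] = 0; p += 2; }
       //   if (cnt & 1) { p[0] = 0; }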
5796   for (int i = zero_words_block_size >> 1; i > 1; i >>= 1) {
5797     Label l;
5798     tbz(cnt, exact_log2(i), l);
5799     for (int j = 0; j < i; j += 2) {
5800       stp(zr, zr, post(ptr, 16));
5801     }
5802     bind(l);
5803   }
5804   {
5805     Label l;
5806     tbz(cnt, 0, l);
5807     str(zr, Address(ptr));
5808     bind(l);
5809   }
5810   BLOCK_COMMENT("} zero_words");
5811 }
5812 
5813 // base:         Address of a buffer to be zeroed, 8 bytes aligned.
5814 // cnt:          Immediate count in HeapWords.
5815 #define SmallArraySize (18 * BytesPerLong)
5816 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
5817 {
5818   BLOCK_COMMENT("zero_words {");
5819   int i = cnt & 1;  // store any odd word to start
5820   if (i) str(zr, Address(base));
5821 
5822   if (cnt <= SmallArraySize / BytesPerLong) {
5823     for (; i < (int)cnt; i += 2)
5824       stp(zr, zr, Address(base, i * wordSize));
5825   } else {
5826     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
5827     int remainder = cnt % (2 * unroll);
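         // Peel off any words beyond a multiple of 2 * unroll, then zero
         // eight words (four stps) per iteration.  Roughly (a sketch):
         //   while (cnt != 0) { p[0] = ... = p[7] = 0; p += 8; cnt -= 8; }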
5828     for (; i < remainder; i += 2)
5829       stp(zr, zr, Address(base, i * wordSize));
5830 
5831     Label loop;
5832     Register cnt_reg = rscratch1;
5833     Register loop_base = rscratch2;
5834     cnt = cnt - remainder;
5835     mov(cnt_reg, cnt);
5836     // adjust base and prebias by -2 * wordSize so we can pre-increment
5837     add(loop_base, base, (remainder - 2) * wordSize);
5838     bind(loop);
5839     sub(cnt_reg, cnt_reg, 2 * unroll);
5840     for (i = 1; i < unroll; i++)
5841       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
5842     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
5843     cbnz(cnt_reg, loop);
5844   }
5845   BLOCK_COMMENT("} zero_words");
5846 }
5847 
5848 // Zero blocks of memory by using DC ZVA.
5849 //
5850 // Aligns the base address first sufficiently for DC ZVA, then uses
5851 // DC ZVA repeatedly for every full block.  cnt is the size to be
5852 // zeroed in HeapWords.  Returns the count of words left to be zeroed
5853 // in cnt.
5854 //
5855 // NOTE: This is intended to be used in the zero_blocks() stub.  If
5856 // you want to use it elsewhere, note that cnt must be >= 2*zva_length.
5857 void MacroAssembler::zero_dcache_blocks(Register base, Register cnt) {
5858   Register tmp = rscratch1;
5859   Register tmp2 = rscratch2;
5860   int zva_length = VM_Version::zva_length();
5861   Label initial_table_end, loop_zva;
5862   Label fini;
5863 
5864   // Base must be 16-byte aligned.  If not, just return and let the caller handle it.
5865   tst(base, 0x0f);
5866   br(Assembler::NE, fini);
5867   // Align base with ZVA length.
5868   neg(tmp, base);
5869   andr(tmp, tmp, zva_length - 1);
5870 
5871   // tmp: the number of bytes to be filled to align the base with ZVA length.
5872   add(base, base, tmp);
5873   sub(cnt, cnt, tmp, Assembler::ASR, 3);
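       // Branch into the middle of the stp table below: each stp zeroes
       // 16 bytes with one 4-byte instruction, so to pre-fill tmp bytes
       // we enter (tmp / 16) instructions, i.e. tmp / 4 bytes, before
       // initial_table_end.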
5874   adr(tmp2, initial_table_end);
5875   sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
5876   br(tmp2);
5877 
5878   for (int i = -zva_length + 16; i < 0; i += 16)
5879     stp(zr, zr, Address(base, i));
5880   bind(initial_table_end);
5881 
5882   sub(cnt, cnt, zva_length >> 3);
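       // Main ZVA loop, roughly (a sketch, with zva_words = zva_length / 8;
       // cnt was just biased down by zva_words):
       //   do { dc_zva(base); cnt -= zva_words; base += zva_length; }
       //   while (cnt >= 0);
       //   cnt += zva_words;  // the words DC ZVA did not cover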
5883   bind(loop_zva);
5884   dc(Assembler::ZVA, base);
5885   subs(cnt, cnt, zva_length >> 3);
5886   add(base, base, zva_length);
5887   br(Assembler::GE, loop_zva);
5888   add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
5889   bind(fini);
5890 }
5891 
5892 // base:   Address of a buffer to be filled, 8 bytes aligned.
5893 // cnt:    Count in 8-byte units.
5894 // value:  Value to fill the buffer with.
5895 // base will point to the end of the buffer after filling.
5896 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
5897 {
5898 //  Algorithm:
5899 //
5900 //    scratch1 = cnt & 7;
5901 //    cnt -= scratch1;
5902 //    p += scratch1;
5903 //    switch (scratch1) {
5904 //      do {
5905 //        cnt -= 8;
5906 //          p[-8] = v;
5907 //        case 7:
5908 //          p[-7] = v;
5909 //        case 6:
5910 //          p[-6] = v;
5911 //          // ...
5912 //        case 1:
5913 //          p[-1] = v;
5914 //        case 0:
5915 //          p += 8;
5916 //      } while (cnt);
5917 //    }
5918 
5919   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
5920 
5921   Label fini, skip, entry, loop;
5922   const int unroll = 8; // Number of stp instructions we'll unroll
5923 
5924   cbz(cnt, fini);
5925   tbz(base, 3, skip);
5926   str(value, Address(post(base, 8)));
5927   sub(cnt, cnt, 1);
5928   bind(skip);
5929 
5930   andr(rscratch1, cnt, (unroll-1) * 2);
5931   sub(cnt, cnt, rscratch1);
5932   add(base, base, rscratch1, Assembler::LSL, 3);
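       // Computed entry into the unrolled loop: rscratch1 holds the number
       // of remainder words (always even), and each stp fills two words
       // with one 4-byte instruction, so we enter (rscratch1 / 2) * 4 =
       // rscratch1 << 1 code bytes before the entry label.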
5933   adr(rscratch2, entry);
5934   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
5935   br(rscratch2);
5936 
5937   bind(loop);
5938   add(base, base, unroll * 16);
5939   for (int i = -unroll; i < 0; i++)
5940     stp(value, value, Address(base, i * 16));
5941   bind(entry);
5942   subs(cnt, cnt, unroll * 2);
5943   br(Assembler::GE, loop);
5944 
5945   tbz(cnt, 0, fini);
5946   str(value, Address(post(base, 8)));
5947   bind(fini);
5948 }
5949 
5950 // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and
5951 // java/lang/StringUTF16.compress.
5952 void MacroAssembler::encode_iso_array(Register src, Register dst,
5953                       Register len, Register result,
5954                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5955                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5956 {
5957     Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1,
5958         NEXT_32_START, NEXT_32_PRFM_START;
5959     Register tmp1 = rscratch1, tmp2 = rscratch2;
5960 
5961       mov(result, len); // Save initial len
5962 
5963 #ifndef BUILTIN_SIM
5964       cmp(len, (u1)8); // handle shortest strings first
5965       br(LT, LOOP_1);
5966       cmp(len, (u1)32);
5967       br(LT, NEXT_8);
5968       // The following code uses the SIMD 'uzp1' and 'uzp2' instructions
5969       // to convert chars to bytes
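           // Roughly, per 32-char block (a sketch):
           //   lo = uzp1(chars)  // even-indexed bytes: the Latin-1 values
           //   hi = uzp2(chars)  // odd-indexed bytes: the high halves
           //   if (hi != 0) fall back to the 8-char, then 1-char, loops
           //   store lo; src += 64; dst += 32; len -= 32;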
5970       if (SoftwarePrefetchHintDistance >= 0) {
5971         ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5972         subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5973         br(LE, NEXT_32_START);
5974         b(NEXT_32_PRFM_START);
5975         BIND(NEXT_32_PRFM);
5976           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5977         BIND(NEXT_32_PRFM_START);
5978           prfm(Address(src, SoftwarePrefetchHintDistance));
5979           orr(v4, T16B, Vtmp1, Vtmp2);
5980           orr(v5, T16B, Vtmp3, Vtmp4);
5981           uzp1(Vtmp1, T16B, Vtmp1, Vtmp2);
5982           uzp1(Vtmp3, T16B, Vtmp3, Vtmp4);
5983           uzp2(v5, T16B, v4, v5); // high bytes
5984           umov(tmp2, v5, D, 1);
5985           fmovd(tmp1, v5);
5986           orr(tmp1, tmp1, tmp2);
5987           cbnz(tmp1, LOOP_8);
5988           stpq(Vtmp1, Vtmp3, dst);
5989           sub(len, len, 32);
5990           add(dst, dst, 32);
5991           add(src, src, 64);
5992           subs(tmp2, len, SoftwarePrefetchHintDistance/2 + 16);
5993           br(GE, NEXT_32_PRFM);
5994           cmp(len, (u1)32);
5995           br(LT, LOOP_8);
5996         BIND(NEXT_32);
5997           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5998         BIND(NEXT_32_START);
5999       } else {
6000         BIND(NEXT_32);
6001           ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
6002       }
6003       prfm(Address(src, SoftwarePrefetchHintDistance));
6004       uzp1(v4, T16B, Vtmp1, Vtmp2);
6005       uzp1(v5, T16B, Vtmp3, Vtmp4);
6006       orr(Vtmp1, T16B, Vtmp1, Vtmp2);
6007       orr(Vtmp3, T16B, Vtmp3, Vtmp4);
6008       uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes
6009       umov(tmp2, Vtmp1, D, 1);
6010       fmovd(tmp1, Vtmp1);
6011       orr(tmp1, tmp1, tmp2);
6012       cbnz(tmp1, LOOP_8);
6013       stpq(v4, v5, dst);
6014       sub(len, len, 32);
6015       add(dst, dst, 32);
6016       add(src, src, 64);
6017       cmp(len, (u1)32);
6018       br(GE, NEXT_32);
6019       cbz(len, DONE);
6020 
6021     BIND(LOOP_8);
6022       cmp(len, (u1)8);
6023       br(LT, LOOP_1);
6024     BIND(NEXT_8);
6025       ld1(Vtmp1, T8H, src);
6026       uzp1(Vtmp2, T16B, Vtmp1, Vtmp1); // low bytes
6027       uzp2(Vtmp3, T16B, Vtmp1, Vtmp1); // high bytes
6028       fmovd(tmp1, Vtmp3);
6029       cbnz(tmp1, NEXT_1);
6030       strd(Vtmp2, dst);
6031 
6032       sub(len, len, 8);
6033       add(dst, dst, 8);
6034       add(src, src, 16);
6035       cmp(len, (u1)8);
6036       br(GE, NEXT_8);
6037 
6038     BIND(LOOP_1);
6039 #endif
6040     cbz(len, DONE);
6041     BIND(NEXT_1);
6042       ldrh(tmp1, Address(post(src, 2)));
6043       tst(tmp1, 0xff00);
6044       br(NE, SET_RESULT);
6045       strb(tmp1, Address(post(dst, 1)));
6046       subs(len, len, 1);
6047       br(GT, NEXT_1);
6048 
6049     BIND(SET_RESULT);
6050       sub(result, result, len); // Return the index where we stopped;
6051                                 // len == 0 means we processed all
6052                                 // characters.
6053     BIND(DONE);
6054 }
6055 
6056 
6057 // Inflate byte[] array to char[].
6058 void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
6059                                         FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3,
6060                                         Register tmp4) {
6061   Label big, done, after_init, to_stub;
6062 
6063   assert_different_registers(src, dst, len, tmp4, rscratch1);
6064 
6065   fmovd(vtmp1, zr);
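       // vtmp1 stays zero throughout: zip1 interleaves source bytes with
       // these zero bytes, widening each byte to a little-endian 16-bit char.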
6066   lsrw(tmp4, len, 3);
6067   bind(after_init);
6068   cbnzw(tmp4, big);
6069   // Short string: less than 8 bytes.
6070   {
6071     Label loop, tiny;
6072 
6073     cmpw(len, 4);
6074     br(LT, tiny);
6075     // Use SIMD to do 4 bytes.
6076     ldrs(vtmp2, post(src, 4));
6077     zip1(vtmp3, T8B, vtmp2, vtmp1);
6078     subw(len, len, 4);
6079     strd(vtmp3, post(dst, 8));
6080 
6081     cbzw(len, done);
6082 
6083     // Do the remaining bytes one at a time.
6084     bind(loop);
6085     ldrb(tmp4, post(src, 1));
6086     strh(tmp4, post(dst, 2));
6087     subw(len, len, 1);
6088 
6089     bind(tiny);
6090     cbnz(len, loop);
6091 
6092     b(done);
6093   }
6094 
6095   if (SoftwarePrefetchHintDistance >= 0) {
6096     bind(to_stub);
6097       RuntimeAddress stub =  RuntimeAddress(StubRoutines::aarch64::large_byte_array_inflate());
6098       assert(stub.target() != NULL, "large_byte_array_inflate stub has not been generated");
6099       trampoline_call(stub);
6100       b(after_init);
6101   }
6102 
6103   // Unpack the bytes 8 at a time.
6104   bind(big);
6105   {
6106     Label loop, around, loop_last, loop_start;
6107 
6108     if (SoftwarePrefetchHintDistance >= 0) {
6109       const int large_loop_threshold = (64 + 16)/8;
6110       ldrd(vtmp2, post(src, 8));
6111       andw(len, len, 7);
6112       cmp(tmp4, (u1)large_loop_threshold);
6113       br(GE, to_stub);
6114       b(loop_start);
6115 
6116       bind(loop);
6117       ldrd(vtmp2, post(src, 8));
6118       bind(loop_start);
6119       subs(tmp4, tmp4, 1);
6120       br(EQ, loop_last);
6121       zip1(vtmp2, T16B, vtmp2, vtmp1);
6122       ldrd(vtmp3, post(src, 8));
6123       st1(vtmp2, T8H, post(dst, 16));
6124       subs(tmp4, tmp4, 1);
6125       zip1(vtmp3, T16B, vtmp3, vtmp1);
6126       st1(vtmp3, T8H, post(dst, 16));
6127       br(NE, loop);
6128       b(around);
6129       bind(loop_last);
6130       zip1(vtmp2, T16B, vtmp2, vtmp1);
6131       st1(vtmp2, T8H, post(dst, 16));
6132       bind(around);
6133       cbz(len, done);
6134     } else {
6135       andw(len, len, 7);
6136       bind(loop);
6137       ldrd(vtmp2, post(src, 8));
6138       sub(tmp4, tmp4, 1);
6139       zip1(vtmp3, T16B, vtmp2, vtmp1);
6140       st1(vtmp3, T8H, post(dst, 16));
6141       cbnz(tmp4, loop);
6142     }
6143   }
6144 
6145   // Do the tail of up to 8 bytes.
6146   add(src, src, len);
6147   ldrd(vtmp3, Address(src, -8));
6148   add(dst, dst, len, ext::uxtw, 1);
6149   zip1(vtmp3, T16B, vtmp3, vtmp1);
6150   strq(vtmp3, Address(dst, -16));
6151 
6152   bind(done);
6153 }
6154 
6155 // Compress char[] array to byte[].
6156 void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
6157                                          FloatRegister tmp1Reg, FloatRegister tmp2Reg,
6158                                          FloatRegister tmp3Reg, FloatRegister tmp4Reg,
6159                                          Register result) {
6160   encode_iso_array(src, dst, len, result,
6161                    tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg);
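       // encode_iso_array leaves len == 0 iff every char fit in one byte;
       // return the compressed length on success and 0 on failure.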
6162   cmp(len, zr);
6163   csel(result, result, zr, EQ);
6164 }
6165 
6166 // get_thread() can be called anywhere inside generated code so we
6167 // need to save whatever non-callee save context might get clobbered
6168 // by the call to JavaThread::aarch64_get_thread_helper() or, indeed,
6169 // the call setup code.
6170 //
6171 // aarch64_get_thread_helper() clobbers only r0, r1, and flags.
6172 //
6173 void MacroAssembler::get_thread(Register dst) {
6174   RegSet saved_regs = RegSet::range(r0, r1) + lr - dst;
6175   push(saved_regs, sp);
6176 
6177   mov(lr, CAST_FROM_FN_PTR(address, JavaThread::aarch64_get_thread_helper));
6178   blrt(lr, 1, 0, 1);
6179   if (dst != c_rarg0) {
6180     mov(dst, c_rarg0);
6181   }
6182 
6183   pop(saved_regs, sp);
6184 }