/*
 * Copyright (c) 2013, Red Hat Inc.
 * Copyright (c) 1997, 2012, Oracle and/or its affiliates.
 * All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "interpreter/interpreter.hpp"

#include "compiler/disassembler.hpp"
#include "gc_interface/collectedHeap.inline.hpp"
#include "gc_implementation/shenandoah/brooksPointer.hpp"
#include "gc_implementation/shenandoah/shenandoahHeap.hpp"
#include "gc_implementation/shenandoah/shenandoahHeap.inline.hpp"
#include "gc_implementation/shenandoah/shenandoahHeapRegion.hpp"
#include "memory/resourceArea.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/sharedRuntime.hpp"

#if INCLUDE_ALL_GCS
#include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
#include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
#include "gc_implementation/g1/heapRegion.hpp"
#endif

#ifdef COMPILER2
#include "opto/node.hpp"
#include "opto/compile.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
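// (Every AArch64 instruction is 4 bytes, so, for example, patching a
// movz/movk/movk constant load returns
// 3 * NativeInstruction::instruction_size = 12 bytes.)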
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target - branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 4 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 3 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str, add or movk instruction. Otherwise we could accidentally end
      // up treating a type 4 relocation as a type 1, 2 or 3 just because it happened
      // to be followed by a random unrelated ldr/str, add or movk instruction.
      //
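      // For example (hypothetical addresses): patching
      //   0x10000: adrp x3, <target_page>
      //   0x10004: add  x3, x3, #<offset_in_page>
      // to point at target 0x2345678 writes a page delta of
      // 0x2345 - 0x10 = 0x2335 into the adrp and 0x678 into the add's
      // 12-bit immediate field.
      //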
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                    21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
                   Instruction_aarch64::extract(insn, 4, 0) ==
                     Instruction_aarch64::extract(insn2, 4, 0)) {
        // movk #imm16<<32
        Instruction_aarch64::patch(branch + 4, 20, 5, (uint64_t)target >> 32);
        long dest = ((long)target & 0xffffffffL) | ((long)branch & 0xffff00000000L);
        long pc_page = (long)branch >> 12;
        long adr_page = (long)dest >> 12;
        offset = adr_page - pc_page;
        instructions = 2;
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
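  // For example (hypothetical value): the narrow OOP 0x12345678 is encoded
  // as
  //   movz Rd, #0x1234, lsl #16
  //   movk Rd, #0x5678
  // while a wide OOP fills 48 bits with movz/movk/movk as in movptr().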
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = oopDesc::encode_heap_oop((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //       movk    Rx, #imm16<<32
      //   4 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        if (Instruction_aarch64::extract(insn2, 31, 21) == 0b11110010110 &&
               Instruction_aarch64::extract(insn, 4, 0) ==
                 Instruction_aarch64::extract(insn2, 4, 0)) {
          target_page = (target_page & 0xffffffff) |
                         ((uint64_t)Instruction_aarch64::extract(insn2, 20, 5) << 32);
        }
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
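    // For example (hypothetical constant):
    //   movz x8, #0xbeef
    //   movk x8, #0xdead, lsl #16
    //   movk x8, #0x0001, lsl #32
    // decodes back to the 48-bit address 0x1deadbeef.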
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  dsb(Assembler::SY);
}


void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));

  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp & rsp of the last Java frame have to be
// recorded in the (thread-local) JavaThread object. When leaving C land,
// the last Java fp has to be reset to 0. This is required to allow proper
// stack traversal.
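//
// A typical use (a sketch; the exact sequence lives in call_VM_base below):
//   Label retaddr;
//   set_last_Java_frame(last_java_sp, rfp, retaddr, rscratch1);
//   ... call out to C, bind(retaddr) at the return address ...
//   reset_last_Java_frame(true);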
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  if (last_java_pc != NULL) {
    adr(scratch, last_java_pc);
  } else {
    // FIXME: This is almost never correct.  We should delete all
    // cases of set_last_Java_frame with last_java_pc=NULL and use the
    // correct return address instead.
    adr(scratch, pc());
  }

  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb (see the assert above).
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb (see the assert above).
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1, rscratch2, noreg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
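  // (A biased mark word looks roughly like [JavaThread* | epoch | age | 1 | 01],
  // so biased_lock_pattern is 0b101 in the low three bits; this is what the
  // mask-and-compare below tests for.)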
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1, rscratch2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1, rscratch2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1, rscratch2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}


// added to make this compile

REGISTER_DEFINITION(Register, noreg);

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)
  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// enough that direct branches always reach, no trampoline is emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  unsigned int start_offset = offset();
#ifdef COMPILER2
  // We need a trampoline if branches are far.
  if (far_branches()) {
    // We don't want to emit a trampoline if C2 is generating dummy
    // code during its branch shortening phase.
    CompileTask* task = ciEnv::current()->task();
    bool in_scratch_emit_size =
      ((task != NULL) && is_c2_compile(task->comp_level())
       && Compile::current()->in_scratch_emit_size());
    if (!in_scratch_emit_size) {
      address stub = emit_trampoline_stub(start_offset, entry.target());
      if (stub == NULL) {
        return NULL; // CodeCache is full
      }
    }
  }
#endif

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
#ifdef COMPILER2
  if (!far_branches()) {
    bl(entry.target());
  } else {
    bl(pc());
  }
#else
  bl(entry.target());
#endif
  // just need to return a non-null address
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)
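//
// A sketch of the emitted stub (rscratch1 is the scratch register used below):
//   ldr  rscratch1, 1f   // pc-relative load of the 64-bit destination
//   br   rscratch1       // jump; LR still holds the original return address
// 1:
//   .quad <dest>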

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
#ifdef COMPILER2
  address stub = start_a_stub(Compile::MAX_stubs_size/2);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call target into rscratch1
  // - branch to it
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub;
#else
  ShouldNotReachHere();
  return NULL;
#endif
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at the least-significant byte of x,
  //       since C-style booleans are stored in one byte only!
  //       (was a bug)
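  // i.e. x = ((x & 0xff) != 0) ? 1 : 0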
  tst(x, 0xff);
  cset(x, Assembler::NE);
}

address MacroAssembler::ic_call(address entry) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc());
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


void MacroAssembler::notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler::notify(type);
    // reset_last_Java_frame(true);
  } else {
    Assembler::notify(type);
  }
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");

  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size() * wordSize;
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, InstanceKlass::vtable_length_offset() * wordSize));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);
  if (HeapWordsPerLong > 1) {
    // Round up to align_object_offset boundary
    // see code for instanceKlass::start_of_itable!
    round_to(scan_temp, BytesPerLong);
  }

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  if (return_method) {
    // Got a hit.
    ldrw(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp, Address::uxtw(0)));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = InstanceKlass::vtable_start_offset() * wordSize;
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result,
        form_address(rscratch1, recv_klass, vtable_offset_in_bytes, 0));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}

void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    // The super check offset is an unsigned 32-bit field; ldrw zero-extends it.
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr); // load displayed supertype
  cmp(super_klass, rscratch1);

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    cmp(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurrence of value,
// generic
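// On exit the flags are EQ iff value was found; callers that need the
// count == 0 case to read as "not found" must pre-set NE (see the
// cmp(sp, zr) trick in check_klass_subtype_slow_path below).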
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// scans count 4 byte words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                 Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  // Get super_klass value into r0 (even if it was in r5 or r2).
  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

#ifndef PRODUCT
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length (an int; ldrw zero-extends it).
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  mov(r0, reg);
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    lea(r0, addr);
    ldr(r0, Address(r0, 4 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop_addr");
}
1326 
1327 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1328                                          int extra_slot_offset) {
1329   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1330   int stackElementSize = Interpreter::stackElementSize;
1331   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1332 #ifdef ASSERT
1333   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1334   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1335 #endif
1336   if (arg_slot.is_constant()) {
1337     return Address(esp, arg_slot.as_constant() * stackElementSize
1338                    + offset);
1339   } else {
1340     add(rscratch1, esp, arg_slot.as_register(),
1341         ext::uxtx, exact_log2(stackElementSize));
1342     return Address(rscratch1, offset);
1343   }
1344 }
1345 
1346 void MacroAssembler::call_VM_leaf_base(address entry_point,
1347                                        int number_of_arguments,
1348                                        Label *retaddr) {
1349   call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
1350 }
1351 
1352 void MacroAssembler::call_VM_leaf_base1(address entry_point,
1353                                         int number_of_gp_arguments,
1354                                         int number_of_fp_arguments,
1355                                         ret_type type,
1356                                         Label *retaddr) {
1357   Label E, L;
1358 
1359   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1360 
1361   // We add 1 to number_of_arguments because the thread in arg0 is
1362   // not counted
1363   mov(rscratch1, entry_point);
1364   blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1365   if (retaddr)
1366     bind(*retaddr);
1367 
1368   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1369   maybe_isb();
1370 }
1371 
1372 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1373   call_VM_leaf_base(entry_point, number_of_arguments);
1374 }
1375 
1376 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1377   pass_arg0(this, arg_0);
1378   call_VM_leaf_base(entry_point, 1);
1379 }
1380 
1381 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1382   pass_arg0(this, arg_0);
1383   pass_arg1(this, arg_1);
1384   call_VM_leaf_base(entry_point, 2);
1385 }
1386 
1387 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1388                                   Register arg_1, Register arg_2) {
1389   pass_arg0(this, arg_0);
1390   pass_arg1(this, arg_1);
1391   pass_arg2(this, arg_2);
1392   call_VM_leaf_base(entry_point, 3);
1393 }
1394 
1395 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1396   pass_arg0(this, arg_0);
1397   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1398 }
1399 
1400 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1401 
1402   assert(arg_0 != c_rarg1, "smashed arg");
1403   pass_arg1(this, arg_1);
1404   pass_arg0(this, arg_0);
1405   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1406 }
1407 
1408 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1409   assert(arg_0 != c_rarg2, "smashed arg");
1410   assert(arg_1 != c_rarg2, "smashed arg");
1411   pass_arg2(this, arg_2);
1412   assert(arg_0 != c_rarg1, "smashed arg");
1413   pass_arg1(this, arg_1);
1414   pass_arg0(this, arg_0);
1415   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1416 }
1417 
1418 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1419   assert(arg_0 != c_rarg3, "smashed arg");
1420   assert(arg_1 != c_rarg3, "smashed arg");
1421   assert(arg_2 != c_rarg3, "smashed arg");
1422   pass_arg3(this, arg_3);
1423   assert(arg_0 != c_rarg2, "smashed arg");
1424   assert(arg_1 != c_rarg2, "smashed arg");
1425   pass_arg2(this, arg_2);
1426   assert(arg_0 != c_rarg1, "smashed arg");
1427   pass_arg1(this, arg_1);
1428   pass_arg0(this, arg_0);
1429   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1430 }
1431 
1432 void MacroAssembler::null_check(Register reg, int offset) {
1433   if (needs_explicit_null_check(offset)) {
    // provoke OS NULL exception if reg == NULL by
    // accessing M[reg] without changing any registers
    // NOTE: this is plenty to provoke a segv
1437 
1438     ldr(zr, Address(reg));
1439   } else {
1440     // nothing to do, (later) access of M[reg + offset]
1441     // will provoke OS NULL exception if reg = NULL
1442   }
1443 }
1444 
1445 // MacroAssembler protected routines needed to implement
1446 // public methods
1447 
1448 void MacroAssembler::mov(Register r, Address dest) {
1449   code_section()->relocate(pc(), dest.rspec());
1450   u_int64_t imm64 = (u_int64_t)dest.target();
1451   movptr(r, imm64);
1452 }
1453 
1454 // Move a constant pointer into r.  In AArch64 mode the virtual
1455 // address space is 48 bits in size, so we only need three
1456 // instructions to create a patchable instruction sequence that can
1457 // reach anywhere.
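// For illustration, movptr(r0, 0x123456789ab) emits
//   movz r0, #0x89ab
//   movk r0, #0x4567, lsl #16
//   movk r0, #0x0123, lsl #32
// and always uses all three instructions, so the sequence stays
// patchable to any other 48-bit target.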
1458 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1459 #ifndef PRODUCT
1460   {
1461     char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1463     block_comment(buffer);
1464   }
1465 #endif
1466   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1467   movz(r, imm64 & 0xffff);
1468   imm64 >>= 16;
1469   movk(r, imm64 & 0xffff, 16);
1470   imm64 >>= 16;
1471   movk(r, imm64 & 0xffff, 32);
1472 }
1473 
1474 // Macro to mov replicated immediate to vector register.
1475 //  Vd will get the following values for different arrangements in T
1476 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1477 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1478 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1479 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1480 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1481 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1482 //   T1D/T2D: invalid
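// For example (illustrative), mov(v0, T4H, 0xff00) emits the single
// instruction
//   movi v0.4h, #0xff, lsl #8
// while mov(v0, T4S, 0x00ff00ff) expands to
//   movi v0.4s, #0xff
//   orr  v0.4s, #0xff, lsl #16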
1483 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1484   assert(T != T1D && T != T2D, "invalid arrangement");
1485   if (T == T8B || T == T16B) {
1486     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1487     movi(Vd, T, imm32 & 0xff, 0);
1488     return;
1489   }
1490   u_int32_t nimm32 = ~imm32;
1491   if (T == T4H || T == T8H) {
1492     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1493     imm32 &= 0xffff;
1494     nimm32 &= 0xffff;
1495   }
1496   u_int32_t x = imm32;
1497   int movi_cnt = 0;
1498   int movn_cnt = 0;
1499   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1500   x = nimm32;
1501   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1502   if (movn_cnt < movi_cnt) imm32 = nimm32;
1503   unsigned lsl = 0;
1504   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1505   if (movn_cnt < movi_cnt)
1506     mvni(Vd, T, imm32 & 0xff, lsl);
1507   else
1508     movi(Vd, T, imm32 & 0xff, lsl);
1509   imm32 >>= 8; lsl += 8;
1510   while (imm32) {
1511     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1512     if (movn_cnt < movi_cnt)
1513       bici(Vd, T, imm32 & 0xff, lsl);
1514     else
1515       orri(Vd, T, imm32 & 0xff, lsl);
1516     lsl += 8; imm32 >>= 8;
1517   }
1518 }
1519 
1520 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1521 {
1522 #ifndef PRODUCT
1523   {
1524     char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1526     block_comment(buffer);
1527   }
1528 #endif
1529   if (operand_valid_for_logical_immediate(false, imm64)) {
1530     orr(dst, zr, imm64);
1531   } else {
1532     // we can use a combination of MOVZ or MOVN with
1533     // MOVK to build up the constant
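    // For example (illustrative), 0xffff0000cafe0000 contains two zero
    // halfwords, so the zero_count == 2 case below emits
    //   movz dst, #0xcafe, lsl #16
    //   movk dst, #0xffff, lsl #48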
1534     u_int64_t imm_h[4];
1535     int zero_count = 0;
1536     int neg_count = 0;
1537     int i;
1538     for (i = 0; i < 4; i++) {
1539       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1540       if (imm_h[i] == 0) {
1541         zero_count++;
1542       } else if (imm_h[i] == 0xffffL) {
1543         neg_count++;
1544       }
1545     }
1546     if (zero_count == 4) {
1547       // one MOVZ will do
1548       movz(dst, 0);
1549     } else if (neg_count == 4) {
1550       // one MOVN will do
1551       movn(dst, 0);
1552     } else if (zero_count == 3) {
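      // one MOVZ will do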
1553       for (i = 0; i < 4; i++) {
1554         if (imm_h[i] != 0L) {
1555           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1556           break;
1557         }
1558       }
1559     } else if (neg_count == 3) {
1560       // one MOVN will do
1561       for (int i = 0; i < 4; i++) {
1562         if (imm_h[i] != 0xffffL) {
1563           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1564           break;
1565         }
1566       }
1567     } else if (zero_count == 2) {
1568       // one MOVZ and one MOVK will do
1569       for (i = 0; i < 3; i++) {
1570         if (imm_h[i] != 0L) {
1571           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1572           i++;
1573           break;
1574         }
1575       }
1576       for (;i < 4; i++) {
1577         if (imm_h[i] != 0L) {
1578           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1579         }
1580       }
1581     } else if (neg_count == 2) {
1582       // one MOVN and one MOVK will do
1583       for (i = 0; i < 4; i++) {
1584         if (imm_h[i] != 0xffffL) {
1585           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1586           i++;
1587           break;
1588         }
1589       }
1590       for (;i < 4; i++) {
1591         if (imm_h[i] != 0xffffL) {
1592           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1593         }
1594       }
1595     } else if (zero_count == 1) {
1596       // one MOVZ and two MOVKs will do
1597       for (i = 0; i < 4; i++) {
1598         if (imm_h[i] != 0L) {
1599           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1600           i++;
1601           break;
1602         }
1603       }
1604       for (;i < 4; i++) {
1605         if (imm_h[i] != 0x0L) {
1606           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1607         }
1608       }
1609     } else if (neg_count == 1) {
1610       // one MOVN and two MOVKs will do
1611       for (i = 0; i < 4; i++) {
1612         if (imm_h[i] != 0xffffL) {
1613           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1614           i++;
1615           break;
1616         }
1617       }
1618       for (;i < 4; i++) {
1619         if (imm_h[i] != 0xffffL) {
1620           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1621         }
1622       }
1623     } else {
1624       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1625       movz(dst, (u_int32_t)imm_h[0], 0);
1626       for (i = 1; i < 4; i++) {
1627         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1628       }
1629     }
1630   }
1631 }
1632 
1633 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1634 {
1635 #ifndef PRODUCT
1636     {
1637       char buffer[64];
1638       snprintf(buffer, sizeof(buffer), "0x%"PRIX32, imm32);
1639       block_comment(buffer);
1640     }
1641 #endif
1642   if (operand_valid_for_logical_immediate(true, imm32)) {
1643     orrw(dst, zr, imm32);
1644   } else {
1645     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1646     // constant
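    // For example (illustrative), 0x12340000 needs only
    // movzw(dst, 0x1234, 16), while 0xffff1234 is emitted as
    // movnw(dst, 0x1234 ^ 0xffff, 0).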
1647     u_int32_t imm_h[2];
1648     imm_h[0] = imm32 & 0xffff;
1649     imm_h[1] = ((imm32 >> 16) & 0xffff);
1650     if (imm_h[0] == 0) {
1651       movzw(dst, imm_h[1], 16);
1652     } else if (imm_h[0] == 0xffff) {
1653       movnw(dst, imm_h[1] ^ 0xffff, 16);
1654     } else if (imm_h[1] == 0) {
1655       movzw(dst, imm_h[0], 0);
1656     } else if (imm_h[1] == 0xffff) {
1657       movnw(dst, imm_h[0] ^ 0xffff, 0);
1658     } else {
1659       // use a MOVZ and MOVK (makes it easier to debug)
1660       movzw(dst, imm_h[0], 0);
1661       movkw(dst, imm_h[1], 16);
1662     }
1663   }
1664 }
1665 
1666 void MacroAssembler::mov(Register dst, address addr) {
1667   assert(Universe::heap() == NULL
1668          || !Universe::heap()->is_in(addr), "use movptr for oop pointers");
  mov_immediate64(dst, (uintptr_t)addr);
1670 }
1671 
1672 // Form an address from base + offset in Rd.  Rd may or may
1673 // not actually be used: you must use the Address that is returned.
1674 // It is up to you to ensure that the shift provided matches the size
1675 // of your data.
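// For illustration, form_address(r1, r2, 0x9008, 3) cannot use the
// scaled 12-bit immediate form, but splits cleanly into
//   add r1, r2, #0x8000
// with the low part returned as Address(r1, 0x1008).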
1676 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1677   if (Address::offset_ok_for_immed(byte_offset, shift))
1678     // It fits; no need for any heroics
1679     return Address(base, byte_offset);
1680 
1681   // Don't do anything clever with negative or misaligned offsets
1682   unsigned mask = (1 << shift) - 1;
1683   if (byte_offset < 0 || byte_offset & mask) {
1684     mov(Rd, byte_offset);
1685     add(Rd, base, Rd);
1686     return Address(Rd);
1687   }
1688 
1689   // See if we can do this with two 12-bit offsets
1690   {
1691     unsigned long word_offset = byte_offset >> shift;
1692     unsigned long masked_offset = word_offset & 0xfff000;
1693     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1694         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1695       add(Rd, base, masked_offset << shift);
1696       word_offset -= masked_offset;
1697       return Address(Rd, word_offset << shift);
1698     }
1699   }
1700 
1701   // Do it the hard way
1702   mov(Rd, byte_offset);
1703   add(Rd, base, Rd);
1704   return Address(Rd);
1705 }
1706 
1707 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp, Register tmp2) {
1708   if (UseLSE) {
1709     mov(tmp, 1);
1710     ldadd(Assembler::word, tmp, zr, counter_addr);
1711     return;
1712   }
1713   Label retry_load;
1714   if ((VM_Version::cpu_cpuFeatures() & VM_Version::CPU_STXR_PREFETCH))
1715     prfm(Address(counter_addr), PSTL1STRM);
1716   bind(retry_load);
1717   // flush and load exclusive from the memory location
1718   ldxrw(tmp, counter_addr);
1719   addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp will be zero
1721   stxrw(tmp2, tmp, counter_addr);
1722   cbnzw(tmp2, retry_load);
1723 }
1724 
1725 
1726 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1727                                     bool want_remainder, Register scratch)
1728 {
1729   // Full implementation of Java idiv and irem.  The function
1730   // returns the (pc) offset of the div instruction - may be needed
1731   // for implicit exceptions.
1732   //
1733   // constraint : ra/rb =/= scratch
1734   //         normal case
1735   //
1736   // input : ra: dividend
1737   //         rb: divisor
1738   //
1739   // result: either
1740   //         quotient  (= ra idiv rb)
1741   //         remainder (= ra irem rb)
1742 
1743   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1744 
1745   int idivl_offset = offset();
1746   if (! want_remainder) {
1747     sdivw(result, ra, rb);
1748   } else {
1749     sdivw(scratch, ra, rb);
1750     Assembler::msubw(result, scratch, rb, ra);
1751   }
1752 
1753   return idivl_offset;
1754 }
1755 
1756 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1757                                     bool want_remainder, Register scratch)
1758 {
1759   // Full implementation of Java ldiv and lrem.  The function
1760   // returns the (pc) offset of the div instruction - may be needed
1761   // for implicit exceptions.
1762   //
1763   // constraint : ra/rb =/= scratch
1764   //         normal case
1765   //
1766   // input : ra: dividend
1767   //         rb: divisor
1768   //
1769   // result: either
1770   //         quotient  (= ra idiv rb)
1771   //         remainder (= ra irem rb)
1772 
1773   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1774 
1775   int idivq_offset = offset();
1776   if (! want_remainder) {
1777     sdiv(result, ra, rb);
1778   } else {
1779     sdiv(scratch, ra, rb);
1780     Assembler::msub(result, scratch, rb, ra);
1781   }
1782 
1783   return idivq_offset;
1784 }
1785 
1786 // MacroAssembler routines found actually to be needed
1787 
1788 void MacroAssembler::push(Register src)
1789 {
1790   str(src, Address(pre(esp, -1 * wordSize)));
1791 }
1792 
1793 void MacroAssembler::pop(Register dst)
1794 {
1795   ldr(dst, Address(post(esp, 1 * wordSize)));
1796 }
1797 
1798 // Note: load_unsigned_short used to be called load_unsigned_word.
1799 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1800   int off = offset();
1801   ldrh(dst, src);
1802   return off;
1803 }
1804 
1805 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1806   int off = offset();
1807   ldrb(dst, src);
1808   return off;
1809 }
1810 
1811 int MacroAssembler::load_signed_short(Register dst, Address src) {
1812   int off = offset();
1813   ldrsh(dst, src);
1814   return off;
1815 }
1816 
1817 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1818   int off = offset();
1819   ldrsb(dst, src);
1820   return off;
1821 }
1822 
1823 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1824   int off = offset();
1825   ldrshw(dst, src);
1826   return off;
1827 }
1828 
1829 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1830   int off = offset();
1831   ldrsbw(dst, src);
1832   return off;
1833 }
1834 
1835 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1836   switch (size_in_bytes) {
1837   case  8:  ldr(dst, src); break;
1838   case  4:  ldrw(dst, src); break;
1839   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1840   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1841   default:  ShouldNotReachHere();
1842   }
1843 }
1844 
1845 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1846   switch (size_in_bytes) {
1847   case  8:  str(src, dst); break;
1848   case  4:  strw(src, dst); break;
1849   case  2:  strh(src, dst); break;
1850   case  1:  strb(src, dst); break;
1851   default:  ShouldNotReachHere();
1852   }
1853 }
1854 
1855 void MacroAssembler::decrementw(Register reg, int value)
1856 {
1857   if (value < 0)  { incrementw(reg, -value);      return; }
1858   if (value == 0) {                               return; }
1859   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1860   /* else */ {
1861     guarantee(reg != rscratch2, "invalid dst for register decrement");
1862     movw(rscratch2, (unsigned)value);
1863     subw(reg, reg, rscratch2);
1864   }
1865 }
1866 
1867 void MacroAssembler::decrement(Register reg, int value)
1868 {
1869   if (value < 0)  { increment(reg, -value);      return; }
1870   if (value == 0) {                              return; }
1871   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1872   /* else */ {
1873     assert(reg != rscratch2, "invalid dst for register decrement");
1874     mov(rscratch2, (unsigned long)value);
1875     sub(reg, reg, rscratch2);
1876   }
1877 }
1878 
1879 void MacroAssembler::decrementw(Address dst, int value)
1880 {
1881   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1882   ldrw(rscratch1, dst);
1883   decrementw(rscratch1, value);
1884   strw(rscratch1, dst);
1885 }
1886 
1887 void MacroAssembler::decrement(Address dst, int value)
1888 {
1889   assert(!dst.uses(rscratch1), "invalid address for decrement");
1890   ldr(rscratch1, dst);
1891   decrement(rscratch1, value);
1892   str(rscratch1, dst);
1893 }
1894 
1895 void MacroAssembler::incrementw(Register reg, int value)
1896 {
1897   if (value < 0)  { decrementw(reg, -value);      return; }
1898   if (value == 0) {                               return; }
1899   if (value < (1 << 12)) { addw(reg, reg, value); return; }
1900   /* else */ {
1901     assert(reg != rscratch2, "invalid dst for register increment");
1902     movw(rscratch2, (unsigned)value);
1903     addw(reg, reg, rscratch2);
1904   }
1905 }
1906 
1907 void MacroAssembler::increment(Register reg, int value)
1908 {
1909   if (value < 0)  { decrement(reg, -value);      return; }
1910   if (value == 0) {                              return; }
1911   if (value < (1 << 12)) { add(reg, reg, value); return; }
1912   /* else */ {
1913     assert(reg != rscratch2, "invalid dst for register increment");
1914     movw(rscratch2, (unsigned)value);
1915     add(reg, reg, rscratch2);
1916   }
1917 }
1918 
1919 void MacroAssembler::incrementw(Address dst, int value)
1920 {
1921   assert(!dst.uses(rscratch1), "invalid dst for address increment");
1922   ldrw(rscratch1, dst);
1923   incrementw(rscratch1, value);
1924   strw(rscratch1, dst);
1925 }
1926 
1927 void MacroAssembler::increment(Address dst, int value)
1928 {
1929   assert(!dst.uses(rscratch1), "invalid dst for address increment");
1930   ldr(rscratch1, dst);
1931   increment(rscratch1, value);
1932   str(rscratch1, dst);
1933 }
1934 
1935 
1936 void MacroAssembler::pusha() {
1937   push(0x7fffffff, sp);
1938 }
1939 
1940 void MacroAssembler::popa() {
1941   pop(0x7fffffff, sp);
1942 }
1943 
1944 // Push lots of registers in the bit set supplied.  Don't push sp.
1945 // Return the number of words pushed
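// For example (illustrative), push(0x70, sp) pushes r4, r5 and r6:
// the odd register count is rounded up by pairing r6 with zr, so the
// two stps below move sp by a 16-byte-aligned amount.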
1946 int MacroAssembler::push(unsigned int bitset, Register stack) {
1947   int words_pushed = 0;
1948 
1949   // Scan bitset to accumulate register pairs
1950   unsigned char regs[32];
1951   int count = 0;
1952   for (int reg = 0; reg <= 30; reg++) {
1953     if (1 & bitset)
1954       regs[count++] = reg;
1955     bitset >>= 1;
1956   }
1957   regs[count++] = zr->encoding_nocheck();
  count &= ~1;  // Only push an even number of regs
1959 
1960   if (count) {
1961     stp(as_Register(regs[0]), as_Register(regs[1]),
1962        Address(pre(stack, -count * wordSize)));
1963     words_pushed += 2;
1964   }
1965   for (int i = 2; i < count; i += 2) {
1966     stp(as_Register(regs[i]), as_Register(regs[i+1]),
1967        Address(stack, i * wordSize));
1968     words_pushed += 2;
1969   }
1970 
1971   assert(words_pushed == count, "oops, pushed != count");
1972 
1973   return count;
1974 }
1975 
1976 int MacroAssembler::pop(unsigned int bitset, Register stack) {
1977   int words_pushed = 0;
1978 
1979   // Scan bitset to accumulate register pairs
1980   unsigned char regs[32];
1981   int count = 0;
1982   for (int reg = 0; reg <= 30; reg++) {
1983     if (1 & bitset)
1984       regs[count++] = reg;
1985     bitset >>= 1;
1986   }
1987   regs[count++] = zr->encoding_nocheck();
1988   count &= ~1;
1989 
1990   for (int i = 2; i < count; i += 2) {
1991     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
1992        Address(stack, i * wordSize));
1993     words_pushed += 2;
1994   }
1995   if (count) {
1996     ldp(as_Register(regs[0]), as_Register(regs[1]),
1997        Address(post(stack, count * wordSize)));
1998     words_pushed += 2;
1999   }
2000 
2001   assert(words_pushed == count, "oops, pushed != count");
2002 
2003   return count;
2004 }
2005 #ifdef ASSERT
2006 void MacroAssembler::verify_heapbase(const char* msg) {
2007 #if 0
2008   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
2009   assert (Universe::heap() != NULL, "java heap should be initialized");
2010   if (CheckCompressedOops) {
2011     Label ok;
2012     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
2013     cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2014     br(Assembler::EQ, ok);
2015     stop(msg);
2016     bind(ok);
2017     pop(1 << rscratch1->encoding(), sp);
2018   }
2019 #endif
2020 }
2021 #endif
2022 
2023 void MacroAssembler::stop(const char* msg, Label *l) {
2024   address ip = pc();
2025   pusha();
  // We use movptr rather than mov here because we need the code size
  // not to depend on the pointer value of msg; otherwise C2 can
  // observe the same node with different sizes when emitted in a
  // scratch buffer and later when emitted for good.
2030   movptr(c_rarg0, (uintptr_t)msg);
2031   if (! l) {
2032     adr(c_rarg1, (address)ip);
2033   } else {
2034     adr(c_rarg1, *l);
2035   }
2036   mov(c_rarg2, sp);
2037   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
2038   // call(c_rarg3);
2039   blrt(c_rarg3, 3, 0, 1);
2040   hlt(0);
2041 }
2042 
2043 // If a constant does not fit in an immediate field, generate some
2044 // number of MOV instructions and then perform the operation.
2045 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
2046                                            add_sub_imm_insn insn1,
2047                                            add_sub_reg_insn insn2) {
2048   assert(Rd != zr, "Rd = zr and not setting flags?");
2049   if (operand_valid_for_add_sub_immediate((int)imm)) {
2050     (this->*insn1)(Rd, Rn, imm);
2051   } else {
2052     if (uabs(imm) < (1 << 24)) {
2053        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
2054        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
2055     } else {
2056        assert_different_registers(Rd, Rn);
2057        mov(Rd, (uint64_t)imm);
2058        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2059     }
2060   }
2061 }
2062 
// Separate version which sets the flags. Optimisations are more restricted
2064 // because we must set the flags correctly.
2065 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
2066                                            add_sub_imm_insn insn1,
2067                                            add_sub_reg_insn insn2) {
2068   if (operand_valid_for_add_sub_immediate((int)imm)) {
2069     (this->*insn1)(Rd, Rn, imm);
2070   } else {
2071     assert_different_registers(Rd, Rn);
2072     assert(Rd != zr, "overflow in immediate operand");
2073     mov(Rd, (uint64_t)imm);
2074     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
2075   }
2076 }
2077 
2078 
2079 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2080   if (increment.is_register()) {
2081     add(Rd, Rn, increment.as_register());
2082   } else {
2083     add(Rd, Rn, increment.as_constant());
2084   }
2085 }
2086 
2087 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2088   if (increment.is_register()) {
2089     addw(Rd, Rn, increment.as_register());
2090   } else {
2091     addw(Rd, Rn, increment.as_constant());
2092   }
2093 }
2094 
2095 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2096   if (decrement.is_register()) {
2097     sub(Rd, Rn, decrement.as_register());
2098   } else {
2099     sub(Rd, Rn, decrement.as_constant());
2100   }
2101 }
2102 
2103 void MacroAssembler::subw(Register Rd, Register Rn, RegisterOrConstant decrement) {
2104   if (decrement.is_register()) {
2105     subw(Rd, Rn, decrement.as_register());
2106   } else {
2107     subw(Rd, Rn, decrement.as_constant());
2108   }
2109 }
2110 
2111 void MacroAssembler::reinit_heapbase()
2112 {
2113   if (UseCompressedOops) {
2114     if (Universe::is_fully_initialized()) {
2115       mov(rheapbase, Universe::narrow_ptrs_base());
2116     } else {
2117       lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2118       ldr(rheapbase, Address(rheapbase));
2119     }
2120   }
2121 }
2122 
2123 // this simulates the behaviour of the x86 cmpxchg instruction using a
2124 // load linked/store conditional pair. we use the acquire/release
2125 // versions of these instructions so that we flush pending writes as
2126 // per Java semantics.
2127 
// N.B. the x86 version assumes the old value to be compared against is
2129 // in rax and updates rax with the value located in memory if the
2130 // cmpxchg fails. we supply a register for the old value explicitly
2131 
2132 // the aarch64 load linked/store conditional instructions do not
2133 // accept an offset. so, unlike x86, we must provide a plain register
2134 // to identify the memory word to be compared/exchanged rather than a
2135 // register+offset Address.
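// a caller therefore typically forms the word address first, e.g.
// (illustrative sketch; 'obj' and 'field_offset' are stand-ins):
//   lea(rscratch1, Address(obj, field_offset));
//   cmpxchgptr(oldv, newv, rscratch1, rscratch2, on_success, &on_failure);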
2136 
2137 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2138                                 Label &succeed, Label *fail) {
2139   // oldv holds comparison value
2140   // newv holds value to write in exchange
2141   // addr identifies memory word to compare against/update
2142   if (UseLSE) {
2143     mov(tmp, oldv);
2144     casal(Assembler::xword, oldv, newv, addr);
2145     cmp(tmp, oldv);
2146     br(Assembler::EQ, succeed);
2147     membar(AnyAny);
2148   } else {
2149     Label retry_load, nope;
2150     if ((VM_Version::cpu_cpuFeatures() & VM_Version::CPU_STXR_PREFETCH))
2151       prfm(Address(addr), PSTL1STRM);
2152     bind(retry_load);
2153     // flush and load exclusive from the memory location
2154     // and fail if it is not what we expect
2155     ldaxr(tmp, addr);
2156     cmp(tmp, oldv);
2157     br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
2159     stlxr(tmp, newv, addr);
2160     cbzw(tmp, succeed);
2161     // retry so we only ever return after a load fails to compare
2162     // ensures we don't return a stale value after a failed write.
2163     b(retry_load);
2164     // if the memory word differs we return it in oldv and signal a fail
2165     bind(nope);
2166     membar(AnyAny);
2167     mov(oldv, tmp);
2168   }
2169   if (fail)
2170     b(*fail);
2171 }
2172 
2173 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2174                                 Label &succeed, Label *fail) {
2175   // oldv holds comparison value
2176   // newv holds value to write in exchange
2177   // addr identifies memory word to compare against/update
2178   // tmp returns 0/1 for success/failure
2179   if (UseLSE) {
2180     mov(tmp, oldv);
2181     casal(Assembler::word, oldv, newv, addr);
2182     cmp(tmp, oldv);
2183     br(Assembler::EQ, succeed);
2184     membar(AnyAny);
2185   } else {
2186     Label retry_load, nope;
2187     if ((VM_Version::cpu_cpuFeatures() & VM_Version::CPU_STXR_PREFETCH))
2188       prfm(Address(addr), PSTL1STRM);
2189     bind(retry_load);
2190     // flush and load exclusive from the memory location
2191     // and fail if it is not what we expect
2192     ldaxrw(tmp, addr);
2193     cmp(tmp, oldv);
2194     br(Assembler::NE, nope);
    // if we store+flush with no intervening write tmp will be zero
2196     stlxrw(tmp, newv, addr);
2197     cbzw(tmp, succeed);
2198     // retry so we only ever return after a load fails to compare
2199     // ensures we don't return a stale value after a failed write.
2200     b(retry_load);
2201     // if the memory word differs we return it in oldv and signal a fail
2202     bind(nope);
2203     membar(AnyAny);
2204     mov(oldv, tmp);
2205   }
2206   if (fail)
2207     b(*fail);
2208 }
2209 
2210 // A generic CAS; success or failure is in the EQ flag.
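// A typical use (illustrative sketch; register and label names are
// stand-ins) is
//   cmpxchg(obj_addr, old_val, new_val, Assembler::xword,
//           /*acquire*/ true, /*release*/ true, rscratch1);
//   br(Assembler::EQ, cas_succeeded);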
2211 void MacroAssembler::cmpxchg(Register addr, Register expected,
2212                              Register new_val,
2213                              enum operand_size size,
2214                              bool acquire, bool release,
2215                              Register tmp) {
2216   if (UseLSE) {
2217     mov(tmp, expected);
2218     lse_cas(tmp, new_val, addr, size, acquire, release, /*not_pair*/ true);
2219     cmp(tmp, expected);
2220   } else {
2221     BLOCK_COMMENT("cmpxchg {");
2222     Label retry_load, done;
2223     if ((VM_Version::cpu_cpuFeatures() & VM_Version::CPU_STXR_PREFETCH))
2224       prfm(Address(addr), PSTL1STRM);
2225     bind(retry_load);
2226     load_exclusive(tmp, addr, size, acquire);
2227     if (size == xword)
2228       cmp(tmp, expected);
2229     else
2230       cmpw(tmp, expected);
2231     br(Assembler::NE, done);
2232     store_exclusive(tmp, new_val, addr, size, release);
2233     cbnzw(tmp, retry_load);
2234     bind(done);
2235     BLOCK_COMMENT("} cmpxchg");
2236   }
2237 }
2238 
2239 void MacroAssembler::cmpxchg_oop_shenandoah(Register addr, Register expected,
2240                                             Register new_val,
2241                                             enum operand_size size,
2242                                             bool acquire, bool release,
2243                                             bool weak,
2244                                             Register result, Register tmp2) {
2245   assert(UseShenandoahGC, "only for shenandoah");
2246   bool is_cae = (result != noreg);
2247   bool is_narrow = (size == word);
2248 
2249   if (! is_cae) result = rscratch1;
2250 
2251   assert_different_registers(addr, expected, new_val, result, tmp2);
2252 
2253   Label retry, done, fail;
2254 
2255   // CAS, using LL/SC pair.
2256   bind(retry);
2257   load_exclusive(result, addr, size, acquire);
2258   if (is_narrow) {
2259     cmpw(result, expected);
2260   } else {
2261     cmp(result, expected);
2262   }
2263   br(Assembler::NE, fail);
2264   store_exclusive(tmp2, new_val, addr, size, release);
2265   if (weak) {
2266     cmpw(tmp2, 0u); // If the store fails, return NE to our caller
2267   } else {
2268     cbnzw(tmp2, retry);
2269   }
2270   b(done);
2271 
2272   bind(fail);
2273   // Check if rb(expected)==rb(result)
2274   // Shuffle registers so that we have memory value ready for next expected.
2275   mov(tmp2, expected);
2276   mov(expected, result);
2277   if (is_narrow) {
2278     decode_heap_oop(result, result);
2279     decode_heap_oop(tmp2, tmp2);
2280   }
2281   oopDesc::bs()->interpreter_read_barrier(this, result);
2282   oopDesc::bs()->interpreter_read_barrier(this, tmp2);
2283   cmp(result, tmp2);
2284   // Retry with expected now being the value we just loaded from addr.
2285   br(Assembler::EQ, retry);
2286   if (is_narrow && is_cae) {
2287     // For cmp-and-exchange and narrow oops, we need to restore
2288     // the compressed old-value. We moved it to 'expected' a few lines up.
2289     mov(result, expected);
2290   }
2291   bind(done);
2292 }
2293 
2294 static bool different(Register a, RegisterOrConstant b, Register c) {
2295   if (b.is_constant())
2296     return a != c;
2297   else
2298     return a != b.as_register() && a != c && b.as_register() != c;
2299 }
2300 
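// The ATOMIC_OP template below instantiates atomic_add (64-bit) and
// atomic_addw (32-bit).  With LSE each expansion is a single LDADD;
// otherwise it is an LDXR/OP/STXR retry loop, where IOP (the inverse
// of OP) recomputes the previous value from the stored result when
// prev could not double as the loop register.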
2301 #define ATOMIC_OP(LDXR, OP, IOP, AOP, STXR, sz)                         \
2302 void MacroAssembler::atomic_##OP(Register prev, RegisterOrConstant incr, Register addr) { \
2303   if (UseLSE) {                                                         \
2304     prev = prev->is_valid() ? prev : zr;                                \
2305     if (incr.is_register()) {                                           \
2306       AOP(sz, incr.as_register(), prev, addr);                          \
2307     } else {                                                            \
2308       mov(rscratch2, incr.as_constant());                               \
2309       AOP(sz, rscratch2, prev, addr);                                   \
2310     }                                                                   \
2311     return;                                                             \
2312   }                                                                     \
2313   Register result = rscratch2;                                          \
2314   if (prev->is_valid())                                                      \
2315     result = different(prev, incr, addr) ? prev : rscratch2;            \
2316                                                                         \
2317   Label retry_load;                                                     \
2318   if ((VM_Version::cpu_cpuFeatures() & VM_Version::CPU_STXR_PREFETCH))         \
2319     prfm(Address(addr), PSTL1STRM);                                     \
2320   bind(retry_load);                                                     \
2321   LDXR(result, addr);                                                   \
2322   OP(rscratch1, result, incr);                                          \
2323   STXR(rscratch2, rscratch1, addr);                                     \
2324   cbnzw(rscratch2, retry_load);                                         \
2325   if (prev->is_valid() && prev != result) {                             \
2326     IOP(prev, rscratch1, incr);                                         \
2327   }                                                                     \
2328 }
2329 
2330 ATOMIC_OP(ldxr, add, sub, ldadd, stxr, Assembler::xword)
2331 ATOMIC_OP(ldxrw, addw, subw, ldadd, stxrw, Assembler::word)
2332 
2333 #undef ATOMIC_OP
2334 
2335 #define ATOMIC_XCHG(OP, LDXR, STXR, sz)                                 \
2336 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2337   if (UseLSE) {                                                         \
2338     prev = prev->is_valid() ? prev : zr;                                \
2339     swp(sz, newv, prev, addr);                                          \
2340     return;                                                             \
2341   }                                                                     \
2342   Register result = rscratch2;                                          \
2343   if (prev->is_valid())                                                      \
2344     result = different(prev, newv, addr) ? prev : rscratch2;            \
2345                                                                         \
2346   Label retry_load;                                                     \
2347   if ((VM_Version::cpu_cpuFeatures() & VM_Version::CPU_STXR_PREFETCH))         \
2348     prfm(Address(addr), PSTL1STRM);                                     \
2349   bind(retry_load);                                                     \
2350   LDXR(result, addr);                                                   \
2351   STXR(rscratch1, newv, addr);                                          \
2352   cbnzw(rscratch1, retry_load);                                         \
2353   if (prev->is_valid() && prev != result)                            \
2354     mov(prev, result);                                                  \
2355 }
2356 
2357 ATOMIC_XCHG(xchg, ldxr, stxr, Assembler::xword)
2358 ATOMIC_XCHG(xchgw, ldxrw, stxrw, Assembler::word)
2359 
2360 #undef ATOMIC_XCHG
2361 
2362 void MacroAssembler::incr_allocated_bytes(Register thread,
2363                                           Register var_size_in_bytes,
2364                                           int con_size_in_bytes,
2365                                           Register t1) {
2366   if (!thread->is_valid()) {
2367     thread = rthread;
2368   }
2369   assert(t1->is_valid(), "need temp reg");
2370 
2371   ldr(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
2372   if (var_size_in_bytes->is_valid()) {
2373     add(t1, t1, var_size_in_bytes);
2374   } else {
2375     add(t1, t1, con_size_in_bytes);
2376   }
2377   str(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
2378 }
2379 
2380 #ifndef PRODUCT
2381 extern "C" void findpc(intptr_t x);
2382 #endif
2383 
2384 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2385 {
  // In order to get locks to work, we need to fake an in_VM state
2387   if (ShowMessageBoxOnError ) {
2388     JavaThread* thread = JavaThread::current();
2389     JavaThreadState saved_state = thread->thread_state();
2390     thread->set_thread_state(_thread_in_vm);
2391 #ifndef PRODUCT
2392     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2393       ttyLocker ttyl;
2394       BytecodeCounter::print();
2395     }
2396 #endif
2397 
2398     if (os::message_box(msg, "Execution stopped, print registers?")) {
2399       ttyLocker ttyl;
2400       tty->print_cr(" pc = 0x%016lx", pc);
2401 #ifndef PRODUCT
2402       tty->cr();
2403       findpc(pc);
2404       tty->cr();
2405 #endif
2406       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2407       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2408       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2409       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2410       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2411       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2412       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2413       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2414       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2415       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2416       tty->print_cr("r10 = 0x%016lx", regs[10]);
2417       tty->print_cr("r11 = 0x%016lx", regs[11]);
2418       tty->print_cr("r12 = 0x%016lx", regs[12]);
2419       tty->print_cr("r13 = 0x%016lx", regs[13]);
2420       tty->print_cr("r14 = 0x%016lx", regs[14]);
2421       tty->print_cr("r15 = 0x%016lx", regs[15]);
2422       tty->print_cr("r16 = 0x%016lx", regs[16]);
2423       tty->print_cr("r17 = 0x%016lx", regs[17]);
2424       tty->print_cr("r18 = 0x%016lx", regs[18]);
2425       tty->print_cr("r19 = 0x%016lx", regs[19]);
2426       tty->print_cr("r20 = 0x%016lx", regs[20]);
2427       tty->print_cr("r21 = 0x%016lx", regs[21]);
2428       tty->print_cr("r22 = 0x%016lx", regs[22]);
2429       tty->print_cr("r23 = 0x%016lx", regs[23]);
2430       tty->print_cr("r24 = 0x%016lx", regs[24]);
2431       tty->print_cr("r25 = 0x%016lx", regs[25]);
2432       tty->print_cr("r26 = 0x%016lx", regs[26]);
2433       tty->print_cr("r27 = 0x%016lx", regs[27]);
2434       tty->print_cr("r28 = 0x%016lx", regs[28]);
2435       tty->print_cr("r30 = 0x%016lx", regs[30]);
2436       tty->print_cr("r31 = 0x%016lx", regs[31]);
2437       BREAKPOINT;
2438     }
2439     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2440   } else {
2441     ttyLocker ttyl;
2442     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2443                     msg);
2444     assert(false, err_msg("DEBUG MESSAGE: %s", msg));
2445   }
2446 }
2447 
2448 #ifdef BUILTIN_SIM
2449 // routine to generate an x86 prolog for a stub function which
2450 // bootstraps into the generated ARM code which directly follows the
2451 // stub
2452 //
2453 // the argument encodes the number of general and fp registers
// passed by the caller and the calling convention (currently just
2455 // the number of general registers and assumes C argument passing)
2456 
2457 extern "C" {
2458 int aarch64_stub_prolog_size();
2459 void aarch64_stub_prolog();
2460 void aarch64_prolog();
2461 }
2462 
2463 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
2464                                    address *prolog_ptr)
2465 {
2466   int calltype = (((ret_type & 0x3) << 8) |
2467                   ((fp_arg_count & 0xf) << 4) |
2468                   (gp_arg_count & 0xf));
2469 
2470   // the addresses for the x86 to ARM entry code we need to use
2471   address start = pc();
2472   // printf("start = %lx\n", start);
2473   int byteCount =  aarch64_stub_prolog_size();
2474   // printf("byteCount = %x\n", byteCount);
2475   int instructionCount = (byteCount + 3)/ 4;
2476   // printf("instructionCount = %x\n", instructionCount);
2477   for (int i = 0; i < instructionCount; i++) {
2478     nop();
2479   }
2480 
2481   memcpy(start, (void*)aarch64_stub_prolog, byteCount);
2482 
2483   // write the address of the setup routine and the call format at the
// end of the copied code
2485   u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
2486   if (prolog_ptr)
2487     patch_end[-2] = (u_int64_t)prolog_ptr;
2488   patch_end[-1] = calltype;
2489 }
2490 #endif
2491 
2492 void MacroAssembler::push_call_clobbered_fp_registers() {
2493   // Push v0-v7, v16-v31.
2494   for (int i = 30; i >= 0; i -= 2) {
2495     if (i <= v7->encoding() || i >= v16->encoding()) {
2496       stpd(as_FloatRegister(i), as_FloatRegister(i+1),
2497            Address(pre(sp, -2 * wordSize)));
2498     }
2499   }
2500 }
2501 
void MacroAssembler::pop_call_clobbered_fp_registers() {
2504   for (int i = 0; i < 32; i += 2) {
2505     if (i <= v7->encoding() || i >= v16->encoding()) {
2506       ldpd(as_FloatRegister(i), as_FloatRegister(i+1),
2507            Address(post(sp, 2 * wordSize)));
2508     }
2509   }
2510 }
2511 
2512 void MacroAssembler::push_call_clobbered_registers() {
2513   push(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2514 
2515   push_call_clobbered_fp_registers();
2516 }
2517 
void MacroAssembler::pop_call_clobbered_registers() {
2520   pop_call_clobbered_fp_registers();
2521 
2522   pop(RegSet::range(r0, r18) - RegSet::of(rscratch1, rscratch2), sp);
2523 }
2524 
2525 void MacroAssembler::push_CPU_state(bool save_vectors) {
2526   push(0x3fffffff, sp);         // integer registers except lr & sp
2527 
2528   if (!save_vectors) {
2529     for (int i = 30; i >= 0; i -= 2)
2530       stpd(as_FloatRegister(i), as_FloatRegister(i+1),
2531            Address(pre(sp, -2 * wordSize)));
2532   } else {
2533     for (int i = 30; i >= 0; i -= 2)
2534       stpq(as_FloatRegister(i), as_FloatRegister(i+1),
2535            Address(pre(sp, -4 * wordSize)));
2536   }
2537 }
2538 
2539 void MacroAssembler::pop_CPU_state(bool restore_vectors) {
2540   if (!restore_vectors) {
2541     for (int i = 0; i < 32; i += 2)
2542       ldpd(as_FloatRegister(i), as_FloatRegister(i+1),
2543            Address(post(sp, 2 * wordSize)));
2544   } else {
2545     for (int i = 0; i < 32; i += 2)
2546       ldpq(as_FloatRegister(i), as_FloatRegister(i+1),
2547            Address(post(sp, 4 * wordSize)));
2548   }
2549 
2550   pop(0x3fffffff, sp);         // integer registers except lr & sp
2551 }
2552 
2553 /**
2554  * Helpers for multiply_to_len().
2555  */
2556 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2557                                      Register src1, Register src2) {
2558   adds(dest_lo, dest_lo, src1);
2559   adc(dest_hi, dest_hi, zr);
2560   adds(dest_lo, dest_lo, src2);
2561   adc(final_dest_hi, dest_hi, zr);
2562 }
2563 
2564 // Generate an address from (r + r1 extend offset).  "size" is the
2565 // size of the operand.  The result may be in rscratch2.
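// For example, the product store in multiply_64_x_64_loop below passes
// a 4-byte-scaled index to an 8-byte access, so ext.shift() % size != 0
// and the address must be materialized with lea into rscratch2.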
2566 Address MacroAssembler::offsetted_address(Register r, Register r1,
2567                                           Address::extend ext, int offset, int size) {
2568   if (offset || (ext.shift() % size != 0)) {
2569     lea(rscratch2, Address(r, r1, ext));
2570     return Address(rscratch2, offset);
2571   } else {
2572     return Address(r, r1, ext);
2573   }
2574 }
2575 
2576 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2577 {
2578   assert(offset >= 0, "spill to negative address?");
  // Offset reachable?
2580   //   Not aligned - 9 bits signed offset
2581   //   Aligned - 12 bits unsigned offset shifted
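  // For illustration, an aligned 8-byte spill at offset 0x21008 becomes
  // add(tmp, sp, 0x21000) followed by Address(tmp, 8).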
2582   Register base = sp;
2583   if ((offset & (size-1)) && offset >= (1<<8)) {
2584     add(tmp, base, offset & ((1<<12)-1));
2585     base = tmp;
2586     offset &= -1<<12;
2587   }
2588 
2589   if (offset >= (1<<12) * size) {
2590     add(tmp, base, offset & (((1<<12)-1)<<12));
2591     base = tmp;
2592     offset &= ~(((1<<12)-1)<<12);
2593   }
2594 
2595   return Address(base, offset);
2596 }
2597 
2598 /**
2599  * Multiply 64 bit by 64 bit first loop.
2600  */
2601 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2602                                            Register y, Register y_idx, Register z,
2603                                            Register carry, Register product,
2604                                            Register idx, Register kdx) {
2605   //
2606   //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2608   //    huge_128 product = y[idx] * x[xstart] + carry;
2609   //    z[kdx] = (jlong)product;
2610   //    carry  = (jlong)(product >>> 64);
2611   //  }
2612   //  z[xstart] = carry;
2613   //
2614 
2615   Label L_first_loop, L_first_loop_exit;
2616   Label L_one_x, L_one_y, L_multiply;
2617 
2618   subsw(xstart, xstart, 1);
2619   br(Assembler::MI, L_one_x);
2620 
2621   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2622   ldr(x_xstart, Address(rscratch1));
2623   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2624 
2625   bind(L_first_loop);
2626   subsw(idx, idx, 1);
2627   br(Assembler::MI, L_first_loop_exit);
2628   subsw(idx, idx, 1);
2629   br(Assembler::MI, L_one_y);
2630   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2631   ldr(y_idx, Address(rscratch1));
2632   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2633   bind(L_multiply);
2634 
2635   // AArch64 has a multiply-accumulate instruction that we can't use
2636   // here because it has no way to process carries, so we have to use
2637   // separate add and adc instructions.  Bah.
2638   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2639   mul(product, x_xstart, y_idx);
2640   adds(product, product, carry);
2641   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2642 
2643   subw(kdx, kdx, 2);
2644   ror(product, product, 32); // back to big-endian
2645   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2646 
2647   b(L_first_loop);
2648 
2649   bind(L_one_y);
2650   ldrw(y_idx, Address(y,  0));
2651   b(L_multiply);
2652 
2653   bind(L_one_x);
2654   ldrw(x_xstart, Address(x,  0));
2655   b(L_first_loop);
2656 
2657   bind(L_first_loop_exit);
2658 }
2659 
2660 /**
 * Multiply 128 bit by 128 bit. Unrolled inner loop.
2662  *
2663  */
2664 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2665                                              Register carry, Register carry2,
2666                                              Register idx, Register jdx,
2667                                              Register yz_idx1, Register yz_idx2,
2668                                              Register tmp, Register tmp3, Register tmp4,
2669                                              Register tmp6, Register product_hi) {
2670 
2671   //   jlong carry, x[], y[], z[];
2672   //   int kdx = ystart+1;
2673   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2674   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2675   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2676   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2677   //     carry  = (jlong)(tmp4 >>> 64);
2678   //     z[kdx+idx+1] = (jlong)tmp3;
2679   //     z[kdx+idx] = (jlong)tmp4;
2680   //   }
2681   //   idx += 2;
2682   //   if (idx > 0) {
2683   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2684   //     z[kdx+idx] = (jlong)yz_idx1;
2685   //     carry  = (jlong)(yz_idx1 >>> 64);
2686   //   }
2687   //
2688 
2689   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2690 
2691   lsrw(jdx, idx, 2);
2692 
2693   bind(L_third_loop);
2694 
2695   subsw(jdx, jdx, 1);
2696   br(Assembler::MI, L_third_loop_exit);
2697   subw(idx, idx, 4);
2698 
2699   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2700 
2701   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2702 
2703   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2704 
2705   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2706   ror(yz_idx2, yz_idx2, 32);
2707 
2708   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2709 
2710   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2711   umulh(tmp4, product_hi, yz_idx1);
2712 
2713   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2714   ror(rscratch2, rscratch2, 32);
2715 
2716   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2717   umulh(carry2, product_hi, yz_idx2);
2718 
2719   // propagate sum of both multiplications into carry:tmp4:tmp3
2720   adds(tmp3, tmp3, carry);
2721   adc(tmp4, tmp4, zr);
2722   adds(tmp3, tmp3, rscratch1);
2723   adcs(tmp4, tmp4, tmp);
2724   adc(carry, carry2, zr);
2725   adds(tmp4, tmp4, rscratch2);
2726   adc(carry, carry, zr);
2727 
2728   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2729   ror(tmp4, tmp4, 32);
2730   stp(tmp4, tmp3, Address(tmp6, 0));
2731 
2732   b(L_third_loop);
2733   bind (L_third_loop_exit);
2734 
2735   andw (idx, idx, 0x3);
2736   cbz(idx, L_post_third_loop_done);
2737 
2738   Label L_check_1;
2739   subsw(idx, idx, 2);
2740   br(Assembler::MI, L_check_1);
2741 
2742   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2743   ldr(yz_idx1, Address(rscratch1, 0));
2744   ror(yz_idx1, yz_idx1, 32);
2745   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2746   umulh(tmp4, product_hi, yz_idx1);
2747   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2748   ldr(yz_idx2, Address(rscratch1, 0));
2749   ror(yz_idx2, yz_idx2, 32);
2750 
2751   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2752 
2753   ror(tmp3, tmp3, 32);
2754   str(tmp3, Address(rscratch1, 0));
2755 
2756   bind (L_check_1);
2757 
2758   andw (idx, idx, 0x1);
2759   subsw(idx, idx, 1);
2760   br(Assembler::MI, L_post_third_loop_done);
2761   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2762   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
2763   umulh(carry2, tmp4, product_hi);
2764   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2765 
  add2_with_carry(carry2, carry2, tmp3, tmp4, carry);
2767 
2768   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2769   extr(carry, carry2, tmp3, 32);
2770 
2771   bind(L_post_third_loop_done);
2772 }
2773 
2774 /**
 * Code for BigInteger::multiplyToLen() intrinsic.
2776  *
2777  * r0: x
2778  * r1: xlen
2779  * r2: y
2780  * r3: ylen
2781  * r4:  z
2782  * r5: zlen
2783  * r10: tmp1
2784  * r11: tmp2
2785  * r12: tmp3
2786  * r13: tmp4
2787  * r14: tmp5
2788  * r15: tmp6
2789  * r16: tmp7
2790  *
2791  */
2792 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
2793                                      Register z, Register zlen,
2794                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
2795                                      Register tmp5, Register tmp6, Register product_hi) {
2796 
2797   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2798 
2799   const Register idx = tmp1;
2800   const Register kdx = tmp2;
2801   const Register xstart = tmp3;
2802 
2803   const Register y_idx = tmp4;
2804   const Register carry = tmp5;
2805   const Register product  = xlen;
2806   const Register x_xstart = zlen;  // reuse register
2807 
2808   // First Loop.
2809   //
2810   //  final static long LONG_MASK = 0xffffffffL;
2811   //  int xstart = xlen - 1;
2812   //  int ystart = ylen - 1;
2813   //  long carry = 0;
2814   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2815   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
2816   //    z[kdx] = (int)product;
2817   //    carry = product >>> 32;
2818   //  }
2819   //  z[xstart] = (int)carry;
2820   //
2821 
2822   movw(idx, ylen);      // idx = ylen;
2823   movw(kdx, zlen);      // kdx = zlen (== xlen+ylen);
2824   mov(carry, zr);       // carry = 0;
2825 
2826   Label L_done;
2827 
2828   movw(xstart, xlen);
2829   subsw(xstart, xstart, 1);
2830   br(Assembler::MI, L_done);
2831 
2832   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
2833 
2834   Label L_second_loop;
2835   cbzw(kdx, L_second_loop);
2836 
2837   Label L_carry;
2838   subw(kdx, kdx, 1);
2839   cbzw(kdx, L_carry);
2840 
2841   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
2842   lsr(carry, carry, 32);
2843   subw(kdx, kdx, 1);
2844 
2845   bind(L_carry);
2846   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
2847 
2848   // Second and third (nested) loops.
2849   //
2850   // for (int i = xstart-1; i >= 0; i--) { // Second loop
2851   //   carry = 0;
2852   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
2853   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
2854   //                    (z[k] & LONG_MASK) + carry;
2855   //     z[k] = (int)product;
2856   //     carry = product >>> 32;
2857   //   }
2858   //   z[i] = (int)carry;
2859   // }
2860   //
2861   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
2862 
2863   const Register jdx = tmp1;
2864 
2865   bind(L_second_loop);
2866   mov(carry, zr);                // carry = 0;
2867   movw(jdx, ylen);               // j = ystart+1
2868 
2869   subsw(xstart, xstart, 1);      // i = xstart-1;
2870   br(Assembler::MI, L_done);
2871 
2872   str(z, Address(pre(sp, -4 * wordSize)));
2873 
2874   Label L_last_x;
2875   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
2876   subsw(xstart, xstart, 1);       // i = xstart-1;
2877   br(Assembler::MI, L_last_x);
2878 
2879   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
2880   ldr(product_hi, Address(rscratch1));
2881   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
2882 
2883   Label L_third_loop_prologue;
2884   bind(L_third_loop_prologue);
2885 
2886   str(ylen, Address(sp, wordSize));
2887   stp(x, xstart, Address(sp, 2 * wordSize));
2888   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
2889                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
2890   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
2891   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
2892 
2893   addw(tmp3, xlen, 1);
2894   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
2895   subsw(tmp3, tmp3, 1);
2896   br(Assembler::MI, L_done);
2897 
2898   lsr(carry, carry, 32);
2899   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
2900   b(L_second_loop);
2901 
2902   // The following infrequently executed code is moved outside the loops.
2903   bind(L_last_x);
2904   ldrw(product_hi, Address(x,  0));
2905   b(L_third_loop_prologue);
2906 
2907   bind(L_done);
2908 }
2909 
2910 /**
2911  * Emits code to update CRC-32 with a byte value according to constants in table
2912  *
2913  * @param [in,out]crc   Register containing the crc.
2914  * @param [in]val       Register containing the byte to fold into the CRC.
2915  * @param [in]table     Register containing the table of crc constants.
2916  *
2917  * uint32_t crc;
2918  * val = crc_table[(val ^ crc) & 0xFF];
2919  * crc = val ^ (crc >> 8);
2920  *
2921  */
2922 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
2923   eor(val, val, crc);
2924   andr(val, val, 0xff);
2925   ldrw(val, Address(table, val, Address::lsl(2)));
2926   eor(crc, val, crc, Assembler::LSR, 8);
2927 }
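
// For reference, a minimal C-style sketch of the byte update emitted above
// (assuming crc_table is the usual 256-entry CRC-32 lookup table; the helper
// name is illustrative only):
//
//   uint32_t update_byte_crc32_ref(uint32_t crc, uint8_t val,
//                                  const uint32_t* crc_table) {
//     return crc_table[(val ^ crc) & 0xff] ^ (crc >> 8);
//   }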
2928 
2929 /**
2930  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
2931  *
2932  * @param [in,out]crc   Register containing the crc.
2933  * @param [in]v         Register containing the 32-bit to fold into the CRC.
2934  * @param [in]table0    Register containing table 0 of crc constants.
2935  * @param [in]table1    Register containing table 1 of crc constants.
2936  * @param [in]table2    Register containing table 2 of crc constants.
2937  * @param [in]table3    Register containing table 3 of crc constants.
2938  *
2939  * uint32_t crc;
2940  *   v = crc ^ v
2941  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
2942  *
2943  */
2944 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
2945         Register table0, Register table1, Register table2, Register table3,
2946         bool upper) {
2947   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
2948   uxtb(tmp, v);
2949   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
2950   ubfx(tmp, v, 8, 8);
2951   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
2952   eor(crc, crc, tmp);
2953   ubfx(tmp, v, 16, 8);
2954   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
2955   eor(crc, crc, tmp);
2956   ubfx(tmp, v, 24, 8);
2957   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
2958   eor(crc, crc, tmp);
2959 }
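
// For reference, a minimal sketch of the slicing-by-4 update emitted above
// (table0..table3 are assumed to be the standard four 256-entry tables; for
// upper == true the same update is applied to the high word, v >> 32):
//
//   uint32_t update_word_crc32_ref(uint32_t crc, uint32_t v,
//                                  const uint32_t* t0, const uint32_t* t1,
//                                  const uint32_t* t2, const uint32_t* t3) {
//     v ^= crc;
//     return t3[v & 0xff] ^ t2[(v >> 8) & 0xff]
//          ^ t1[(v >> 16) & 0xff] ^ t0[v >> 24];
//   }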
2960 
2961 /**
2962  * @param crc   register containing existing CRC (32-bit)
2963  * @param buf   register pointing to input byte buffer (byte*)
2964  * @param len   register containing number of bytes
2965  * @param table register that will contain address of CRC table
2966  * @param tmp   scratch register
2967  */
2968 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
2969         Register table0, Register table1, Register table2, Register table3,
2970         Register tmp, Register tmp2, Register tmp3) {
2971   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
2972   unsigned long offset;
2973 
2974     ornw(crc, zr, crc);
2975 
2976   if (UseCRC32) {
2977     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop;
2978 
2979       subs(len, len, 64);
2980       br(Assembler::GE, CRC_by64_loop);
2981       adds(len, len, 64-4);
2982       br(Assembler::GE, CRC_by4_loop);
2983       adds(len, len, 4);
2984       br(Assembler::GT, CRC_by1_loop);
2985       b(L_exit);
2986 
2987     BIND(CRC_by4_loop);
2988       ldrw(tmp, Address(post(buf, 4)));
2989       subs(len, len, 4);
2990       crc32w(crc, crc, tmp);
2991       br(Assembler::GE, CRC_by4_loop);
2992       adds(len, len, 4);
2993       br(Assembler::LE, L_exit);
2994     BIND(CRC_by1_loop);
2995       ldrb(tmp, Address(post(buf, 1)));
2996       subs(len, len, 1);
2997       crc32b(crc, crc, tmp);
2998       br(Assembler::GT, CRC_by1_loop);
2999       b(L_exit);
3000 
3001       align(CodeEntryAlignment);
3002     BIND(CRC_by64_loop);
3003       subs(len, len, 64);
3004       ldp(tmp, tmp3, Address(post(buf, 16)));
3005       crc32x(crc, crc, tmp);
3006       crc32x(crc, crc, tmp3);
3007       ldp(tmp, tmp3, Address(post(buf, 16)));
3008       crc32x(crc, crc, tmp);
3009       crc32x(crc, crc, tmp3);
3010       ldp(tmp, tmp3, Address(post(buf, 16)));
3011       crc32x(crc, crc, tmp);
3012       crc32x(crc, crc, tmp3);
3013       ldp(tmp, tmp3, Address(post(buf, 16)));
3014       crc32x(crc, crc, tmp);
3015       crc32x(crc, crc, tmp3);
3016       br(Assembler::GE, CRC_by64_loop);
3017       adds(len, len, 64-4);
3018       br(Assembler::GE, CRC_by4_loop);
3019       adds(len, len, 4);
3020       br(Assembler::GT, CRC_by1_loop);
3021     BIND(L_exit);
3022       ornw(crc, zr, crc);
3023       return;
3024   }
3025 
3026     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
3027     if (offset) add(table0, table0, offset);
3028     add(table1, table0, 1*256*sizeof(juint));
3029     add(table2, table0, 2*256*sizeof(juint));
3030     add(table3, table0, 3*256*sizeof(juint));
3031 
3032   if (UseNeon) {
3033       cmp(len, 64);
3034       br(Assembler::LT, L_by16);
3035       eor(v16, T16B, v16, v16);
3036 
3037     Label L_fold;
3038 
3039       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3040 
3041       ld1(v0, v1, T2D, post(buf, 32));
3042       ld1r(v4, T2D, post(tmp, 8));
3043       ld1r(v5, T2D, post(tmp, 8));
3044       ld1r(v6, T2D, post(tmp, 8));
3045       ld1r(v7, T2D, post(tmp, 8));
3046       mov(v16, T4S, 0, crc);
3047 
3048       eor(v0, T16B, v0, v16);
3049       sub(len, len, 64);
3050 
3051     BIND(L_fold);
3052       pmull(v22, T8H, v0, v5, T8B);
3053       pmull(v20, T8H, v0, v7, T8B);
3054       pmull(v23, T8H, v0, v4, T8B);
3055       pmull(v21, T8H, v0, v6, T8B);
3056 
3057       pmull2(v18, T8H, v0, v5, T16B);
3058       pmull2(v16, T8H, v0, v7, T16B);
3059       pmull2(v19, T8H, v0, v4, T16B);
3060       pmull2(v17, T8H, v0, v6, T16B);
3061 
3062       uzp1(v24, v20, v22, T8H);
3063       uzp2(v25, v20, v22, T8H);
3064       eor(v20, T16B, v24, v25);
3065 
3066       uzp1(v26, v16, v18, T8H);
3067       uzp2(v27, v16, v18, T8H);
3068       eor(v16, T16B, v26, v27);
3069 
3070       ushll2(v22, T4S, v20, T8H, 8);
3071       ushll(v20, T4S, v20, T4H, 8);
3072 
3073       ushll2(v18, T4S, v16, T8H, 8);
3074       ushll(v16, T4S, v16, T4H, 8);
3075 
3076       eor(v22, T16B, v23, v22);
3077       eor(v18, T16B, v19, v18);
3078       eor(v20, T16B, v21, v20);
3079       eor(v16, T16B, v17, v16);
3080 
3081       uzp1(v17, v16, v20, T2D);
3082       uzp2(v21, v16, v20, T2D);
3083       eor(v17, T16B, v17, v21);
3084 
3085       ushll2(v20, T2D, v17, T4S, 16);
3086       ushll(v16, T2D, v17, T2S, 16);
3087 
3088       eor(v20, T16B, v20, v22);
3089       eor(v16, T16B, v16, v18);
3090 
3091       uzp1(v17, v20, v16, T2D);
3092       uzp2(v21, v20, v16, T2D);
3093       eor(v28, T16B, v17, v21);
3094 
3095       pmull(v22, T8H, v1, v5, T8B);
3096       pmull(v20, T8H, v1, v7, T8B);
3097       pmull(v23, T8H, v1, v4, T8B);
3098       pmull(v21, T8H, v1, v6, T8B);
3099 
3100       pmull2(v18, T8H, v1, v5, T16B);
3101       pmull2(v16, T8H, v1, v7, T16B);
3102       pmull2(v19, T8H, v1, v4, T16B);
3103       pmull2(v17, T8H, v1, v6, T16B);
3104 
3105       ld1(v0, v1, T2D, post(buf, 32));
3106 
3107       uzp1(v24, v20, v22, T8H);
3108       uzp2(v25, v20, v22, T8H);
3109       eor(v20, T16B, v24, v25);
3110 
3111       uzp1(v26, v16, v18, T8H);
3112       uzp2(v27, v16, v18, T8H);
3113       eor(v16, T16B, v26, v27);
3114 
3115       ushll2(v22, T4S, v20, T8H, 8);
3116       ushll(v20, T4S, v20, T4H, 8);
3117 
3118       ushll2(v18, T4S, v16, T8H, 8);
3119       ushll(v16, T4S, v16, T4H, 8);
3120 
3121       eor(v22, T16B, v23, v22);
3122       eor(v18, T16B, v19, v18);
3123       eor(v20, T16B, v21, v20);
3124       eor(v16, T16B, v17, v16);
3125 
3126       uzp1(v17, v16, v20, T2D);
3127       uzp2(v21, v16, v20, T2D);
3128       eor(v16, T16B, v17, v21);
3129 
3130       ushll2(v20, T2D, v16, T4S, 16);
3131       ushll(v16, T2D, v16, T2S, 16);
3132 
3133       eor(v20, T16B, v22, v20);
3134       eor(v16, T16B, v16, v18);
3135 
3136       uzp1(v17, v20, v16, T2D);
3137       uzp2(v21, v20, v16, T2D);
3138       eor(v20, T16B, v17, v21);
3139 
3140       shl(v16, T2D, v28, 1);
3141       shl(v17, T2D, v20, 1);
3142 
3143       eor(v0, T16B, v0, v16);
3144       eor(v1, T16B, v1, v17);
3145 
3146       subs(len, len, 32);
3147       br(Assembler::GE, L_fold);
3148 
3149       mov(crc, 0);
3150       mov(tmp, v0, T1D, 0);
3151       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3152       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3153       mov(tmp, v0, T1D, 1);
3154       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3155       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3156       mov(tmp, v1, T1D, 0);
3157       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3158       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3159       mov(tmp, v1, T1D, 1);
3160       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3161       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3162 
3163       add(len, len, 32);
3164   }
3165 
3166   BIND(L_by16);
3167     subs(len, len, 16);
3168     br(Assembler::GE, L_by16_loop);
3169     adds(len, len, 16-4);
3170     br(Assembler::GE, L_by4_loop);
3171     adds(len, len, 4);
3172     br(Assembler::GT, L_by1_loop);
3173     b(L_exit);
3174 
3175   BIND(L_by4_loop);
3176     ldrw(tmp, Address(post(buf, 4)));
3177     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
3178     subs(len, len, 4);
3179     br(Assembler::GE, L_by4_loop);
3180     adds(len, len, 4);
3181     br(Assembler::LE, L_exit);
3182   BIND(L_by1_loop);
3183     subs(len, len, 1);
3184     ldrb(tmp, Address(post(buf, 1)));
3185     update_byte_crc32(crc, tmp, table0);
3186     br(Assembler::GT, L_by1_loop);
3187     b(L_exit);
3188 
3189     align(CodeEntryAlignment);
3190   BIND(L_by16_loop);
3191     subs(len, len, 16);
3192     ldp(tmp, tmp3, Address(post(buf, 16)));
3193     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
3194     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
3195     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
3196     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
3197     br(Assembler::GE, L_by16_loop);
3198     adds(len, len, 16-4);
3199     br(Assembler::GE, L_by4_loop);
3200     adds(len, len, 4);
3201     br(Assembler::GT, L_by1_loop);
3202   BIND(L_exit);
3203     ornw(crc, zr, crc);
3204 }
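
// In outline, the hardware (UseCRC32) path above is the following length
// dispatch (a hedged sketch; crc32x/crc32w/crc32b are the AArch64 CRC-32
// instructions over 8/4/1 bytes):
//
//   crc = ~crc;
//   while (len >= 64) { 8 x crc32x over one 64-byte block; len -= 64; }
//   while (len >= 4)  { crc = crc32w(crc, load_u32(buf)); buf += 4; len -= 4; }
//   while (len >  0)  { crc = crc32b(crc, *buf++); len -= 1; }
//   crc = ~crc;
//
// The table-driven fallback follows the same skeleton with 16-byte strides,
// using update_word_crc32/update_byte_crc32 in place of the instructions
// (plus the optional Neon folding loop for long buffers).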
3205 
3206 SkipIfEqual::SkipIfEqual(
3207     MacroAssembler* masm, const bool* flag_addr, bool value) {
3208   _masm = masm;
3209   unsigned long offset;
3210   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3211   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3212   _masm->cbzw(rscratch1, _label);
3213 }
3214 
3215 SkipIfEqual::~SkipIfEqual() {
3216   _masm->bind(_label);
3217 }
3218 
3219 void MacroAssembler::addptr(const Address &dst, int32_t src) {
3220   Address adr;
3221   switch(dst.getMode()) {
3222   case Address::base_plus_offset:
3223     // This is the expected mode, although we allow all the other
3224     // forms below.
3225     adr = form_address(rscratch2, dst.base(), dst.offset(), LogBytesPerWord);
3226     break;
3227   default:
3228     lea(rscratch2, dst);
3229     adr = Address(rscratch2);
3230     break;
3231   }
3232   ldr(rscratch1, adr);
3233   add(rscratch1, rscratch1, src);
3234   str(rscratch1, adr);
3235 }
3236 
3237 void MacroAssembler::cmpptr(Register src1, Address src2) {
3238   unsigned long offset;
3239   adrp(rscratch1, src2, offset);
3240   ldr(rscratch1, Address(rscratch1, offset));
3241   cmp(src1, rscratch1);
3242 }
3243 
3244 void MacroAssembler::store_check(Register obj) {
3245   // Does a store check for the oop in register obj. The content of
3246   // register obj is destroyed afterwards.
3247   store_check_part_1(obj);
3248   store_check_part_2(obj);
3249 }
3250 
3251 void MacroAssembler::cmpoops(Register src1, Register src2) {
3252   cmp(src1, src2);
3253   oopDesc::bs()->asm_acmp_barrier(this, src1, src2);
3254 }
3255 
3256 void MacroAssembler::store_check(Register obj, Address dst) {
3257   store_check(obj);
3258 }
3259 
3260 
3261 // Split the store check operation so that other instructions can be scheduled in between.
3262 void MacroAssembler::store_check_part_1(Register obj) {
3263   BarrierSet* bs = Universe::heap()->barrier_set();
3264   assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
3265   lsr(obj, obj, CardTableModRefBS::card_shift);
3266 }
3267 
3268 void MacroAssembler::store_check_part_2(Register obj) {
3269   BarrierSet* bs = Universe::heap()->barrier_set();
3270   assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
3271   CardTableModRefBS* ct = (CardTableModRefBS*)bs;
3272   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
3273 
3274   // The calculation for byte_map_base is as follows:
3275   // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
3276   // So this essentially converts an address to a displacement and
3277   // it will never need to be relocated.
3278 
3279   // FIXME: It's not likely that disp will fit into an offset so we
3280   // don't bother to check, but it could save an instruction.
3281   intptr_t disp = (intptr_t) ct->byte_map_base;
3282   load_byte_map_base(rscratch1);
3283   strb(zr, Address(obj, rscratch1));
3284 }
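
// Taken together, the two halves emit the classic card-table store check
// (a hedged sketch; dirty_card_val() is 0, hence the strb of zr above):
//
//   card_table[uintptr_t(obj) >> card_shift] = dirty_card_val();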
3285 
3286 void MacroAssembler::load_klass(Register dst, Register src) {
3287   if (UseCompressedClassPointers) {
3288     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3289     decode_klass_not_null(dst);
3290   } else {
3291     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3292   }
3293 }
3294 
3295 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3296   if (UseCompressedClassPointers) {
3297     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3298     if (Universe::narrow_klass_base() == NULL) {
3299       cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
3300       return;
3301     } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3302                && Universe::narrow_klass_shift() == 0) {
3303       // Only the bottom 32 bits matter
3304       cmpw(trial_klass, tmp);
3305       return;
3306     }
3307     decode_klass_not_null(tmp);
3308   } else {
3309     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3310   }
3311   cmp(trial_klass, tmp);
3312 }
3313 
3314 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3315   load_klass(dst, src);
3316   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3317 }
3318 
3319 void MacroAssembler::store_klass(Register dst, Register src) {
3320   // FIXME: Should this be a store release?  Concurrent GCs assume the
3321   // klass length is valid if the klass field is not null.
3322   if (UseCompressedClassPointers) {
3323     encode_klass_not_null(src);
3324     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3325   } else {
3326     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3327   }
3328 }
3329 
3330 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3331   if (UseCompressedClassPointers) {
3332     // Store to klass gap in destination
3333     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3334   }
3335 }
3336 
3337 // Algorithm must match oop.inline.hpp encode_heap_oop.
3338 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3339 #ifdef ASSERT
3340   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3341 #endif
3342   verify_oop(s, "broken oop in encode_heap_oop");
3343   if (Universe::narrow_oop_base() == NULL) {
3344     if (Universe::narrow_oop_shift() != 0) {
3345       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3346       lsr(d, s, LogMinObjAlignmentInBytes);
3347     } else {
3348       mov(d, s);
3349     }
3350   } else {
3351     subs(d, s, rheapbase);
3352     csel(d, d, zr, Assembler::HS);
3353     lsr(d, d, LogMinObjAlignmentInBytes);
3354 
3355     /*  Old algorithm: is this any worse?
3356     Label nonnull;
3357     cbnz(r, nonnull);
3358     sub(r, r, rheapbase);
3359     bind(nonnull);
3360     lsr(r, r, LogMinObjAlignmentInBytes);
3361     */
3362   }
3363 }
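
// Reference model for the encoding above (a hedged sketch of the standard
// compressed-oops scheme; the subs/csel pair maps NULL, and anything below
// the heap base, to 0):
//
//   narrowOop encode(oop o) {
//     if (base == NULL) return (narrowOop)(uintptr_t(o) >> shift);
//     return uintptr_t(o) < uintptr_t(base)
//            ? 0 : (narrowOop)((uintptr_t(o) - uintptr_t(base)) >> shift);
//   }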
3364 
3365 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3366 #ifdef ASSERT
3367   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3368   if (CheckCompressedOops) {
3369     Label ok;
3370     cbnz(r, ok);
3371     stop("null oop passed to encode_heap_oop_not_null");
3372     bind(ok);
3373   }
3374 #endif
3375   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3376   if (Universe::narrow_oop_base() != NULL) {
3377     sub(r, r, rheapbase);
3378   }
3379   if (Universe::narrow_oop_shift() != 0) {
3380     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3381     lsr(r, r, LogMinObjAlignmentInBytes);
3382   }
3383 }
3384 
3385 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3386 #ifdef ASSERT
3387   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3388   if (CheckCompressedOops) {
3389     Label ok;
3390     cbnz(src, ok);
3391     stop("null oop passed to encode_heap_oop_not_null2");
3392     bind(ok);
3393   }
3394 #endif
3395   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3396 
3397   Register data = src;
3398   if (Universe::narrow_oop_base() != NULL) {
3399     sub(dst, src, rheapbase);
3400     data = dst;
3401   }
3402   if (Universe::narrow_oop_shift() != 0) {
3403     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3404     lsr(dst, data, LogMinObjAlignmentInBytes);
3405     data = dst;
3406   }
3407   if (data == src)
3408     mov(dst, src);
3409 }
3410 
3411 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3412 #ifdef ASSERT
3413   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3414 #endif
3415   if (Universe::narrow_oop_base() == NULL) {
3416     if (Universe::narrow_oop_shift() != 0 || d != s) {
3417       lsl(d, s, Universe::narrow_oop_shift());
3418     }
3419   } else {
3420     Label done;
3421     if (d != s)
3422       mov(d, s);
3423     cbz(s, done);
3424     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3425     bind(done);
3426   }
3427   verify_oop(d, "broken oop in decode_heap_oop");
3428 }
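
// Reference model for the decoding above (a hedged sketch; 0 decodes back
// to NULL when a heap base is in use):
//
//   oop decode(narrowOop n) {
//     if (base == NULL) return (oop)(uintptr_t(n) << shift);
//     return n == 0 ? NULL : (oop)(uintptr_t(base) + (uintptr_t(n) << shift));
//   }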
3429 
3430 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3431   assert (UseCompressedOops, "should only be used for compressed headers");
3432   assert (Universe::heap() != NULL, "java heap should be initialized");
3433   // Cannot assert, unverified entry point counts instructions (see .ad file)
3434   // vtableStubs also counts instructions in pd_code_size_limit.
3435   // Also do not verify_oop as this is called by verify_oop.
3436   if (Universe::narrow_oop_shift() != 0) {
3437     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3438     if (Universe::narrow_oop_base() != NULL) {
3439       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3440     } else {
3441       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3442     }
3443   } else {
3444     assert (Universe::narrow_oop_base() == NULL, "sanity");
3445   }
3446 }
3447 
3448 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3449   assert (UseCompressedOops, "should only be used for compressed headers");
3450   assert (Universe::heap() != NULL, "java heap should be initialized");
3451   // Cannot assert, unverified entry point counts instructions (see .ad file)
3452   // vtableStubs also counts instructions in pd_code_size_limit.
3453   // Also do not verify_oop as this is called by verify_oop.
3454   if (Universe::narrow_oop_shift() != 0) {
3455     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3456     if (Universe::narrow_oop_base() != NULL) {
3457       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3458     } else {
3459       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3460     }
3461   } else {
3462     assert (Universe::narrow_oop_base() == NULL, "sanity");
3463     if (dst != src) {
3464       mov(dst, src);
3465     }
3466   }
3467 }
3468 
3469 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3470   if (Universe::narrow_klass_base() == NULL) {
3471     if (Universe::narrow_klass_shift() != 0) {
3472       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3473       lsr(dst, src, LogKlassAlignmentInBytes);
3474     } else {
3475       if (dst != src) mov(dst, src);
3476     }
3477     return;
3478   }
3479 
3480   if (use_XOR_for_compressed_class_base) {
3481     if (Universe::narrow_klass_shift() != 0) {
3482       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3483       lsr(dst, dst, LogKlassAlignmentInBytes);
3484     } else {
3485       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3486     }
3487     return;
3488   }
3489 
3490   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3491       && Universe::narrow_klass_shift() == 0) {
3492     movw(dst, src);
3493     return;
3494   }
3495 
3496 #ifdef ASSERT
3497   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3498 #endif
3499 
3500   Register rbase = dst;
3501   if (dst == src) rbase = rheapbase;
3502   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3503   sub(dst, src, rbase);
3504   if (Universe::narrow_klass_shift() != 0) {
3505     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3506     lsr(dst, dst, LogKlassAlignmentInBytes);
3507   }
3508   if (dst == src) reinit_heapbase();
3509 }
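
// Why the XOR path above is sound (a hedged note): eor can stand in for the
// subtract whenever the base shares no set bits with any in-range klass
// offset, i.e. narrow_klass_base is 2^k-aligned and the whole class space
// fits in [base, base + 2^k). Then klass == base | offset, hence:
//
//   (klass ^ base) == (klass - base) == offset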
3510 
3511 void MacroAssembler::encode_klass_not_null(Register r) {
3512   encode_klass_not_null(r, r);
3513 }
3514 
3515 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3516   Register rbase = dst;
3517   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3518 
3519   if (Universe::narrow_klass_base() == NULL) {
3520     if (Universe::narrow_klass_shift() != 0) {
3521       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3522       lsl(dst, src, LogKlassAlignmentInBytes);
3523     } else {
3524       if (dst != src) mov(dst, src);
3525     }
3526     return;
3527   }
3528 
3529   if (use_XOR_for_compressed_class_base) {
3530     if (Universe::narrow_klass_shift() != 0) {
3531       lsl(dst, src, LogKlassAlignmentInBytes);
3532       eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
3533     } else {
3534       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3535     }
3536     return;
3537   }
3538 
3539   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3540       && Universe::narrow_klass_shift() == 0) {
3541     if (dst != src)
3542       movw(dst, src);
3543     movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
3544     return;
3545   }
3546 
3547   // Cannot assert, unverified entry point counts instructions (see .ad file)
3548   // vtableStubs also counts instructions in pd_code_size_limit.
3549   // Also do not verify_oop as this is called by verify_oop.
3550   if (dst == src) rbase = rheapbase;
3551   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3552   if (Universe::narrow_klass_shift() != 0) {
3553     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3554     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
3555   } else {
3556     add(dst, rbase, src);
3557   }
3558   if (dst == src) reinit_heapbase();
3559 }
3560 
3561 void  MacroAssembler::decode_klass_not_null(Register r) {
3562   decode_klass_not_null(r, r);
3563 }
3564 
3565 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3566   assert (UseCompressedOops, "should only be used for compressed oops");
3567   assert (Universe::heap() != NULL, "java heap should be initialized");
3568   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3569 
3570   int oop_index = oop_recorder()->find_index(obj);
3571   assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3572 
3573   InstructionMark im(this);
3574   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3575   code_section()->relocate(inst_mark(), rspec);
3576   movz(dst, 0xDEAD, 16);
3577   movk(dst, 0xBEEF);
3578 }
3579 
3580 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3581   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3582   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3583   int index = oop_recorder()->find_index(k);
3584   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
3585 
3586   InstructionMark im(this);
3587   RelocationHolder rspec = metadata_Relocation::spec(index);
3588   code_section()->relocate(inst_mark(), rspec);
3589   narrowKlass nk = Klass::encode_klass(k);
3590   movz(dst, (nk >> 16), 16);
3591   movk(dst, nk & 0xffff);
3592 }
3593 
3594 void MacroAssembler::load_heap_oop(Register dst, Address src)
3595 {
3596   if (UseCompressedOops) {
3597     ldrw(dst, src);
3598     decode_heap_oop(dst);
3599   } else {
3600     ldr(dst, src);
3601   }
3602 }
3603 
3604 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src)
3605 {
3606   if (UseCompressedOops) {
3607     ldrw(dst, src);
3608     decode_heap_oop_not_null(dst);
3609   } else {
3610     ldr(dst, src);
3611   }
3612 }
3613 
3614 void MacroAssembler::store_heap_oop(Address dst, Register src) {
3615   if (UseCompressedOops) {
3616     assert(!dst.uses(src), "not enough registers");
3617     encode_heap_oop(src);
3618     strw(src, dst);
3619   } else
3620     str(src, dst);
3621 }
3622 
3623 // Used for storing NULLs.
3624 void MacroAssembler::store_heap_oop_null(Address dst) {
3625   if (UseCompressedOops) {
3626     strw(zr, dst);
3627   } else
3628     str(zr, dst);
3629 }
3630 
3631 #if INCLUDE_ALL_GCS
3632 void MacroAssembler::g1_write_barrier_pre(Register obj,
3633                                           Register pre_val,
3634                                           Register thread,
3635                                           Register tmp,
3636                                           bool tosca_live,
3637                                           bool expand_call) {
3638   // If expand_call is true then we expand the call_VM_leaf macro
3639   // directly to skip generating the check by
3640   // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
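
  // In outline, the emitted pre-barrier is (a hedged sketch of G1's SATB
  // barrier; the queue fields match the Address declarations below):
  //
  //   if (thread->satb_mark_queue.active) {
  //     if (obj != noreg) pre_val = *obj;
  //     if (pre_val != NULL) {
  //       if (index != 0) { index -= wordSize; buffer[index] = pre_val; }
  //       else            call g1_wb_pre(pre_val, thread);
  //     }
  //   }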
3641 
3642 #ifdef _LP64
3643   assert(thread == rthread, "must be");
3644 #endif // _LP64
3645 
3646   Label done;
3647   Label runtime;
3648 
3649   assert(pre_val != noreg, "check this code");
3650 
3651   if (obj != noreg)
3652     assert_different_registers(obj, pre_val, tmp);
3653 
3654   Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
3655                                        PtrQueue::byte_offset_of_active()));
3656   Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
3657                                        PtrQueue::byte_offset_of_index()));
3658   Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
3659                                        PtrQueue::byte_offset_of_buf()));
3660 
3661 
3662   // Is marking active?
3663   if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
3664     ldrw(tmp, in_progress);
3665   } else {
3666     assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
3667     ldrb(tmp, in_progress);
3668   }
3669   cbzw(tmp, done);
3670 
3671   // Do we need to load the previous value?
3672   if (obj != noreg) {
3673     load_heap_oop(pre_val, Address(obj, 0));
3674   }
3675 
3676   // Is the previous value null?
3677   cbz(pre_val, done);
3678 
3679   // Can we store original value in the thread's buffer?
3680   // Is index == 0?
3681   // (The index field is typed as size_t.)
3682 
3683   ldr(tmp, index);                      // tmp := *index_adr
3684   cbz(tmp, runtime);                    // tmp == 0?
3685                                         // If yes, goto runtime
3686 
3687   sub(tmp, tmp, wordSize);              // tmp := tmp - wordSize
3688   str(tmp, index);                      // *index_adr := tmp
3689   ldr(rscratch1, buffer);
3690   add(tmp, tmp, rscratch1);             // tmp := tmp + *buffer_adr
3691 
3692   // Record the previous value
3693   str(pre_val, Address(tmp, 0));
3694   b(done);
3695 
3696   bind(runtime);
3697   // save the live input values
3698   push(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
3699 
3700   // Calling the runtime using the regular call_VM_leaf mechanism generates
3701   // code (generated by InterpreterMacroAssembler::call_VM_leaf_base)
3702   // that checks that *(rfp+frame::interpreter_frame_last_sp) == NULL.
3703   //
3704   // If we are generating the pre-barrier without a frame (e.g. in the
3705   // intrinsified Reference.get() routine) then rfp might be pointing to
3706   // the caller frame and so this check will most likely fail at runtime.
3707   //
3708   // Expanding the call directly bypasses the generation of the check.
3709   // So when we do not have a full interpreter frame on the stack,
3710   // expand_call should be passed true.
3711 
3712   if (expand_call) {
3713     LP64_ONLY( assert(pre_val != c_rarg1, "smashed arg"); )
3714     pass_arg1(this, thread);
3715     pass_arg0(this, pre_val);
3716     MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
3717   } else {
3718     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
3719   }
3720 
3721   pop(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
3722 
3723   bind(done);
3724 }
3725 
3726 void MacroAssembler::g1_write_barrier_post(Register store_addr,
3727                                            Register new_val,
3728                                            Register thread,
3729                                            Register tmp,
3730                                            Register tmp2) {
3731 #ifdef _LP64
3732   assert(thread == rthread, "must be");
3733 #endif // _LP64
3734 
3735   if (UseShenandoahGC) {
3736     // No need for this in Shenandoah.
3737     return;
3738   }
3739 
3740   assert(UseG1GC, "expect G1 GC");
3741 
3742   Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
3743                                        PtrQueue::byte_offset_of_index()));
3744   Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
3745                                        PtrQueue::byte_offset_of_buf()));
3746 
3747   BarrierSet* bs = Universe::heap()->barrier_set();
3748   CardTableModRefBS* ct = (CardTableModRefBS*)bs;
3749   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
3750 
3751   Label done;
3752   Label runtime;
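
  // In outline (a hedged sketch of G1's post-barrier):
  //
  //   if (((store_addr ^ new_val) >> LogOfHRGrainBytes) == 0) goto done;
  //   if (new_val == NULL) goto done;
  //   card = &card_table[store_addr >> card_shift];
  //   if (*card == g1_young_card_val()) goto done;
  //   StoreLoad barrier;
  //   if (*card == dirty_card_val()) goto done;
  //   *card = dirty_card_val();        // 0
  //   enqueue card_addr, or call g1_wb_post(card_addr, thread);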
3753 
3754   // Does store cross heap regions?
3755 
3756   eor(tmp, store_addr, new_val);
3757   lsr(tmp, tmp, HeapRegion::LogOfHRGrainBytes);
3758   cbz(tmp, done);
3759 
3760   // crosses regions, storing NULL?
3761 
3762   cbz(new_val, done);
3763 
3764   // storing region crossing non-NULL, is card already dirty?
3765 
3766   ExternalAddress cardtable((address) ct->byte_map_base);
3767   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
3768   const Register card_addr = tmp;
3769 
3770   lsr(card_addr, store_addr, CardTableModRefBS::card_shift);
3771 
3772   // get the address of the card
3773   load_byte_map_base(tmp2);
3774   add(card_addr, card_addr, tmp2);
3775   ldrb(tmp2, Address(card_addr));
3776   cmpw(tmp2, (int)G1SATBCardTableModRefBS::g1_young_card_val());
3777   br(Assembler::EQ, done);
3778 
3779   assert((int)CardTableModRefBS::dirty_card_val() == 0, "must be 0");
3780 
3781   membar(Assembler::StoreLoad);
3782 
3783   ldrb(tmp2, Address(card_addr));
3784   cbzw(tmp2, done);
3785 
3786   // storing a region crossing, non-NULL oop, card is clean.
3787   // dirty card and log.
3788 
3789   strb(zr, Address(card_addr));
3790 
3791   ldr(rscratch1, queue_index);
3792   cbz(rscratch1, runtime);
3793   sub(rscratch1, rscratch1, wordSize);
3794   str(rscratch1, queue_index);
3795 
3796   ldr(tmp2, buffer);
3797   str(card_addr, Address(tmp2, rscratch1));
3798   b(done);
3799 
3800   bind(runtime);
3801   // save the live input values
3802   push(store_addr->bit(true) | new_val->bit(true), sp);
3803   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
3804   pop(store_addr->bit(true) | new_val->bit(true), sp);
3805 
3806   bind(done);
3807 }
3808 
3809 void MacroAssembler::shenandoah_write_barrier(Register dst) {
3810   assert(UseShenandoahGC && ShenandoahWriteBarrier, "Should be enabled");
3811   assert(dst != rscratch1, "need rscratch1");
3812   assert(dst != rscratch2, "need rscratch2");
3813 
3814   Label done;
3815 
3816   Address gc_state(rthread, in_bytes(JavaThread::gc_state_offset()));
3817   ldrb(rscratch1, gc_state);
3818 
3819   // Check for heap stability
3820   mov(rscratch2, ShenandoahHeap::HAS_FORWARDED | ShenandoahHeap::EVACUATION);
3821   tst(rscratch1, rscratch2);
3822   br(Assembler::EQ, done);
3823 
3824   // Heap is unstable, need to perform the read-barrier even if WB is inactive
3825   if (ShenandoahWriteBarrierRB) {
3826     ldr(dst, Address(dst, BrooksPointer::byte_offset()));
3827   }
3828 
3829   // Check for evacuation-in-progress and jump to WB slow-path if needed
3830   mov(rscratch2, ShenandoahHeap::EVACUATION);
3831   tst(rscratch1, rscratch2);
3832   br(Assembler::EQ, done);
3833 
3834   RegSet to_save = RegSet::of(r0);
3835   if (dst != r0) {
3836     push(to_save, sp);
3837     mov(r0, dst);
3838   }
3839 
3840   assert(StubRoutines::aarch64::shenandoah_wb() != NULL, "need write barrier stub");
3841   far_call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::aarch64::shenandoah_wb())));
3842 
3843   if (dst != r0) {
3844     mov(dst, r0);
3845     pop(to_save, sp);
3846   }
3847   block_comment("} Shenandoah write barrier");
3848 
3849   bind(done);
3850 }
3851 
3852 #endif // INCLUDE_ALL_GCS
3853 
3854 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
3855   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
3856   int index = oop_recorder()->allocate_metadata_index(obj);
3857   RelocationHolder rspec = metadata_Relocation::spec(index);
3858   return Address((address)obj, rspec);
3859 }
3860 
3861 // Move an oop into a register.  immediate is true if we want
3862 // immediate instructions, i.e. we are not going to patch this
3863 // instruction while the code is being executed by another thread.  In
3864 // that case we can use move immediates rather than the constant pool.
3865 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
3866   int oop_index;
3867   if (obj == NULL) {
3868     oop_index = oop_recorder()->allocate_oop_index(obj);
3869   } else {
3870     oop_index = oop_recorder()->find_index(obj);
3871     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3872   }
3873   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3874   if (! immediate) {
3875     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
3876     ldr_constant(dst, Address(dummy, rspec));
3877   } else
3878     mov(dst, Address((address)obj, rspec));
3879 }
3880 
3881 // Move a metadata address into a register.
3882 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
3883   int oop_index;
3884   if (obj == NULL) {
3885     oop_index = oop_recorder()->allocate_metadata_index(obj);
3886   } else {
3887     oop_index = oop_recorder()->find_index(obj);
3888   }
3889   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
3890   mov(dst, Address((address)obj, rspec));
3891 }
3892 
3893 Address MacroAssembler::constant_oop_address(jobject obj) {
3894   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
3895   assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
3896   int oop_index = oop_recorder()->find_index(obj);
3897   return Address((address)obj, oop_Relocation::spec(oop_index));
3898 }
3899 
3900 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3901 void MacroAssembler::tlab_allocate(Register obj,
3902                                    Register var_size_in_bytes,
3903                                    int con_size_in_bytes,
3904                                    Register t1,
3905                                    Register t2,
3906                                    Label& slow_case) {
3907   assert_different_registers(obj, t2);
3908   assert_different_registers(obj, var_size_in_bytes);
3909   Register end = t2;
3910 
3911   // verify_tlab();
3912 
3913   int oop_extra_words = Universe::heap()->oop_extra_words();
3914 
3915   ldr(obj, Address(rthread, JavaThread::tlab_top_offset()));
3916   if (var_size_in_bytes == noreg) {
3917     lea(end, Address(obj, con_size_in_bytes + oop_extra_words * HeapWordSize));
3918   } else {
3919     if (oop_extra_words > 0) {
3920       add(var_size_in_bytes, var_size_in_bytes, oop_extra_words * HeapWordSize);
3921     }
3922     lea(end, Address(obj, var_size_in_bytes));
3923   }
3924   ldr(rscratch1, Address(rthread, JavaThread::tlab_end_offset()));
3925   cmp(end, rscratch1);
3926   br(Assembler::HI, slow_case);
3927 
3928   // update the tlab top pointer
3929   str(end, Address(rthread, JavaThread::tlab_top_offset()));
3930 
3931   Universe::heap()->compile_prepare_oop(this, obj);
3932 
3933   // recover var_size_in_bytes if necessary
3934   if (var_size_in_bytes == end) {
3935     sub(var_size_in_bytes, var_size_in_bytes, obj);
3936   }
3937   // verify_tlab();
3938 }
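
// In outline, TLAB allocation is bump-the-pointer (a hedged sketch;
// oop_extra_words leaves room for, e.g., the Shenandoah forwarding word):
//
//   obj = thread->tlab_top;
//   end = obj + size + oop_extra_words * HeapWordSize;
//   if (end > thread->tlab_end) goto slow_case;
//   thread->tlab_top = end;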
3939 
3940 // Preserves r19 and r3.
3941 Register MacroAssembler::tlab_refill(Label& retry,
3942                                      Label& try_eden,
3943                                      Label& slow_case) {
3944   Register top = r0;
3945   Register t1  = r2;
3946   Register t2  = r4;
3947   assert_different_registers(top, rthread, t1, t2, /* preserve: */ r19, r3);
3948   Label do_refill, discard_tlab;
3949 
3950   if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
3951     // No allocation in the shared eden.
3952     b(slow_case);
3953   }
3954 
3955   ldr(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
3956   ldr(t1,  Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
3957 
3958   // calculate amount of free space
3959   sub(t1, t1, top);
3960   lsr(t1, t1, LogHeapWordSize);
3961 
3962   // Retain tlab and allocate object in shared space if
3963   // the amount free in the tlab is too large to discard.
3964 
3965   ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
3966   cmp(t1, rscratch1);
3967   br(Assembler::LE, discard_tlab);
3968 
3969   // Retain
3970   // ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
3971   mov(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
3972   add(rscratch1, rscratch1, t2);
3973   str(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
3974 
3975   if (TLABStats) {
3976     // increment number of slow_allocations
3977     addmw(Address(rthread, in_bytes(JavaThread::tlab_slow_allocations_offset())),
3978          1, rscratch1);
3979   }
3980   b(try_eden);
3981 
3982   bind(discard_tlab);
3983   if (TLABStats) {
3984     // increment number of refills
3985     addmw(Address(rthread, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1,
3986          rscratch1);
3987     // accumulate wastage -- t1 is amount free in tlab
3988     addmw(Address(rthread, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1,
3989          rscratch1);
3990   }
3991 
3992   // if tlab is currently allocated (top or end != null) then
3993   // fill [top, end + alignment_reserve) with array object
3994   cbz(top, do_refill);
3995 
3996   // set up the mark word
3997   mov(rscratch1, (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
3998   str(rscratch1, Address(top, oopDesc::mark_offset_in_bytes()));
3999   // set the length to the remaining space
4000   sub(t1, t1, typeArrayOopDesc::header_size(T_INT));
4001   add(t1, t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
4002   lsl(t1, t1, log2_intptr(HeapWordSize/sizeof(jint)));
4003   strw(t1, Address(top, arrayOopDesc::length_offset_in_bytes()));
4004   // set klass to intArrayKlass
4005   {
4006     unsigned long offset;
4007     // dubious reloc: why not an oop reloc?
4008     adrp(rscratch1, ExternalAddress((address)Universe::intArrayKlassObj_addr()),
4009          offset);
4010     ldr(t1, Address(rscratch1, offset));
4011   }
4012   // Store klass last: concurrent GCs assume the klass length is valid
4013   // if the klass field is not null.
4014   store_klass(top, t1);
4015 
4016   mov(t1, top);
4017   ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4018   sub(t1, t1, rscratch1);
4019   incr_allocated_bytes(rthread, t1, 0, rscratch1);
4020 
4021   // refill the tlab with an eden allocation
4022   bind(do_refill);
4023   ldr(t1, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
4024   lsl(t1, t1, LogHeapWordSize);
4025   // allocate new tlab, address returned in top
4026   eden_allocate(top, t1, 0, t2, slow_case);
4027 
4028   // Check that t1 was preserved in eden_allocate.
4029 #ifdef ASSERT
4030   if (UseTLAB) {
4031     Label ok;
4032     Register tsize = r4;
4033     assert_different_registers(tsize, rthread, t1);
4034     str(tsize, Address(pre(sp, -16)));
4035     ldr(tsize, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
4036     lsl(tsize, tsize, LogHeapWordSize);
4037     cmp(t1, tsize);
4038     br(Assembler::EQ, ok);
4039     STOP("assert(t1 != tlab size)");
4040     should_not_reach_here();
4041 
4042     bind(ok);
4043     ldr(tsize, Address(post(sp, 16)));
4044   }
4045 #endif
4046   str(top, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4047   str(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4048   add(top, top, t1);
4049   sub(top, top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
4050   str(top, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4051   verify_tlab();
4052   b(retry);
4053 
4054   return rthread; // for use by caller
4055 }
4056 
4057 // Defines obj, preserves var_size_in_bytes
4058 void MacroAssembler::eden_allocate(Register obj,
4059                                    Register var_size_in_bytes,
4060                                    int con_size_in_bytes,
4061                                    Register t1,
4062                                    Label& slow_case) {
4063   assert_different_registers(obj, var_size_in_bytes, t1);
4064   if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
4065     b(slow_case);
4066   } else {
4067     Register end = t1;
4068     Register heap_end = rscratch2;
4069     Label retry;
4070     bind(retry);
4071     {
4072       unsigned long offset;
4073       adrp(rscratch1, ExternalAddress((address) Universe::heap()->end_addr()), offset);
4074       ldr(heap_end, Address(rscratch1, offset));
4075     }
4076 
4077     ExternalAddress heap_top((address) Universe::heap()->top_addr());
4078 
4079     // Get the current top of the heap
4080     {
4081       unsigned long offset;
4082       adrp(rscratch1, heap_top, offset);
4083       // Use add() here after ADRP, rather than lea().
4084       // lea() does not generate anything if its offset is zero.
4085       // However, relocs expect to find either an ADD or a load/store
4086       // insn after an ADRP.  add() always generates an ADD insn, even
4087       // for add(Rn, Rn, 0).
4088       add(rscratch1, rscratch1, offset);
4089       ldaxr(obj, rscratch1);
4090     }
4091 
4092     // Adjust it by the size of our new object
4093     if (var_size_in_bytes == noreg) {
4094       lea(end, Address(obj, con_size_in_bytes));
4095     } else {
4096       lea(end, Address(obj, var_size_in_bytes));
4097     }
4098 
4099     // if end < obj then we wrapped around high memory
4100     cmp(end, obj);
4101     br(Assembler::LO, slow_case);
4102 
4103     cmp(end, heap_end);
4104     br(Assembler::HI, slow_case);
4105 
4106     // If heap_top hasn't been changed by some other thread, update it.
4107     stlxr(rscratch2, end, rscratch1);
4108     cbnzw(rscratch2, retry);
4109   }
4110 }
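
// In outline, the shared-eden path above is a load-exclusive/store-exclusive
// retry loop (a hedged sketch; ldaxr/stlxr are the AArch64 exclusives):
//
//   retry:
//     heap_end = *Universe::heap()->end_addr();
//     obj = load_acquire_exclusive(heap_top);            // ldaxr
//     end = obj + size;
//     if (end < obj || end > heap_end) goto slow_case;   // wrapped or full
//     if (!store_release_exclusive(heap_top, end))       // stlxr
//       goto retry;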
4111 
4112 void MacroAssembler::verify_tlab() {
4113 #ifdef ASSERT
4114   if (UseTLAB && VerifyOops) {
4115     Label next, ok;
4116 
4117     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
4118 
4119     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4120     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
4121     cmp(rscratch2, rscratch1);
4122     br(Assembler::HS, next);
4123     STOP("assert(top >= start)");
4124     should_not_reach_here();
4125 
4126     bind(next);
4127     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
4128     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
4129     cmp(rscratch2, rscratch1);
4130     br(Assembler::HS, ok);
4131     STOP("assert(top <= end)");
4132     should_not_reach_here();
4133 
4134     bind(ok);
4135     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
4136   }
4137 #endif
4138 }
4139 
4140 // Writes to successive stack pages until the given offset is reached,
4141 // to check for stack overflow plus shadow pages.  This clobbers tmp.
4142 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
4143   assert_different_registers(tmp, size, rscratch1);
4144   mov(tmp, sp);
4145   // Bang stack for total size given plus shadow page size.
4146   // Bang one page at a time because large size can bang beyond yellow and
4147   // red zones.
4148   Label loop;
4149   mov(rscratch1, os::vm_page_size());
4150   bind(loop);
4151   lea(tmp, Address(tmp, -os::vm_page_size()));
4152   subsw(size, size, rscratch1);
4153   str(size, Address(tmp));
4154   br(Assembler::GT, loop);
4155 
4156   // Bang down shadow pages too.
4157   // The -1 because we already subtracted 1 page.
4158   for (int i = 0; i < StackShadowPages-1; i++) {
4159     // This could be any sized move, but since it can serve as a
4160     // debugging crumb, the bigger the better.
4161     lea(tmp, Address(tmp, -os::vm_page_size()));
4162     str(size, Address(tmp));
4163   }
4164 }
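
// In outline (a hedged sketch): one store per page, walking down from sp,
// first over `size` bytes and then over the shadow area, so any overflow
// traps here eagerly rather than in code that cannot recover:
//
//   for (p = sp - page; size > 0; p -= page, size -= page) *p = size;
//   for (i = 0; i < StackShadowPages - 1; i++) { p -= page; *p = size; }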
4165 
4166 
4167 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
4168   unsigned long off;
4169   adrp(r, Address(page, rtype), off);
4170   InstructionMark im(this);
4171   code_section()->relocate(inst_mark(), rtype);
4172   ldrw(zr, Address(r, off));
4173   return inst_mark();
4174 }
4175 
4176 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
4177   InstructionMark im(this);
4178   code_section()->relocate(inst_mark(), rtype);
4179   ldrw(zr, Address(r, 0));
4180   return inst_mark();
4181 }
4182 
4183 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
4184   relocInfo::relocType rtype = dest.rspec().reloc()->type();
4185   unsigned long low_page = (unsigned long)CodeCache::low_bound() >> 12;
4186   unsigned long high_page = (unsigned long)(CodeCache::high_bound()-1) >> 12;
4187   unsigned long dest_page = (unsigned long)dest.target() >> 12;
4188   long offset_low = dest_page - low_page;
4189   long offset_high = dest_page - high_page;
4190 
4191   assert(is_valid_AArch64_address(dest.target()), "bad address");
4192   assert(dest.getMode() == Address::literal, "ADRP must be applied to a literal address");
4193 
4194   InstructionMark im(this);
4195   code_section()->relocate(inst_mark(), dest.rspec());
4196   // 8143067: Ensure that the adrp can reach the dest from anywhere within
4197   // the code cache, so that if it is relocated we know it will still reach.
4198   if (offset_high >= -(1<<20) && offset_low < (1<<20)) {
4199     _adrp(reg1, dest.target());
4200   } else {
4201     unsigned long target = (unsigned long)dest.target();
4202     unsigned long adrp_target
4203       = (target & 0xffffffffUL) | ((unsigned long)pc() & 0xffff00000000UL);
4204 
4205     _adrp(reg1, (address)adrp_target);
4206     movk(reg1, target >> 32, 32);
4207   }
4208   byte_offset = (unsigned long)dest.target() & 0xfff;
4209 }
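
// Reachability note (hedged): ADRP spans +/- 2^20 pages, i.e. +/- 4GB. The
// offset_low/offset_high test above checks the destination page against both
// ends of the code cache, so a plain ADRP stays correct wherever this code
// is later relocated within the cache. Otherwise the low 32 bits are formed
// ADRP-relative and the high 16 bits [47:32] are patched in:
//
//   adrp_target = (target & 0xffffffff) | (pc & 0xffff00000000);
//   _adrp(reg, adrp_target);  movk(reg, target >> 32, 32);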
4210 
4211 void MacroAssembler::load_byte_map_base(Register reg) {
4212   jbyte *byte_map_base =
4213     ((CardTableModRefBS*)(Universe::heap()->barrier_set()))->byte_map_base;
4214 
4215   if (is_valid_AArch64_address((address)byte_map_base)) {
4216     // Strictly speaking the byte_map_base isn't an address at all,
4217     // and it might even be negative.
4218     unsigned long offset;
4219     adrp(reg, ExternalAddress((address)byte_map_base), offset);
4220     // We expect offset to be zero with most collectors.
4221     if (offset != 0) {
4222       add(reg, reg, offset);
4223     }
4224   } else {
4225     mov(reg, (uint64_t)byte_map_base);
4226   }
4227 }
4228 
4229 void MacroAssembler::build_frame(int framesize) {
4230   if (framesize == 0) {
4231     // Is this even possible?
4232     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4233   } else if (framesize < ((1 << 9) + 2 * wordSize)) {
4234     sub(sp, sp, framesize);
4235     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4236   } else {
4237     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
4238     if (framesize < ((1 << 12) + 2 * wordSize))
4239       sub(sp, sp, framesize - 2 * wordSize);
4240     else {
4241       mov(rscratch1, framesize - 2 * wordSize);
4242       sub(sp, sp, rscratch1);
4243     }
4244   }
4245 }
4246 
4247 void MacroAssembler::remove_frame(int framesize) {
4248   if (framesize == 0) {
4249     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4250   } else if (framesize < ((1 << 9) + 2 * wordSize)) {
4251     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
4252     add(sp, sp, framesize);
4253   } else {
4254     if (framesize < ((1 << 12) + 2 * wordSize))
4255       add(sp, sp, framesize - 2 * wordSize);
4256     else {
4257       mov(rscratch1, framesize - 2 * wordSize);
4258       add(sp, sp, rscratch1);
4259     }
4260     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
4261   }
4262 }
4263 
4264 // Search for str1 in str2 and return index or -1
4265 void MacroAssembler::string_indexof(Register str2, Register str1,
4266                                     Register cnt2, Register cnt1,
4267                                     Register tmp1, Register tmp2,
4268                                     Register tmp3, Register tmp4,
4269                                     int icnt1, Register result) {
4270   Label BM, LINEARSEARCH, DONE, NOMATCH, MATCH;
4271 
4272   Register ch1 = rscratch1;
4273   Register ch2 = rscratch2;
4274   Register cnt1tmp = tmp1;
4275   Register cnt2tmp = tmp2;
4276   Register cnt1_neg = cnt1;
4277   Register cnt2_neg = cnt2;
4278   Register result_tmp = tmp4;
4279 
4280   // Note, inline_string_indexOf() generates checks:
4281   // if (substr.count > string.count) return -1;
4282   // if (substr.count == 0) return 0;
4283 
4284 // We have two strings, a source string in str2, cnt2 and a pattern string
4285 // in str1, cnt1. Find the first occurrence of the pattern in the source or return -1.
4286 
4287 // For larger pattern and source we use a simplified Boyer Moore algorithm.
4288 // With a small pattern and source we use linear scan.
4289 
4290   if (icnt1 == -1) {
4291     cmp(cnt1, 256);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
4292     ccmp(cnt1, 8, 0b0000, LO);  // Can't handle skip >= 256 because we use
4293     br(LO, LINEARSEARCH);       // a byte array.
4294     cmp(cnt1, cnt2, LSR, 2);    // Source must be 4 * pattern for BM
4295     br(HS, LINEARSEARCH);
4296   }
4297 
4298 // The Boyer-Moore algorithm is based on the description here:
4299 //
4300 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4301 //
4302 // This describes an algorithm with two shift rules: the 'Bad Character'
4303 // rule and the 'Good Suffix' rule.
4304 //
4305 // These rules are essentially heuristics for how far we can shift the
4306 // pattern along the search string.
4307 //
4308 // The implementation here uses the 'Bad Character' rule only because of the
4309 // complexity of initialisation for the 'Good Suffix' rule.
4310 //
4311 // This is also known as the Boyer-Moore-Horspool algorithm:
4312 //
4313 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4314 //
// #define ASIZE 128
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing: bc[c] = 1 + last index of c in x[0..m-2] */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = 0;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          if (c < ASIZE) bc[c] = i;
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[j + m - 1];
//          if (x[m - 1] == c) {
//             for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//             if (i < 0) return j;
//          }
//          if (c < ASIZE)
//             j = j - bc[c] + m;   // 'Bad Character' shift
//          else
//             j += 1;              // Advance by 1 only if char >= ASIZE
//       }
//       return -1;
//    }
4344 
4345   if (icnt1 == -1) {
4346     BIND(BM);
4347 
4348     Label ZLOOP, BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP;
4349     Label BMADV, BMMATCH, BMCHECKEND;
4350 
4351     Register cnt1end = tmp2;
4352     Register str2end = cnt2;
4353     Register skipch = tmp2;
4354 
    // Restrict ASIZE to 128 to reduce stack space/initialisation.
    // The presence of chars >= ASIZE in the pattern does not affect
    // performance, but we must be careful not to record them in the
    // stack-allocated bc[] array.
    // The presence of chars >= ASIZE in the source string may adversely
    // affect performance, since we can only advance by one char when we
    // encounter one.
4361 
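      // Zero the 128-byte bc[] table on the stack: each stp of a pair of
      // zero registers clears 16 bytes, so the eight pairs below clear it
      // all.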
4362       stp(zr, zr, pre(sp, -128));
4363       for (int i = 1; i < 8; i++)
4364           stp(zr, zr, Address(sp, i*16));
4365 
4366       mov(cnt1tmp, 0);
4367       sub(cnt1end, cnt1, 1);
4368     BIND(BCLOOP);
4369       ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
4370       cmp(ch1, 128);
4371       add(cnt1tmp, cnt1tmp, 1);
4372       br(HS, BCSKIP);
4373       strb(cnt1tmp, Address(sp, ch1));
4374     BIND(BCSKIP);
4375       cmp(cnt1tmp, cnt1end);
4376       br(LT, BCLOOP);
4377 
4378       mov(result_tmp, str2);
4379 
4380       sub(cnt2, cnt2, cnt1);
4381       add(str2end, str2, cnt2, LSL, 1);
4382     BIND(BMLOOPSTR2);
4383       sub(cnt1tmp, cnt1, 1);
4384       ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
4385       ldrh(skipch, Address(str2, cnt1tmp, Address::lsl(1)));
4386       cmp(ch1, skipch);
4387       br(NE, BMSKIP);
4388       subs(cnt1tmp, cnt1tmp, 1);
4389       br(LT, BMMATCH);
4390     BIND(BMLOOPSTR1);
4391       ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
4392       ldrh(ch2, Address(str2, cnt1tmp, Address::lsl(1)));
4393       cmp(ch1, ch2);
4394       br(NE, BMSKIP);
4395       subs(cnt1tmp, cnt1tmp, 1);
4396       br(GE, BMLOOPSTR1);
4397     BIND(BMMATCH);
4398       sub(result_tmp, str2, result_tmp);
4399       lsr(result, result_tmp, 1);
4400       add(sp, sp, 128);
4401       b(DONE);
4402     BIND(BMADV);
4403       add(str2, str2, 2);
4404       b(BMCHECKEND);
4405     BIND(BMSKIP);
4406       cmp(skipch, 128);
4407       br(HS, BMADV);
4408       ldrb(ch2, Address(sp, skipch));
4409       add(str2, str2, cnt1, LSL, 1);
4410       sub(str2, str2, ch2, LSL, 1);
4411     BIND(BMCHECKEND);
4412       cmp(str2, str2end);
4413       br(LE, BMLOOPSTR2);
4414       add(sp, sp, 128);
4415       b(NOMATCH);
4416   }
4417 
4418   BIND(LINEARSEARCH);
4419   {
4420     Label DO1, DO2, DO3;
4421 
4422     Register str2tmp = tmp2;
4423     Register first = tmp3;
4424 
4425     if (icnt1 == -1)
4426     {
4427         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT, LAST_WORD;
4428 
4429         cmp(cnt1, 4);
4430         br(LT, DOSHORT);
4431 
4432         sub(cnt2, cnt2, cnt1);
4433         sub(cnt1, cnt1, 4);
4434         mov(result_tmp, cnt2);
4435 
4436         lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4437         lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4438         sub(cnt1_neg, zr, cnt1, LSL, 1);
4439         sub(cnt2_neg, zr, cnt2, LSL, 1);
4440         ldr(first, Address(str1, cnt1_neg));
4441 
4442       BIND(FIRST_LOOP);
4443         ldr(ch2, Address(str2, cnt2_neg));
4444         cmp(first, ch2);
4445         br(EQ, STR1_LOOP);
4446       BIND(STR2_NEXT);
4447         adds(cnt2_neg, cnt2_neg, 2);
4448         br(LE, FIRST_LOOP);
4449         b(NOMATCH);
4450 
4451       BIND(STR1_LOOP);
4452         adds(cnt1tmp, cnt1_neg, 8);
4453         add(cnt2tmp, cnt2_neg, 8);
4454         br(GE, LAST_WORD);
4455 
4456       BIND(STR1_NEXT);
4457         ldr(ch1, Address(str1, cnt1tmp));
4458         ldr(ch2, Address(str2, cnt2tmp));
4459         cmp(ch1, ch2);
4460         br(NE, STR2_NEXT);
4461         adds(cnt1tmp, cnt1tmp, 8);
4462         add(cnt2tmp, cnt2tmp, 8);
4463         br(LT, STR1_NEXT);
4464 
4465       BIND(LAST_WORD);
4466         ldr(ch1, Address(str1));
4467         sub(str2tmp, str2, cnt1_neg);         // adjust to corresponding
4468         ldr(ch2, Address(str2tmp, cnt2_neg)); // word in str2
4469         cmp(ch1, ch2);
4470         br(NE, STR2_NEXT);
4471         b(MATCH);
4472 
4473       BIND(DOSHORT);
4474         cmp(cnt1, 2);
4475         br(LT, DO1);
4476         br(GT, DO3);
4477     }
4478 
4479     if (icnt1 == 4) {
4480       Label CH1_LOOP;
4481 
4482         ldr(ch1, str1);
4483         sub(cnt2, cnt2, 4);
4484         mov(result_tmp, cnt2);
4485         lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4486         sub(cnt2_neg, zr, cnt2, LSL, 1);
4487 
4488       BIND(CH1_LOOP);
4489         ldr(ch2, Address(str2, cnt2_neg));
4490         cmp(ch1, ch2);
4491         br(EQ, MATCH);
4492         adds(cnt2_neg, cnt2_neg, 2);
4493         br(LE, CH1_LOOP);
4494         b(NOMATCH);
4495     }
4496 
4497     if (icnt1 == -1 || icnt1 == 2) {
4498       Label CH1_LOOP;
4499 
4500       BIND(DO2);
4501         ldrw(ch1, str1);
4502         sub(cnt2, cnt2, 2);
4503         mov(result_tmp, cnt2);
4504         lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4505         sub(cnt2_neg, zr, cnt2, LSL, 1);
4506 
4507       BIND(CH1_LOOP);
4508         ldrw(ch2, Address(str2, cnt2_neg));
4509         cmp(ch1, ch2);
4510         br(EQ, MATCH);
4511         adds(cnt2_neg, cnt2_neg, 2);
4512         br(LE, CH1_LOOP);
4513         b(NOMATCH);
4514     }
4515 
4516     if (icnt1 == -1 || icnt1 == 3) {
4517       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4518 
4519       BIND(DO3);
4520         ldrw(first, str1);
4521         ldrh(ch1, Address(str1, 4));
4522 
4523         sub(cnt2, cnt2, 3);
4524         mov(result_tmp, cnt2);
4525         lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4526         sub(cnt2_neg, zr, cnt2, LSL, 1);
4527 
4528       BIND(FIRST_LOOP);
4529         ldrw(ch2, Address(str2, cnt2_neg));
4530         cmpw(first, ch2);
4531         br(EQ, STR1_LOOP);
4532       BIND(STR2_NEXT);
4533         adds(cnt2_neg, cnt2_neg, 2);
4534         br(LE, FIRST_LOOP);
4535         b(NOMATCH);
4536 
4537       BIND(STR1_LOOP);
4538         add(cnt2tmp, cnt2_neg, 4);
4539         ldrh(ch2, Address(str2, cnt2tmp));
4540         cmp(ch1, ch2);
4541         br(NE, STR2_NEXT);
4542         b(MATCH);
4543     }
4544 
4545     if (icnt1 == -1 || icnt1 == 1) {
4546       Label CH1_LOOP, HAS_ZERO;
4547       Label DO1_SHORT, DO1_LOOP;
4548 
4549       BIND(DO1);
4550         ldrh(ch1, str1);
4551         cmp(cnt2, 4);
4552         br(LT, DO1_SHORT);
4553 
4554         orr(ch1, ch1, ch1, LSL, 16);
4555         orr(ch1, ch1, ch1, LSL, 32);
4556 
4557         sub(cnt2, cnt2, 4);
4558         mov(result_tmp, cnt2);
4559         lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4560         sub(cnt2_neg, zr, cnt2, LSL, 1);
4561 
4562         mov(tmp3, 0x0001000100010001);
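        // SWAR zero-lane test: with ch1 replicated into all four 16-bit
        // lanes, (x - 0x0001000100010001) & ~(x | 0x7fff7fff7fff7fff) is
        // nonzero iff some 16-bit lane of x == 0; below, x = ch1 ^ ch2,
        // so a zero lane means a matching char.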
4563       BIND(CH1_LOOP);
4564         ldr(ch2, Address(str2, cnt2_neg));
4565         eor(ch2, ch1, ch2);
4566         sub(tmp1, ch2, tmp3);
4567         orr(tmp2, ch2, 0x7fff7fff7fff7fff);
4568         bics(tmp1, tmp1, tmp2);
4569         br(NE, HAS_ZERO);
4570         adds(cnt2_neg, cnt2_neg, 8);
4571         br(LT, CH1_LOOP);
4572 
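        // Rerun the loop once from cnt2_neg = 0 so the final (possibly
        // overlapping) word is checked too; the rerun ends with
        // cnt2_neg == 8, so the LT branch below is not taken a second
        // time.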
4573         cmp(cnt2_neg, 8);
4574         mov(cnt2_neg, 0);
4575         br(LT, CH1_LOOP);
4576         b(NOMATCH);
4577 
4578       BIND(HAS_ZERO);
4579         rev(tmp1, tmp1);
4580         clz(tmp1, tmp1);
4581         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4582         b(MATCH);
4583 
4584       BIND(DO1_SHORT);
4585         mov(result_tmp, cnt2);
4586         lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4587         sub(cnt2_neg, zr, cnt2, LSL, 1);
4588       BIND(DO1_LOOP);
4589         ldrh(ch2, Address(str2, cnt2_neg));
4590         cmpw(ch1, ch2);
4591         br(EQ, MATCH);
4592         adds(cnt2_neg, cnt2_neg, 2);
4593         br(LT, DO1_LOOP);
4594     }
4595   }
4596   BIND(NOMATCH);
4597     mov(result, -1);
4598     b(DONE);
4599   BIND(MATCH);
4600     add(result, result_tmp, cnt2_neg, ASR, 1);
4601   BIND(DONE);
4602 }
4603 
4604 // Compare strings.
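// A rough C-level sketch of the code emitted below (illustrative only;
// the characters are 16-bit jchars):
//
//   int len = min(cnt1, cnt2);
//   for (int i = 0; i < len; i++)
//     if (str1[i] != str2[i]) return str1[i] - str2[i];
//   return cnt1 - cnt2;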
4605 void MacroAssembler::string_compare(Register str1, Register str2,
4606                                     Register cnt1, Register cnt2, Register result,
4607                                     Register tmp1) {
4608   Label LENGTH_DIFF, DONE, SHORT_LOOP, SHORT_STRING,
4609     NEXT_WORD, DIFFERENCE;
4610 
4611   BLOCK_COMMENT("string_compare {");
4612 
4613   // Compute the minimum of the string lengths and save the difference.
4614   subsw(tmp1, cnt1, cnt2);
4615   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4616 
4617   // A very short string
4618   cmpw(cnt2, 4);
4619   br(Assembler::LT, SHORT_STRING);
4620 
4621   // Check if the strings start at the same location.
4622   cmp(str1, str2);
4623   br(Assembler::EQ, LENGTH_DIFF);
4624 
4625   // Compare longwords
4626   {
4627     subw(cnt2, cnt2, 4); // The last longword is a special case
4628 
4629     // Move both string pointers to the last longword of their
4630     // strings, negate the remaining count, and convert it to bytes.
4631     lea(str1, Address(str1, cnt2, Address::uxtw(1)));
4632     lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4633     sub(cnt2, zr, cnt2, LSL, 1);
4634 
    // Loop, loading longwords; their xor, zero iff they match, lands in
    // rscratch2.
4636     bind(NEXT_WORD);
4637     ldr(result, Address(str1, cnt2));
4638     ldr(cnt1, Address(str2, cnt2));
4639     adds(cnt2, cnt2, wordSize);
4640     eor(rscratch2, result, cnt1);
4641     cbnz(rscratch2, DIFFERENCE);
4642     br(Assembler::LT, NEXT_WORD);
4643 
4644     // Last longword.  In the case where length == 4 we compare the
4645     // same longword twice, but that's still faster than another
4646     // conditional branch.
4647 
4648     ldr(result, Address(str1));
4649     ldr(cnt1, Address(str2));
4650     eor(rscratch2, result, cnt1);
4651     cbz(rscratch2, LENGTH_DIFF);
4652 
4653     // Find the first different characters in the longwords and
4654     // compute their difference.
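    // (rev puts the lowest-addressed differing byte at the top, so clz
    // yields 8 * its byte index plus that byte's leading zero bits;
    // masking with -16 rounds down to the bit offset of the containing
    // 16-bit char, which lsrv then shifts down.)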
4655     bind(DIFFERENCE);
4656     rev(rscratch2, rscratch2);
4657     clz(rscratch2, rscratch2);
4658     andr(rscratch2, rscratch2, -16);
4659     lsrv(result, result, rscratch2);
4660     uxthw(result, result);
4661     lsrv(cnt1, cnt1, rscratch2);
4662     uxthw(cnt1, cnt1);
4663     subw(result, result, cnt1);
4664     b(DONE);
4665   }
4666 
4667   bind(SHORT_STRING);
4668   // Is the minimum length zero?
4669   cbz(cnt2, LENGTH_DIFF);
4670 
4671   bind(SHORT_LOOP);
4672   load_unsigned_short(result, Address(post(str1, 2)));
4673   load_unsigned_short(cnt1, Address(post(str2, 2)));
4674   subw(result, result, cnt1);
4675   cbnz(result, DONE);
4676   sub(cnt2, cnt2, 1);
4677   cbnz(cnt2, SHORT_LOOP);
4678 
4679   // Strings are equal up to min length.  Return the length difference.
4680   bind(LENGTH_DIFF);
4681   mov(result, tmp1);
4682 
4683   // That's it
4684   bind(DONE);
4685 
4686   BLOCK_COMMENT("} string_compare");
4687 }
4688 
4689 
// base:     Address of a buffer to be zeroed, 8 bytes aligned.
// cnt:      Count in HeapWords.
4693 void MacroAssembler::zero_words(Register base, Register cnt)
4694 {
4695   if (UseBlockZeroing) {
4696     block_zero(base, cnt);
4697   } else {
4698     fill_words(base, cnt, zr);
4699   }
4700 }
4701 
4702 // r10 = base:   Address of a buffer to be zeroed, 8 bytes aligned.
4703 // cnt:          Immediate count in HeapWords.
4704 // r11 = tmp:    For use as cnt if we need to call out
4705 #define ShortArraySize (18 * BytesPerLong)
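// Strategy (illustrative): counts up to ShortArraySize are zeroed with
// inline stp pairs; counts past BlockZeroingLowLimit go to block_zero
// when UseBlockZeroing permits; anything else uses a 4x-unrolled stp
// loop, with any remainder words stored inline first.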
4706 void MacroAssembler::zero_words(Register base, u_int64_t cnt)
4707 {
4708   Register tmp = r11;
4709   int i = cnt & 1;  // store any odd word to start
4710   if (i) str(zr, Address(base));
4711 
4712   if (cnt <= ShortArraySize / BytesPerLong) {
4713     for (; i < (int)cnt; i += 2)
4714       stp(zr, zr, Address(base, i * wordSize));
4715   } else if (UseBlockZeroing && cnt >= (u_int64_t)(BlockZeroingLowLimit >> LogBytesPerWord)) {
4716     mov(tmp, cnt);
4717     block_zero(base, tmp, true);
4718   } else {
4719     const int unroll = 4; // Number of stp(zr, zr) instructions we'll unroll
4720     int remainder = cnt % (2 * unroll);
4721     for (; i < remainder; i += 2)
4722       stp(zr, zr, Address(base, i * wordSize));
4723 
4724     Label loop;
4725     Register cnt_reg = rscratch1;
4726     Register loop_base = rscratch2;
4727     cnt = cnt - remainder;
4728     mov(cnt_reg, cnt);
4729     // adjust base and prebias by -2 * wordSize so we can pre-increment
4730     add(loop_base, base, (remainder - 2) * wordSize);
4731     bind(loop);
4732     sub(cnt_reg, cnt_reg, 2 * unroll);
4733     for (i = 1; i < unroll; i++)
4734       stp(zr, zr, Address(loop_base, 2 * i * wordSize));
4735     stp(zr, zr, Address(pre(loop_base, 2 * unroll * wordSize)));
4736     cbnz(cnt_reg, loop);
4737   }
4738 }
4739 
4740 // base:   Address of a buffer to be filled, 8 bytes aligned.
4741 // cnt:    Count in 8-byte unit.
4742 // value:  Value to be filled with.
4743 // base will point to the end of the buffer after filling.
4744 void MacroAssembler::fill_words(Register base, Register cnt, Register value)
4745 {
4746 //  Algorithm:
4747 //
4748 //    scratch1 = cnt & 7;
4749 //    cnt -= scratch1;
4750 //    p += scratch1;
4751 //    switch (scratch1) {
4752 //      do {
4753 //        cnt -= 8;
//        p[-8] = v;
4755 //        case 7:
4756 //          p[-7] = v;
4757 //        case 6:
4758 //          p[-6] = v;
4759 //          // ...
4760 //        case 1:
4761 //          p[-1] = v;
4762 //        case 0:
4763 //          p += 8;
4764 //      } while (cnt);
4765 //    }
4766 
4767   assert_different_registers(base, cnt, value, rscratch1, rscratch2);
4768 
4769   Label fini, skip, entry, loop;
4770   const int unroll = 8; // Number of stp instructions we'll unroll
4771 
4772   cbz(cnt, fini);
4773   tbz(base, 3, skip);
4774   str(value, Address(post(base, 8)));
4775   sub(cnt, cnt, 1);
4776   bind(skip);
4777 
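  // Compute a branch target inside the unrolled stp sequence below:
  // each stp is 4 bytes and stores 2 words, so a remainder of rscratch1
  // words is handled by entering rscratch1 * 2 bytes before 'entry'
  // (base is pre-advanced past the remainder words first).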
4778   andr(rscratch1, cnt, (unroll-1) * 2);
4779   sub(cnt, cnt, rscratch1);
4780   add(base, base, rscratch1, Assembler::LSL, 3);
4781   adr(rscratch2, entry);
4782   sub(rscratch2, rscratch2, rscratch1, Assembler::LSL, 1);
4783   br(rscratch2);
4784 
4785   bind(loop);
4786   add(base, base, unroll * 16);
4787   for (int i = -unroll; i < 0; i++)
4788     stp(value, value, Address(base, i * 16));
4789   bind(entry);
4790   subs(cnt, cnt, unroll * 2);
4791   br(Assembler::GE, loop);
4792 
4793   tbz(cnt, 0, fini);
4794   str(value, Address(post(base, 8)));
4795   bind(fini);
4796 }
4797 
4798 // Use DC ZVA to do fast zeroing.
4799 // base:   Address of a buffer to be zeroed, 8 bytes aligned.
4800 // cnt:    Count in HeapWords.
4801 // is_large: True when 'cnt' is known to be >= BlockZeroingLowLimit.
4802 void MacroAssembler::block_zero(Register base, Register cnt, bool is_large)
4803 {
4804   Label small;
4805   Label store_pair, loop_store_pair, done;
4806   Label base_aligned;
4807 
4808   assert_different_registers(base, cnt, rscratch1);
4809   guarantee(base == r10 && cnt == r11, "fix register usage");
4810 
4811   Register tmp = rscratch1;
4812   Register tmp2 = rscratch2;
4813   int zva_length = VM_Version::zva_length();
4814 
  // The ZVA length must be a multiple of 16; the operations below
  // rely on this.
4817   assert (zva_length % 16 == 0, "Unexpected ZVA Length");
4818 
4819   if (!is_large) cbz(cnt, done);
4820   tbz(base, 3, base_aligned);
4821   str(zr, Address(post(base, 8)));
4822   sub(cnt, cnt, 1);
4823   bind(base_aligned);
4824 
4825   // Ensure count >= zva_length * 2 so that it still deserves a zva after
4826   // alignment.
4827   if (!is_large || !(BlockZeroingLowLimit >= zva_length * 2)) {
4828     int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
4829     subs(tmp, cnt, low_limit >> 3);
4830     br(Assembler::LT, small);
4831   }
4832 
4833   far_call(StubRoutines::aarch64::get_zero_longs());
4834 
4835   bind(small);
4836 
4837   const int unroll = 8; // Number of stp instructions we'll unroll
4838   Label small_loop, small_table_end;
4839 
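  // The tail is handled with the same computed branch into an unrolled
  // stp sequence as in fill_words above.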
4840   andr(tmp, cnt, (unroll-1) * 2);
4841   sub(cnt, cnt, tmp);
4842   add(base, base, tmp, Assembler::LSL, 3);
4843   adr(tmp2, small_table_end);
4844   sub(tmp2, tmp2, tmp, Assembler::LSL, 1);
4845   br(tmp2);
4846 
4847   bind(small_loop);
4848   add(base, base, unroll * 16);
4849   for (int i = -unroll; i < 0; i++)
4850     stp(zr, zr, Address(base, i * 16));
4851   bind(small_table_end);
4852   subs(cnt, cnt, unroll * 2);
4853   br(Assembler::GE, small_loop);
4854 
4855   tbz(cnt, 0, done);
4856   str(zr, Address(post(base, 8)));
4857 
4858   bind(done);
4859 }
4860 
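// Set 'result' to true iff the two strings, each 'cnt' chars long,
// contain the same characters.  Roughly (illustrative only):
//
//   for (int i = 0; i < cnt; i++)
//     if (str1[i] != str2[i]) return false;
//   return true;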
4861 void MacroAssembler::string_equals(Register str1, Register str2,
4862                                    Register cnt, Register result,
4863                                    Register tmp1) {
4864   Label SAME_CHARS, DONE, SHORT_LOOP, SHORT_STRING,
4865     NEXT_WORD;
4866 
4867   const Register tmp2 = rscratch1;
4868   assert_different_registers(str1, str2, cnt, result, tmp1, tmp2, rscratch2);
4869 
4870   BLOCK_COMMENT("string_equals {");
4871 
4872   // Start by assuming that the strings are not equal.
4873   mov(result, zr);
4874 
4875   // A very short string
4876   cmpw(cnt, 4);
4877   br(Assembler::LT, SHORT_STRING);
4878 
4879   // Check if the strings start at the same location.
4880   cmp(str1, str2);
4881   br(Assembler::EQ, SAME_CHARS);
4882 
4883   // Compare longwords
4884   {
4885     subw(cnt, cnt, 4); // The last longword is a special case
4886 
4887     // Move both string pointers to the last longword of their
4888     // strings, negate the remaining count, and convert it to bytes.
4889     lea(str1, Address(str1, cnt, Address::uxtw(1)));
4890     lea(str2, Address(str2, cnt, Address::uxtw(1)));
4891     sub(cnt, zr, cnt, LSL, 1);
4892 
    // Loop, loading longwords; their xor, zero iff they match, lands in
    // rscratch2.
4894     bind(NEXT_WORD);
4895     ldr(tmp1, Address(str1, cnt));
4896     ldr(tmp2, Address(str2, cnt));
4897     adds(cnt, cnt, wordSize);
4898     eor(rscratch2, tmp1, tmp2);
4899     cbnz(rscratch2, DONE);
4900     br(Assembler::LT, NEXT_WORD);
4901 
4902     // Last longword.  In the case where length == 4 we compare the
4903     // same longword twice, but that's still faster than another
4904     // conditional branch.
4905 
4906     ldr(tmp1, Address(str1));
4907     ldr(tmp2, Address(str2));
4908     eor(rscratch2, tmp1, tmp2);
4909     cbz(rscratch2, SAME_CHARS);
4910     b(DONE);
4911   }
4912 
4913   bind(SHORT_STRING);
4914   // Is the length zero?
4915   cbz(cnt, SAME_CHARS);
4916 
4917   bind(SHORT_LOOP);
4918   load_unsigned_short(tmp1, Address(post(str1, 2)));
4919   load_unsigned_short(tmp2, Address(post(str2, 2)));
4920   subw(tmp1, tmp1, tmp2);
4921   cbnz(tmp1, DONE);
4922   sub(cnt, cnt, 1);
4923   cbnz(cnt, SHORT_LOOP);
4924 
4925   // Strings are equal.
4926   bind(SAME_CHARS);
4927   mov(result, true);
4928 
4929   // That's it
4930   bind(DONE);
4931 
4932   BLOCK_COMMENT("} string_equals");
4933 }
4934 
4935 // Compare char[] arrays aligned to 4 bytes
4936 void MacroAssembler::char_arrays_equals(Register ary1, Register ary2,
4937                                         Register result, Register tmp1)
4938 {
4939   Register cnt1 = rscratch1;
4940   Register cnt2 = rscratch2;
4941   Register tmp2 = rscratch2;
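  // Note: tmp2 aliases cnt2 (both are rscratch2); cnt2 is dead once the
  // lengths have been compared.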
4942 
4943   Label SAME, DIFFER, NEXT, TAIL03, TAIL01;
4944 
4945   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4946   int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);
4947 
4948   BLOCK_COMMENT("char_arrays_equals  {");
4949 
4950     // different until proven equal
4951     mov(result, false);
4952 
4953     // same array?
4954     cmpoops(ary1, ary2);
4955     br(Assembler::EQ, SAME);
4956 
    // not equal if either is null
4958     cbz(ary1, DIFFER);
4959     cbz(ary2, DIFFER);
4960 
    // lengths differ?
4962     ldrw(cnt1, Address(ary1, length_offset));
4963     ldrw(cnt2, Address(ary2, length_offset));
4964     cmp(cnt1, cnt2);
4965     br(Assembler::NE, DIFFER);
4966 
4967     lea(ary1, Address(ary1, base_offset));
4968     lea(ary2, Address(ary2, base_offset));
4969 
4970     subs(cnt1, cnt1, 4);
4971     br(LT, TAIL03);
4972 
4973   BIND(NEXT);
4974     ldr(tmp1, Address(post(ary1, 8)));
4975     ldr(tmp2, Address(post(ary2, 8)));
4976     subs(cnt1, cnt1, 4);
4977     eor(tmp1, tmp1, tmp2);
4978     cbnz(tmp1, DIFFER);
4979     br(GE, NEXT);
4980 
4981   BIND(TAIL03);  // 0-3 chars left, cnt1 = #chars left - 4
4982     tst(cnt1, 0b10);
4983     br(EQ, TAIL01);
4984     ldrw(tmp1, Address(post(ary1, 4)));
4985     ldrw(tmp2, Address(post(ary2, 4)));
4986     cmp(tmp1, tmp2);
4987     br(NE, DIFFER);
4988   BIND(TAIL01);  // 0-1 chars left
4989     tst(cnt1, 0b01);
4990     br(EQ, SAME);
4991     ldrh(tmp1, ary1);
4992     ldrh(tmp2, ary2);
4993     cmp(tmp1, tmp2);
4994     br(NE, DIFFER);
4995 
4996   BIND(SAME);
4997     mov(result, true);
4998   BIND(DIFFER); // result already set
4999   
5000   BLOCK_COMMENT("} char_arrays_equals");
5001 }
5002 
5003 // encode char[] to byte[] in ISO_8859_1
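//
// Rough C-level equivalent (illustrative only):
//
//   int i;
//   for (i = 0; i < len; i++) {
//     jchar c = src[i];
//     if (c > 0xff) break;    // char has no ISO-8859-1 encoding
//     dst[i] = (jbyte)c;
//   }
//   return i;                 // index where we stopped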
5004 void MacroAssembler::encode_iso_array(Register src, Register dst,
5005                       Register len, Register result,
5006                       FloatRegister Vtmp1, FloatRegister Vtmp2,
5007                       FloatRegister Vtmp3, FloatRegister Vtmp4)
5008 {
5009     Label DONE, NEXT_32, LOOP_8, NEXT_8, LOOP_1, NEXT_1;
5010     Register tmp1 = rscratch1;
5011 
5012       mov(result, len); // Save initial len
5013 
5014 #ifndef BUILTIN_SIM
5015       subs(len, len, 32);
5016       br(LT, LOOP_8);
5017 
// The following code uses the SIMD 'uqxtn' and 'uqxtn2' instructions
// to convert chars to bytes.  These set the 'QC' bit in the FPSR if
// any char could not fit in a byte; we clear the FPSR first so the bit
// can be tested after each block.
5021       clear_fpsr();
5022 
5023     BIND(NEXT_32);
5024       ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
5025       uqxtn(Vtmp1, T8B, Vtmp1, T8H);  // uqxtn  - write bottom half
5026       uqxtn(Vtmp1, T16B, Vtmp2, T8H); // uqxtn2 - write top half
5027       uqxtn(Vtmp2, T8B, Vtmp3, T8H);
5028       uqxtn(Vtmp2, T16B, Vtmp4, T8H); // uqxtn2
5029       get_fpsr(tmp1);
5030       cbnzw(tmp1, LOOP_8);
5031       st1(Vtmp1, Vtmp2, T16B, post(dst, 32));
5032       subs(len, len, 32);
5033       add(src, src, 64);
5034       br(GE, NEXT_32);
5035 
5036     BIND(LOOP_8);
5037       adds(len, len, 32-8);
5038       br(LT, LOOP_1);
5039       clear_fpsr(); // QC may be set from loop above, clear again
5040     BIND(NEXT_8);
5041       ld1(Vtmp1, T8H, src);
5042       uqxtn(Vtmp1, T8B, Vtmp1, T8H);
5043       get_fpsr(tmp1);
5044       cbnzw(tmp1, LOOP_1);
5045       st1(Vtmp1, T8B, post(dst, 8));
5046       subs(len, len, 8);
5047       add(src, src, 16);
5048       br(GE, NEXT_8);
5049 
5050     BIND(LOOP_1);
5051       adds(len, len, 8);
5052       br(LE, DONE);
5053 #else
5054       cbz(len, DONE);
5055 #endif
5056     BIND(NEXT_1);
5057       ldrh(tmp1, Address(post(src, 2)));
5058       tst(tmp1, 0xff00);
5059       br(NE, DONE);
5060       strb(tmp1, Address(post(dst, 1)));
5061       subs(len, len, 1);
5062       br(GT, NEXT_1);
5063 
5064     BIND(DONE);
5065       sub(result, result, len); // Return index where we stopped
5066 }