1 /*
   2 /*
   3  * Copyright (c) 2014, Red Hat Inc.
   4  * Copyright (c) 1997, 2012, Oracle and/or its affiliates.
   5  * All rights reserved.
   6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   7  *
   8  * This code is free software; you can redistribute it and/or modify it
   9  * under the terms of the GNU General Public License version 2 only, as
  10  * published by the Free Software Foundation.
  11  *
  12  * This code is distributed in the hope that it will be useful, but WITHOUT
  13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15  * version 2 for more details (a copy is included in the LICENSE file that
  16  * accompanied this code).
  17  *
  18  * You should have received a copy of the GNU General Public License version
  19  * 2 along with this work; if not, write to the Free Software Foundation,
  20  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  21  *
  22  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  23  * or visit www.oracle.com if you need additional information or have any
  24  * questions.
  25  *
  26  */
  27 
  28 #include <sys/types.h>
  29 
  30 #include "precompiled.hpp"
  31 #include "asm/assembler.hpp"
  32 #include "asm/assembler.inline.hpp"
  33 #include "interpreter/interpreter.hpp"
  34 
  35 #include "compiler/disassembler.hpp"
  36 #include "memory/resourceArea.hpp"
  37 #include "runtime/biasedLocking.hpp"
  38 #include "runtime/icache.hpp"
  39 #include "runtime/interfaceSupport.hpp"
  40 #include "runtime/sharedRuntime.hpp"
  41 
  42 // #include "gc_interface/collectedHeap.inline.hpp"
  43 // #include "interpreter/interpreter.hpp"
  44 // #include "memory/cardTableModRefBS.hpp"
  45 // #include "prims/methodHandles.hpp"
  46 // #include "runtime/biasedLocking.hpp"
  47 // #include "runtime/interfaceSupport.hpp"
  48 // #include "runtime/objectMonitor.hpp"
  49 // #include "runtime/os.hpp"
  50 // #include "runtime/sharedRuntime.hpp"
  51 // #include "runtime/stubRoutines.hpp"
  52 
  53 #if INCLUDE_ALL_GCS
  54 #include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
  55 #include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
  56 #include "gc_implementation/g1/heapRegion.hpp"
  57 #endif
  58 
  59 #ifdef PRODUCT
  60 #define BLOCK_COMMENT(str) /* nothing */
  61 #define STOP(error) stop(error)
  62 #else
  63 #define BLOCK_COMMENT(str) block_comment(str)
  64 #define STOP(error) block_comment(error); stop(error)
  65 #endif
  66 
  67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  68 
  69 // Patch any kind of instruction; there may be several instructions.
  70 // Return the total length (in bytes) of the instructions.
  71 int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  72   int instructions = 1;
  73   assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  74   long offset = (target - branch) >> 2;
  75   unsigned insn = *(unsigned*)branch;
  76   if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
  77     // Load register (literal)
  78     Instruction_aarch64::spatch(branch, 23, 5, offset);
  79   } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
  80     // Unconditional branch (immediate)
  81     Instruction_aarch64::spatch(branch, 25, 0, offset);
  82   } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
  83     // Conditional branch (immediate)
  84     Instruction_aarch64::spatch(branch, 23, 5, offset);
  85   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
  86     // Compare & branch (immediate)
  87     Instruction_aarch64::spatch(branch, 23, 5, offset);
  88   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
  89     // Test & branch (immediate)
  90     Instruction_aarch64::spatch(branch, 18, 5, offset);
  91   } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
  92     // PC-rel. addressing
  93     offset = target-branch;
  94     int shift = Instruction_aarch64::extract(insn, 31, 31);
  95     if (shift) {
  96       u_int64_t dest = (u_int64_t)target;
  97       uint64_t pc_page = (uint64_t)branch >> 12;
  98       uint64_t adr_page = (uint64_t)target >> 12;
  99       unsigned offset_lo = dest & 0xfff;
 100       offset = adr_page - pc_page;
 101 
 102       // We handle 3 types of PC relative addressing
 103       //   1 - adrp    Rx, target_page
 104       //       ldr/str Ry, [Rx, #offset_in_page]
 105       //   2 - adrp    Rx, target_page
 106       //       add     Ry, Rx, #offset_in_page
 107       //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 108       // In the first 2 cases we must check that Rx is the same in the adrp and the
 109       // subsequent ldr/str or add instruction. Otherwise we could accidentally end
 110       // up treating a type 3 relocation as a type 1 or 2 just because it happened
 111       // to be followed by a random unrelated ldr/str or add instruction.
 112       //
 113       // In the case of a type 3 relocation, we know that these are only generated
 114       // for the safepoint polling page, or for the card type byte map base so we
 115       // assert as much and of course that the offset is 0.
 116       //
 117       unsigned insn2 = ((unsigned*)branch)[1];
 118       if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
 119                 Instruction_aarch64::extract(insn, 4, 0) ==
 120                         Instruction_aarch64::extract(insn2, 9, 5)) {
 121         // Load/store register (unsigned immediate)
 122         unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
 123         Instruction_aarch64::patch(branch + sizeof (unsigned),
 124                                     21, 10, offset_lo >> size);
 125         guarantee(((dest >> size) << size) == dest, "misaligned target");
 126         instructions = 2;
 127       } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
 128                 Instruction_aarch64::extract(insn, 4, 0) ==
 129                         Instruction_aarch64::extract(insn2, 4, 0)) {
 130         // add (immediate)
 131         Instruction_aarch64::patch(branch + sizeof (unsigned),
 132                                    21, 10, offset_lo);
 133         instructions = 2;
 134       } else {
 135         assert((jbyte *)target ==
 136                 ((CardTableModRefBS*)(Universe::heap()->barrier_set()))->byte_map_base ||
 137                target == StubRoutines::crc_table_addr() ||
 138                (address)target == os::get_polling_page(),
 139                "adrp must be polling page or byte map base");
 140         assert(offset_lo == 0, "offset must be 0 for polling page or byte map base");
 141       }
 142     }
 143     int offset_lo = offset & 3;
 144     offset >>= 2;
 145     Instruction_aarch64::spatch(branch, 23, 5, offset);
 146     Instruction_aarch64::patch(branch, 30, 29, offset_lo);
 147   } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
 148     u_int64_t dest = (u_int64_t)target;
 149     // Move wide constant
 150     assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
 151     assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
 152     Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
 153     Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
 154     Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
 155     assert(target_addr_for_insn(branch) == target, "should be");
 156     instructions = 3;
 157   } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
 158              Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
 159     // nothing to do
 160     assert(target == 0, "did not expect to relocate target for polling page load");
 161   } else {
 162     ShouldNotReachHere();
 163   }
 164   return instructions * NativeInstruction::instruction_size;
 165 }
 166 
 167 int MacroAssembler::patch_oop(address insn_addr, address o) {
 168   int instructions;
 169   unsigned insn = *(unsigned*)insn_addr;
 170   assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");
 171 
 172   // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
 173   // narrow OOPs by setting the upper 16 bits in the first
 174   // instruction.
 175   if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
 176     // Move narrow OOP
 177     narrowOop n = oopDesc::encode_heap_oop((oop)o);
 178     Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
 179     Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
 180     instructions = 2;
 181   } else {
 182     // Move wide OOP
 183     assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
 184     uintptr_t dest = (uintptr_t)o;
 185     Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
 186     Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
 187     Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
 188     instructions = 3;
 189   }
 190   return instructions * NativeInstruction::instruction_size;
 191 }
 192 
 193 address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
 194   long offset = 0;
 195   if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
 196     // Load register (literal)
 197     offset = Instruction_aarch64::sextract(insn, 23, 5);
 198     return address(((uint64_t)insn_addr + (offset << 2)));
 199   } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
 200     // Unconditional branch (immediate)
 201     offset = Instruction_aarch64::sextract(insn, 25, 0);
 202   } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
 203     // Conditional branch (immediate)
 204     offset = Instruction_aarch64::sextract(insn, 23, 5);
 205   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
 206     // Compare & branch (immediate)
 207     offset = Instruction_aarch64::sextract(insn, 23, 5);
 208    } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
 209     // Test & branch (immediate)
 210     offset = Instruction_aarch64::sextract(insn, 18, 5);
 211   } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
 212     // PC-rel. addressing
 213     offset = Instruction_aarch64::extract(insn, 30, 29);
 214     offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
 215     int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
 216     if (shift) {
 217       offset <<= shift;
 218       uint64_t target_page = ((uint64_t)insn_addr) + offset;
 219       target_page &= ((uint64_t)-1) << shift;
 220       // Return the target address for the following sequences
 221       //   1 - adrp    Rx, target_page
 222       //       ldr/str Ry, [Rx, #offset_in_page]
 223       //   2 - adrp    Rx, target_page         ]
 224       //       add     Ry, Rx, #offset_in_page
 225       //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
 226       //
 227       // In the first two cases  we check that the register is the same and
 228       // return the target_page + the offset within the page.
 229       // Otherwise we assume it is a page aligned relocation and return
 230       // the target page only. The only cases this is generated is for
 231       // the safepoint polling page or for the card table byte map base so
 232       // we assert as much.
 233       //
 234       unsigned insn2 = ((unsigned*)insn_addr)[1];
 235       if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
 236                 Instruction_aarch64::extract(insn, 4, 0) ==
 237                         Instruction_aarch64::extract(insn2, 9, 5)) {
 238         // Load/store register (unsigned immediate)
 239         unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
 240         unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
 241         return address(target_page + (byte_offset << size));
 242       } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
 243                 Instruction_aarch64::extract(insn, 4, 0) ==
 244                         Instruction_aarch64::extract(insn2, 4, 0)) {
 245         // add (immediate)
 246         unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
 247         return address(target_page + byte_offset);
 248       } else {
 249         assert((jbyte *)target_page ==
 250                 ((CardTableModRefBS*)(Universe::heap()->barrier_set()))->byte_map_base ||
 251                (address)target_page == os::get_polling_page(),
 252                "adrp must be polling page or byte map base");
 253         return (address)target_page;
 254       }
 255     } else {
 256       ShouldNotReachHere();
 257     }
 258   } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
 259     u_int32_t *insns = (u_int32_t *)insn_addr;
 260     // Move wide constant: movz, movk, movk.  See movptr().
 261     assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
 262     assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
 263     return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
 264                    + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
 265                    + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
 266   } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
 267              Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
 268     return 0;
 269   } else {
 270     ShouldNotReachHere();
 271   }
 272   return address(((uint64_t)insn_addr + (offset << 2)));
 273 }
 274 
 275 void MacroAssembler::serialize_memory(Register thread, Register tmp) {
 276   dsb(Assembler::SY);
 277 }
 278 
 279 
 280 void MacroAssembler::reset_last_Java_frame(bool clear_fp,
 281                                            bool clear_pc) {
 282   // we must set sp to zero to clear frame
 283   str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));
 284   // must clear fp, so that compiled frames are not confused; it is
 285   // possible that we need it only for debugging
 286   if (clear_fp) {
 287     str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
 288   }
 289 
 290   if (clear_pc) {
 291     str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
 292   }
 293 }
 294 
 295 // Calls to C land
 296 //
 297 // When entering C land, the rfp, & resp of the last Java frame have to be recorded
 298 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
 299 // has to be reset to 0. This is required to allow proper stack traversal.
 300 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 301                                          Register last_java_fp,
 302                                          Register last_java_pc,
 303                                          Register scratch) {
 304 
 305   if (last_java_pc->is_valid()) {
 306       str(last_java_pc, Address(rthread,
 307                                 JavaThread::frame_anchor_offset()
 308                                 + JavaFrameAnchor::last_Java_pc_offset()));
 309     }
 310 
 311   // determine last_java_sp register
 312   if (last_java_sp == sp) {
 313     mov(scratch, sp);
 314     last_java_sp = scratch;
 315   } else if (!last_java_sp->is_valid()) {
 316     last_java_sp = esp;
 317   }
 318 
 319   str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));
 320 
 321   // last_java_fp is optional
 322   if (last_java_fp->is_valid()) {
 323     str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
 324   }
 325 }
 326 
 327 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 328                                          Register last_java_fp,
 329                                          address  last_java_pc,
 330                                          Register scratch) {
 331   if (last_java_pc != NULL) {
 332     adr(scratch, last_java_pc);
 333   } else {
 334     // FIXME: This is almost never correct.  We should delete all
 335     // cases of set_last_Java_frame with last_java_pc=NULL and use the
 336     // correct return address instead.
 337     adr(scratch, pc());
 338   }
 339 
 340   str(scratch, Address(rthread,
 341                        JavaThread::frame_anchor_offset()
 342                        + JavaFrameAnchor::last_Java_pc_offset()));
 343 
 344   set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
 345 }
 346 
 347 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 348                                          Register last_java_fp,
 349                                          Label &L,
 350                                          Register scratch) {
 351   if (L.is_bound()) {
 352     set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
 353   } else {
 354     InstructionMark im(this);
 355     L.add_patch_at(code(), locator());
 356     set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
 357   }
 358 }
 359 
 360 void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
 361   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
 362   assert(CodeCache::find_blob(entry.target()) != NULL,
 363          "destination of far call not found in code cache");
 364   if (far_branches()) {
 365     unsigned long offset;
 366     // We can use ADRP here because we know that the total size of
 367     // the code cache cannot exceed 2Gb.
 368     adrp(tmp, entry, offset);
 369     add(tmp, tmp, offset);
 370     if (cbuf) cbuf->set_insts_mark();
 371     blr(tmp);
 372   } else {
 373     if (cbuf) cbuf->set_insts_mark();
 374     bl(entry);
 375   }
 376 }
 377 
 378 void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
 379   assert(ReservedCodeCacheSize < 4*G, "branch out of range");
 380   assert(CodeCache::find_blob(entry.target()) != NULL,
 381          "destination of far call not found in code cache");
 382   if (far_branches()) {
 383     unsigned long offset;
 384     // We can use ADRP here because we know that the total size of
 385     // the code cache cannot exceed 2Gb.
 386     adrp(tmp, entry, offset);
 387     add(tmp, tmp, offset);
 388     if (cbuf) cbuf->set_insts_mark();
 389     br(tmp);
 390   } else {
 391     if (cbuf) cbuf->set_insts_mark();
 392     b(entry);
 393   }
 394 }
 395 
 396 int MacroAssembler::biased_locking_enter(Register lock_reg,
 397                                          Register obj_reg,
 398                                          Register swap_reg,
 399                                          Register tmp_reg,
 400                                          bool swap_reg_contains_mark,
 401                                          Label& done,
 402                                          Label* slow_case,
 403                                          BiasedLockingCounters* counters) {
 404   assert(UseBiasedLocking, "why call this otherwise?");
 405   assert_different_registers(lock_reg, obj_reg, swap_reg);
 406 
 407   if (PrintBiasedLockingStatistics && counters == NULL)
 408     counters = BiasedLocking::counters();
 409 
 410   bool need_tmp_reg = false;
 411   if (tmp_reg == noreg) {
 412     tmp_reg = rscratch2;
 413   }
 414   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1);
 415   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
 416   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
 417   Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
 418   Address saved_mark_addr(lock_reg, 0);
 419 
 420   // Biased locking
 421   // See whether the lock is currently biased toward our thread and
 422   // whether the epoch is still valid
 423   // Note that the runtime guarantees sufficient alignment of JavaThread
 424   // pointers to allow age to be placed into low bits
 425   // First check to see whether biasing is even enabled for this object
 426   Label cas_label;
 427   int null_check_offset = -1;
 428   if (!swap_reg_contains_mark) {
 429     null_check_offset = offset();
 430     ldr(swap_reg, mark_addr);
 431   }
 432   andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
 433   cmp(tmp_reg, markOopDesc::biased_lock_pattern);
 434   br(Assembler::NE, cas_label);
 435   // The bias pattern is present in the object's header. Need to check
 436   // whether the bias owner and the epoch are both still current.
 437   load_prototype_header(tmp_reg, obj_reg);
 438   orr(tmp_reg, tmp_reg, rthread);
 439   eor(tmp_reg, swap_reg, tmp_reg);
 440   andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
 441   if (counters != NULL) {
 442     Label around;
 443     cbnz(tmp_reg, around);
 444     atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1);
 445     b(done);
 446     bind(around);
 447   } else {
 448     cbz(tmp_reg, done);
 449   }
 450 
 451   Label try_revoke_bias;
 452   Label try_rebias;
 453 
 454   // At this point we know that the header has the bias pattern and
 455   // that we are not the bias owner in the current epoch. We need to
 456   // figure out more details about the state of the header in order to
 457   // know what operations can be legally performed on the object's
 458   // header.
 459 
 460   // If the low three bits in the xor result aren't clear, that means
 461   // the prototype header is no longer biased and we have to revoke
 462   // the bias on this object.
 463   andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
 464   cbnz(rscratch1, try_revoke_bias);
 465 
 466   // Biasing is still enabled for this data type. See whether the
 467   // epoch of the current bias is still valid, meaning that the epoch
 468   // bits of the mark word are equal to the epoch bits of the
 469   // prototype header. (Note that the prototype header's epoch bits
 470   // only change at a safepoint.) If not, attempt to rebias the object
 471   // toward the current thread. Note that we must be absolutely sure
 472   // that the current epoch is invalid in order to do this because
 473   // otherwise the manipulations it performs on the mark word are
 474   // illegal.
 475   andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
 476   cbnz(rscratch1, try_rebias);
 477 
 478   // The epoch of the current bias is still valid but we know nothing
 479   // about the owner; it might be set or it might be clear. Try to
 480   // acquire the bias of the object using an atomic operation. If this
 481   // fails we will go in to the runtime to revoke the object's bias.
 482   // Note that we first construct the presumed unbiased header so we
 483   // don't accidentally blow away another thread's valid bias.
 484   {
 485     Label here;
 486     mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
 487     andr(swap_reg, swap_reg, rscratch1);
 488     orr(tmp_reg, swap_reg, rthread);
 489     cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
 490     // If the biasing toward our thread failed, this means that
 491     // another thread succeeded in biasing it toward itself and we
 492     // need to revoke that bias. The revocation will occur in the
 493     // interpreter runtime in the slow case.
 494     bind(here);
 495     if (counters != NULL) {
 496       atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
 497                   tmp_reg, rscratch1);
 498     }
 499   }
 500   b(done);
 501 
 502   bind(try_rebias);
 503   // At this point we know the epoch has expired, meaning that the
 504   // current "bias owner", if any, is actually invalid. Under these
 505   // circumstances _only_, we are allowed to use the current header's
 506   // value as the comparison value when doing the cas to acquire the
 507   // bias in the current epoch. In other words, we allow transfer of
 508   // the bias from one thread to another directly in this situation.
 509   //
 510   // FIXME: due to a lack of registers we currently blow away the age
 511   // bits in this situation. Should attempt to preserve them.
 512   {
 513     Label here;
 514     load_prototype_header(tmp_reg, obj_reg);
 515     orr(tmp_reg, rthread, tmp_reg);
 516     cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
 517     // If the biasing toward our thread failed, then another thread
 518     // succeeded in biasing it toward itself and we need to revoke that
 519     // bias. The revocation will occur in the runtime in the slow case.
 520     bind(here);
 521     if (counters != NULL) {
 522       atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
 523                   tmp_reg, rscratch1);
 524     }
 525   }
 526   b(done);
 527 
 528   bind(try_revoke_bias);
 529   // The prototype mark in the klass doesn't have the bias bit set any
 530   // more, indicating that objects of this data type are not supposed
 531   // to be biased any more. We are going to try to reset the mark of
 532   // this object to the prototype value and fall through to the
 533   // CAS-based locking scheme. Note that if our CAS fails, it means
 534   // that another thread raced us for the privilege of revoking the
 535   // bias of this particular object, so it's okay to continue in the
 536   // normal locking code.
 537   //
 538   // FIXME: due to a lack of registers we currently blow away the age
 539   // bits in this situation. Should attempt to preserve them.
 540   {
 541     Label here, nope;
 542     load_prototype_header(tmp_reg, obj_reg);
 543     cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
 544     bind(here);
 545 
 546     // Fall through to the normal CAS-based lock, because no matter what
 547     // the result of the above CAS, some thread must have succeeded in
 548     // removing the bias bit from the object's header.
 549     if (counters != NULL) {
 550       atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
 551                   rscratch1);
 552     }
 553     bind(nope);
 554   }
 555 
 556   bind(cas_label);
 557 
 558   return null_check_offset;
 559 }
 560 
 561 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
 562   assert(UseBiasedLocking, "why call this otherwise?");
 563 
 564   // Check for biased locking unlock case, which is a no-op
 565   // Note: we do not have to check the thread ID for two reasons.
 566   // First, the interpreter checks for IllegalMonitorStateException at
 567   // a higher level. Second, if the bias was revoked while we held the
 568   // lock, the object could not be rebiased toward another thread, so
 569   // the bias bit would be clear.
 570   ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
 571   andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
 572   cmp(temp_reg, markOopDesc::biased_lock_pattern);
 573   br(Assembler::EQ, done);
 574 }
 575 
 576 
 577 // added to make this compile
 578 
 579 REGISTER_DEFINITION(Register, noreg);
 580 
 581 static void pass_arg0(MacroAssembler* masm, Register arg) {
 582   if (c_rarg0 != arg ) {
 583     masm->mov(c_rarg0, arg);
 584   }
 585 }
 586 
 587 static void pass_arg1(MacroAssembler* masm, Register arg) {
 588   if (c_rarg1 != arg ) {
 589     masm->mov(c_rarg1, arg);
 590   }
 591 }
 592 
 593 static void pass_arg2(MacroAssembler* masm, Register arg) {
 594   if (c_rarg2 != arg ) {
 595     masm->mov(c_rarg2, arg);
 596   }
 597 }
 598 
 599 static void pass_arg3(MacroAssembler* masm, Register arg) {
 600   if (c_rarg3 != arg ) {
 601     masm->mov(c_rarg3, arg);
 602   }
 603 }
 604 
 605 void MacroAssembler::call_VM_base(Register oop_result,
 606                                   Register java_thread,
 607                                   Register last_java_sp,
 608                                   address  entry_point,
 609                                   int      number_of_arguments,
 610                                   bool     check_exceptions) {
 611    // determine java_thread register
 612   if (!java_thread->is_valid()) {
 613     java_thread = rthread;
 614   }
 615 
 616   // determine last_java_sp register
 617   if (!last_java_sp->is_valid()) {
 618     last_java_sp = esp;
 619   }
 620 
 621   // debugging support
 622   assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
 623   assert(java_thread == rthread, "unexpected register");
 624 #ifdef ASSERT
 625   // TraceBytecodes does not use r12 but saves it over the call, so don't verify
 626   // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
 627 #endif // ASSERT
 628 
 629   assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
 630   assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
 631 
 632   // push java thread (becomes first argument of C function)
 633 
 634   mov(c_rarg0, java_thread);
 635 
 636   // set last Java frame before call
 637   assert(last_java_sp != rfp, "can't use rfp");
 638 
 639   Label l;
 640   set_last_Java_frame(last_java_sp, rfp, l, rscratch1);
 641 
 642   // do the call, remove parameters
 643   MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
 644 
 645   // reset last Java frame
 646   // Only interpreter should have to clear fp
 647   reset_last_Java_frame(true, true);
 648 
 649    // C++ interp handles this in the interpreter
 650   check_and_handle_popframe(java_thread);
 651   check_and_handle_earlyret(java_thread);
 652 
 653   if (check_exceptions) {
 654     // check for pending exceptions (java_thread is set upon return)
 655     ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
 656     Label ok;
 657     cbz(rscratch1, ok);
 658     lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
 659     br(rscratch1);
 660     bind(ok);
 661   }
 662 
 663   // get oop result if there is one and reset the value in the thread
 664   if (oop_result->is_valid()) {
 665     get_vm_result(oop_result, java_thread);
 666   }
 667 }
 668 
 669 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
 670   call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
 671 }
 672 
 673 // Maybe emit a call via a trampoline.  If the code cache is small
 674 // trampolines won't be emitted.
 675 
 676 void MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
 677   assert(entry.rspec().type() == relocInfo::runtime_call_type
 678          || entry.rspec().type() == relocInfo::opt_virtual_call_type
 679          || entry.rspec().type() == relocInfo::static_call_type
 680          || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
 681 
 682   unsigned int start_offset = offset();
 683   if (far_branches() && !Compile::current()->in_scratch_emit_size()) {
 684     emit_trampoline_stub(offset(), entry.target());
 685   }
 686 
 687   if (cbuf) cbuf->set_insts_mark();
 688   relocate(entry.rspec());
 689   if (Assembler::reachable_from_branch_at(pc(), entry.target())) {
 690     bl(entry.target());
 691   } else {
 692     bl(pc());
 693   }
 694 }
 695 
 696 
 697 // Emit a trampoline stub for a call to a target which is too far away.
 698 //
 699 // code sequences:
 700 //
 701 // call-site:
 702 //   branch-and-link to <destination> or <trampoline stub>
 703 //
 704 // Related trampoline stub for this call site in the stub section:
 705 //   load the call target from the constant pool
 706 //   branch (LR still points to the call site above)
 707 
 708 void MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
 709                                              address dest) {
 710   address stub = start_a_stub(Compile::MAX_stubs_size/2);
 711   if (stub == NULL) {
 712     start_a_stub(Compile::MAX_stubs_size/2);
 713     Compile::current()->env()->record_out_of_memory_failure();
 714     return;
 715   }
 716 
 717   // Create a trampoline stub relocation which relates this trampoline stub
 718   // with the call instruction at insts_call_instruction_offset in the
 719   // instructions code-section.
 720   align(wordSize);
 721   relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
 722                                             + insts_call_instruction_offset));
 723   const int stub_start_offset = offset();
 724 
 725   // Now, create the trampoline stub's code:
 726   // - load the call
 727   // - call
 728   Label target;
 729   ldr(rscratch1, target);
 730   br(rscratch1);
 731   bind(target);
 732   assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
 733          "should be");
 734   emit_int64((int64_t)dest);
 735 
 736   const address stub_start_addr = addr_at(stub_start_offset);
 737 
 738   assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");
 739 
 740   end_a_stub();
 741 }
 742 
 743 void MacroAssembler::ic_call(address entry) {
 744   RelocationHolder rh = virtual_call_Relocation::spec(pc());
 745   // address const_ptr = long_constant((jlong)Universe::non_oop_word());
 746   // unsigned long offset;
 747   // ldr_constant(rscratch2, const_ptr);
 748   movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
 749   trampoline_call(Address(entry, rh));
 750 }
 751 
 752 // Implementation of call_VM versions
 753 
 754 void MacroAssembler::call_VM(Register oop_result,
 755                              address entry_point,
 756                              bool check_exceptions) {
 757   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
 758 }
 759 
 760 void MacroAssembler::call_VM(Register oop_result,
 761                              address entry_point,
 762                              Register arg_1,
 763                              bool check_exceptions) {
 764   pass_arg1(this, arg_1);
 765   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
 766 }
 767 
 768 void MacroAssembler::call_VM(Register oop_result,
 769                              address entry_point,
 770                              Register arg_1,
 771                              Register arg_2,
 772                              bool check_exceptions) {
 773   assert(arg_1 != c_rarg2, "smashed arg");
 774   pass_arg2(this, arg_2);
 775   pass_arg1(this, arg_1);
 776   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
 777 }
 778 
 779 void MacroAssembler::call_VM(Register oop_result,
 780                              address entry_point,
 781                              Register arg_1,
 782                              Register arg_2,
 783                              Register arg_3,
 784                              bool check_exceptions) {
 785   assert(arg_1 != c_rarg3, "smashed arg");
 786   assert(arg_2 != c_rarg3, "smashed arg");
 787   pass_arg3(this, arg_3);
 788 
 789   assert(arg_1 != c_rarg2, "smashed arg");
 790   pass_arg2(this, arg_2);
 791 
 792   pass_arg1(this, arg_1);
 793   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
 794 }
 795 
 796 void MacroAssembler::call_VM(Register oop_result,
 797                              Register last_java_sp,
 798                              address entry_point,
 799                              int number_of_arguments,
 800                              bool check_exceptions) {
 801   call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
 802 }
 803 
 804 void MacroAssembler::call_VM(Register oop_result,
 805                              Register last_java_sp,
 806                              address entry_point,
 807                              Register arg_1,
 808                              bool check_exceptions) {
 809   pass_arg1(this, arg_1);
 810   call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
 811 }
 812 
 813 void MacroAssembler::call_VM(Register oop_result,
 814                              Register last_java_sp,
 815                              address entry_point,
 816                              Register arg_1,
 817                              Register arg_2,
 818                              bool check_exceptions) {
 819 
 820   assert(arg_1 != c_rarg2, "smashed arg");
 821   pass_arg2(this, arg_2);
 822   pass_arg1(this, arg_1);
 823   call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
 824 }
 825 
 826 void MacroAssembler::call_VM(Register oop_result,
 827                              Register last_java_sp,
 828                              address entry_point,
 829                              Register arg_1,
 830                              Register arg_2,
 831                              Register arg_3,
 832                              bool check_exceptions) {
 833   assert(arg_1 != c_rarg3, "smashed arg");
 834   assert(arg_2 != c_rarg3, "smashed arg");
 835   pass_arg3(this, arg_3);
 836   assert(arg_1 != c_rarg2, "smashed arg");
 837   pass_arg2(this, arg_2);
 838   pass_arg1(this, arg_1);
 839   call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
 840 }
 841 
 842 
 843 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
 844   ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
 845   str(zr, Address(java_thread, JavaThread::vm_result_offset()));
 846   verify_oop(oop_result, "broken oop in call_VM_base");
 847 }
 848 
 849 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
 850   ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
 851   str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
 852 }
 853 
 854 void MacroAssembler::align(int modulus) {
 855   while (offset() % modulus != 0) nop();
 856 }
 857 
 858 // these are no-ops overridden by InterpreterMacroAssembler
 859 
 860 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }
 861 
 862 void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
 863 
 864 
 865 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
 866                                                       Register tmp,
 867                                                       int offset) {
 868   intptr_t value = *delayed_value_addr;
 869   if (value != 0)
 870     return RegisterOrConstant(value + offset);
 871 
 872   // load indirectly to solve generation ordering problem
 873   ldr(tmp, ExternalAddress((address) delayed_value_addr));
 874 
 875   if (offset != 0)
 876     add(tmp, tmp, offset);
 877 
 878   return RegisterOrConstant(tmp);
 879 }
 880 
 881 
 882 void MacroAssembler:: notify(int type) {
 883   if (type == bytecode_start) {
 884     // set_last_Java_frame(esp, rfp, (address)NULL);
 885     Assembler:: notify(type);
 886     // reset_last_Java_frame(true, false);
 887   }
 888   else
 889     Assembler:: notify(type);
 890 }
 891 
 892 // Look up the method for a megamorphic invokeinterface call.
 893 // The target method is determined by <intf_klass, itable_index>.
 894 // The receiver klass is in recv_klass.
 895 // On success, the result will be in method_result, and execution falls through.
 896 // On failure, execution transfers to the given label.
 897 void MacroAssembler::lookup_interface_method(Register recv_klass,
 898                                              Register intf_klass,
 899                                              RegisterOrConstant itable_index,
 900                                              Register method_result,
 901                                              Register scan_temp,
 902                                              Label& L_no_such_interface) {
 903   assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
 904   assert(itable_index.is_constant() || itable_index.as_register() == method_result,
 905          "caller must use same register for non-constant itable index as for method");
 906 
 907   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
 908   int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
 909   int itentry_off = itableMethodEntry::method_offset_in_bytes();
 910   int scan_step   = itableOffsetEntry::size() * wordSize;
 911   int vte_size    = vtableEntry::size() * wordSize;
 912   assert(vte_size == wordSize, "else adjust times_vte_scale");
 913 
 914   ldrw(scan_temp, Address(recv_klass, InstanceKlass::vtable_length_offset() * wordSize));
 915 
 916   // %%% Could store the aligned, prescaled offset in the klassoop.
 917   // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
 918   lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
 919   add(scan_temp, scan_temp, vtable_base);
 920   if (HeapWordsPerLong > 1) {
 921     // Round up to align_object_offset boundary
 922     // see code for instanceKlass::start_of_itable!
 923     round_to(scan_temp, BytesPerLong);
 924   }
 925 
 926   // Adjust recv_klass by scaled itable_index, so we can free itable_index.
 927   assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
 928   // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
 929   lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
 930   if (itentry_off)
 931     add(recv_klass, recv_klass, itentry_off);
 932 
 933   // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
 934   //   if (scan->interface() == intf) {
 935   //     result = (klass + scan->offset() + itable_index);
 936   //   }
 937   // }
 938   Label search, found_method;
 939 
 940   for (int peel = 1; peel >= 0; peel--) {
 941     ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
 942     cmp(intf_klass, method_result);
 943 
 944     if (peel) {
 945       br(Assembler::EQ, found_method);
 946     } else {
 947       br(Assembler::NE, search);
 948       // (invert the test to fall through to found_method...)
 949     }
 950 
 951     if (!peel)  break;
 952 
 953     bind(search);
 954 
 955     // Check that the previous entry is non-null.  A null entry means that
 956     // the receiver class doesn't implement the interface, and wasn't the
 957     // same as when the caller was compiled.
 958     cbz(method_result, L_no_such_interface);
 959     add(scan_temp, scan_temp, scan_step);
 960   }
 961 
 962   bind(found_method);
 963 
 964   // Got a hit.
 965   ldr(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
 966   ldr(method_result, Address(recv_klass, scan_temp));
 967 }
 968 
 969 // virtual method calling
 970 void MacroAssembler::lookup_virtual_method(Register recv_klass,
 971                                            RegisterOrConstant vtable_index,
 972                                            Register method_result) {
 973   const int base = InstanceKlass::vtable_start_offset() * wordSize;
 974   assert(vtableEntry::size() * wordSize == 8,
 975          "adjust the scaling in the code below");
 976   int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();
 977 
 978   if (vtable_index.is_register()) {
 979     lea(method_result, Address(recv_klass,
 980                                vtable_index.as_register(),
 981                                Address::lsl(LogBytesPerWord)));
 982     ldr(method_result, Address(method_result, vtable_offset_in_bytes));
 983   } else {
 984     vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
 985     ldr(method_result, Address(recv_klass, vtable_offset_in_bytes));
 986   }
 987 }
 988 
 989 void MacroAssembler::check_klass_subtype(Register sub_klass,
 990                            Register super_klass,
 991                            Register temp_reg,
 992                            Label& L_success) {
 993   Label L_failure;
 994   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
 995   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
 996   bind(L_failure);
 997 }
 998 
 999 
1000 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
1001                                                    Register super_klass,
1002                                                    Register temp_reg,
1003                                                    Label* L_success,
1004                                                    Label* L_failure,
1005                                                    Label* L_slow_path,
1006                                         RegisterOrConstant super_check_offset) {
1007   assert_different_registers(sub_klass, super_klass, temp_reg);
1008   bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
1009   if (super_check_offset.is_register()) {
1010     assert_different_registers(sub_klass, super_klass,
1011                                super_check_offset.as_register());
1012   } else if (must_load_sco) {
1013     assert(temp_reg != noreg, "supply either a temp or a register offset");
1014   }
1015 
1016   Label L_fallthrough;
1017   int label_nulls = 0;
1018   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1019   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1020   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
1021   assert(label_nulls <= 1, "at most one NULL in the batch");
1022 
1023   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1024   int sco_offset = in_bytes(Klass::super_check_offset_offset());
1025   Address super_check_offset_addr(super_klass, sco_offset);
1026 
1027   // Hacked jmp, which may only be used just before L_fallthrough.
1028 #define final_jmp(label)                                                \
1029   if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
1030   else                            b(label)                /*omit semi*/
1031 
1032   // If the pointers are equal, we are done (e.g., String[] elements).
1033   // This self-check enables sharing of secondary supertype arrays among
1034   // non-primary types such as array-of-interface.  Otherwise, each such
1035   // type would need its own customized SSA.
1036   // We move this check to the front of the fast path because many
1037   // type checks are in fact trivially successful in this manner,
1038   // so we get a nicely predicted branch right at the start of the check.
1039   cmp(sub_klass, super_klass);
1040   br(Assembler::EQ, *L_success);
1041 
1042   // Check the supertype display:
1043   if (must_load_sco) {
1044     // Positive movl does right thing on LP64.
1045     ldrw(temp_reg, super_check_offset_addr);
1046     super_check_offset = RegisterOrConstant(temp_reg);
1047   }
1048   Address super_check_addr(sub_klass, super_check_offset);
1049   ldr(rscratch1, super_check_addr);
1050   cmp(super_klass, rscratch1); // load displayed supertype
1051 
1052   // This check has worked decisively for primary supers.
1053   // Secondary supers are sought in the super_cache ('super_cache_addr').
1054   // (Secondary supers are interfaces and very deeply nested subtypes.)
1055   // This works in the same check above because of a tricky aliasing
1056   // between the super_cache and the primary super display elements.
1057   // (The 'super_check_addr' can address either, as the case requires.)
1058   // Note that the cache is updated below if it does not help us find
1059   // what we need immediately.
1060   // So if it was a primary super, we can just fail immediately.
1061   // Otherwise, it's the slow path for us (no success at this point).
1062 
1063   if (super_check_offset.is_register()) {
1064     br(Assembler::EQ, *L_success);
1065     cmp(super_check_offset.as_register(), sc_offset);
1066     if (L_failure == &L_fallthrough) {
1067       br(Assembler::EQ, *L_slow_path);
1068     } else {
1069       br(Assembler::NE, *L_failure);
1070       final_jmp(*L_slow_path);
1071     }
1072   } else if (super_check_offset.as_constant() == sc_offset) {
1073     // Need a slow path; fast failure is impossible.
1074     if (L_slow_path == &L_fallthrough) {
1075       br(Assembler::EQ, *L_success);
1076     } else {
1077       br(Assembler::NE, *L_slow_path);
1078       final_jmp(*L_success);
1079     }
1080   } else {
1081     // No slow path; it's a fast decision.
1082     if (L_failure == &L_fallthrough) {
1083       br(Assembler::EQ, *L_success);
1084     } else {
1085       br(Assembler::NE, *L_failure);
1086       final_jmp(*L_success);
1087     }
1088   }
1089 
1090   bind(L_fallthrough);
1091 
1092 #undef final_jmp
1093 }
1094 
1095 // These two are taken from x86, but they look generally useful
1096 
1097 // scans count pointer sized words at [addr] for occurence of value,
1098 // generic
1099 void MacroAssembler::repne_scan(Register addr, Register value, Register count,
1100                                 Register scratch) {
1101   Label Lloop, Lexit;
1102   cbz(count, Lexit);
1103   bind(Lloop);
1104   ldr(scratch, post(addr, wordSize));
1105   cmp(value, scratch);
1106   br(EQ, Lexit);
1107   sub(count, count, 1);
1108   cbnz(count, Lloop);
1109   bind(Lexit);
1110 }
1111 
1112 // scans count 4 byte words at [addr] for occurence of value,
1113 // generic
1114 void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
1115                                 Register scratch) {
1116   Label Lloop, Lexit;
1117   cbz(count, Lexit);
1118   bind(Lloop);
1119   ldrw(scratch, post(addr, wordSize));
1120   cmpw(value, scratch);
1121   br(EQ, Lexit);
1122   sub(count, count, 1);
1123   cbnz(count, Lloop);
1124   bind(Lexit);
1125 }
1126 
1127 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1128                                                    Register super_klass,
1129                                                    Register temp_reg,
1130                                                    Register temp2_reg,
1131                                                    Label* L_success,
1132                                                    Label* L_failure,
1133                                                    bool set_cond_codes) {
1134   assert_different_registers(sub_klass, super_klass, temp_reg);
1135   if (temp2_reg != noreg)
1136     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1137 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1138 
1139   Label L_fallthrough;
1140   int label_nulls = 0;
1141   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1142   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1143   assert(label_nulls <= 1, "at most one NULL in the batch");
1144 
1145   // a couple of useful fields in sub_klass:
1146   int ss_offset = in_bytes(Klass::secondary_supers_offset());
1147   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1148   Address secondary_supers_addr(sub_klass, ss_offset);
1149   Address super_cache_addr(     sub_klass, sc_offset);
1150 
1151   BLOCK_COMMENT("check_klass_subtype_slow_path");
1152 
1153   // Do a linear scan of the secondary super-klass chain.
1154   // This code is rarely used, so simplicity is a virtue here.
1155   // The repne_scan instruction uses fixed registers, which we must spill.
1156   // Don't worry too much about pre-existing connections with the input regs.
1157 
1158   assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
1159   assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)
1160 
1161   // Get super_klass value into r0 (even if it was in r5 or r2).
1162   RegSet pushed_registers;
1163   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1164   if (!IS_A_TEMP(r5))    pushed_registers += r5;
1165 
1166   if (super_klass != r0 || UseCompressedOops) {
1167     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1168   }
1169 
1170   push(pushed_registers, sp);
1171 
1172 #ifndef PRODUCT
1173   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1174   Address pst_counter_addr(rscratch2);
1175   ldr(rscratch1, pst_counter_addr);
1176   add(rscratch1, rscratch1, 1);
1177   str(rscratch1, pst_counter_addr);
1178 #endif //PRODUCT
1179 
1180   // We will consult the secondary-super array.
1181   ldr(r5, secondary_supers_addr);
1182   // Load the array length.  (Positive movl does right thing on LP64.)
1183   ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1184   // Skip to start of data.
1185   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1186 
1187   cmp(sp, zr); // Clear Z flag; SP is never zero
1188   // Scan R2 words at [R5] for an occurrence of R0.
1189   // Set NZ/Z based on last compare.
1190   repne_scan(r5, r0, r2, rscratch1);
1191 
1192   // Unspill the temp. registers:
1193   pop(pushed_registers, sp);
1194 
1195   br(Assembler::NE, *L_failure);
1196 
1197   // Success.  Cache the super we found and proceed in triumph.
1198   str(super_klass, super_cache_addr);
1199 
1200   if (L_success != &L_fallthrough) {
1201     b(*L_success);
1202   }
1203 
1204 #undef IS_A_TEMP
1205 
1206   bind(L_fallthrough);
1207 }
1208 
1209 
1210 void MacroAssembler::verify_oop(Register reg, const char* s) {
1211   if (!VerifyOops) return;
1212 
1213   // Pass register number to verify_oop_subroutine
1214   const char* b = NULL;
1215   {
1216     ResourceMark rm;
1217     stringStream ss;
1218     ss.print("verify_oop: %s: %s", reg->name(), s);
1219     b = code_string(ss.as_string());
1220   }
1221   BLOCK_COMMENT("verify_oop {");
1222 
1223   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1224   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1225 
1226   mov(r0, reg);
1227   mov(rscratch1, (address)b);
1228 
1229   // call indirectly to solve generation ordering problem
1230   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1231   ldr(rscratch2, Address(rscratch2));
1232   blr(rscratch2);
1233 
1234   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1235   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1236 
1237   BLOCK_COMMENT("} verify_oop");
1238 }
1239 
1240 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1241   if (!VerifyOops) return;
1242 
1243   const char* b = NULL;
1244   {
1245     ResourceMark rm;
1246     stringStream ss;
1247     ss.print("verify_oop_addr: %s", s);
1248     b = code_string(ss.as_string());
1249   }
1250   BLOCK_COMMENT("verify_oop_addr {");
1251 
1252   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1253   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1254 
1255   // addr may contain sp so we will have to adjust it based on the
1256   // pushes that we just did.
1257   if (addr.uses(sp)) {
1258     lea(r0, addr);
1259     ldr(r0, Address(r0, 4 * wordSize));
1260   } else {
1261     ldr(r0, addr);
1262   }
1263   mov(rscratch1, (address)b);
1264 
1265   // call indirectly to solve generation ordering problem
1266   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1267   ldr(rscratch2, Address(rscratch2));
1268   blr(rscratch2);
1269 
1270   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1271   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1272 
1273   BLOCK_COMMENT("} verify_oop_addr");
1274 }
1275 
1276 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1277                                          int extra_slot_offset) {
1278   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1279   int stackElementSize = Interpreter::stackElementSize;
1280   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1281 #ifdef ASSERT
1282   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1283   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1284 #endif
1285   if (arg_slot.is_constant()) {
1286     return Address(esp, arg_slot.as_constant() * stackElementSize
1287                    + offset);
1288   } else {
1289     add(rscratch1, esp, arg_slot.as_register(),
1290         ext::uxtx, exact_log2(stackElementSize));
1291     return Address(rscratch1, offset);
1292   }
1293 }
1294 
1295 void MacroAssembler::call_VM_leaf_base(address entry_point,
1296                                        int number_of_arguments,
1297                                        Label *retaddr) {
1298   call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
1299 }
1300 
1301 void MacroAssembler::call_VM_leaf_base1(address entry_point,
1302                                         int number_of_gp_arguments,
1303                                         int number_of_fp_arguments,
1304                                         ret_type type,
1305                                         Label *retaddr) {
1306   Label E, L;
1307 
1308   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1309 
1310   // We add 1 to number_of_arguments because the thread in arg0 is
1311   // not counted
1312   mov(rscratch1, entry_point);
1313   blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1314   if (retaddr)
1315     bind(*retaddr);
1316 
1317   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1318   maybe_isb();
1319 }
1320 
1321 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1322   call_VM_leaf_base(entry_point, number_of_arguments);
1323 }
1324 
1325 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1326   pass_arg0(this, arg_0);
1327   call_VM_leaf_base(entry_point, 1);
1328 }
1329 
1330 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1331   pass_arg0(this, arg_0);
1332   pass_arg1(this, arg_1);
1333   call_VM_leaf_base(entry_point, 2);
1334 }
1335 
1336 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1337                                   Register arg_1, Register arg_2) {
1338   pass_arg0(this, arg_0);
1339   pass_arg1(this, arg_1);
1340   pass_arg2(this, arg_2);
1341   call_VM_leaf_base(entry_point, 3);
1342 }
1343 
1344 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1345   pass_arg0(this, arg_0);
1346   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1347 }
1348 
1349 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1350 
1351   assert(arg_0 != c_rarg1, "smashed arg");
1352   pass_arg1(this, arg_1);
1353   pass_arg0(this, arg_0);
1354   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1355 }
1356 
1357 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1358   assert(arg_0 != c_rarg2, "smashed arg");
1359   assert(arg_1 != c_rarg2, "smashed arg");
1360   pass_arg2(this, arg_2);
1361   assert(arg_0 != c_rarg1, "smashed arg");
1362   pass_arg1(this, arg_1);
1363   pass_arg0(this, arg_0);
1364   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1365 }
1366 
1367 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1368   assert(arg_0 != c_rarg3, "smashed arg");
1369   assert(arg_1 != c_rarg3, "smashed arg");
1370   assert(arg_2 != c_rarg3, "smashed arg");
1371   pass_arg3(this, arg_3);
1372   assert(arg_0 != c_rarg2, "smashed arg");
1373   assert(arg_1 != c_rarg2, "smashed arg");
1374   pass_arg2(this, arg_2);
1375   assert(arg_0 != c_rarg1, "smashed arg");
1376   pass_arg1(this, arg_1);
1377   pass_arg0(this, arg_0);
1378   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1379 }
1380 
1381 void MacroAssembler::null_check(Register reg, int offset) {
1382   if (needs_explicit_null_check(offset)) {
1383     // provoke OS NULL exception if reg = NULL by
1384     // accessing M[reg] w/o changing any registers
1385     // NOTE: this is plenty to provoke a segv
1386     ldr(zr, Address(reg));
1387   } else {
1388     // nothing to do, (later) access of M[reg + offset]
1389     // will provoke OS NULL exception if reg = NULL
1390   }
1391 }
1392 
1393 // MacroAssembler protected routines needed to implement
1394 // public methods
1395 
1396 void MacroAssembler::mov(Register r, Address dest) {
1397   code_section()->relocate(pc(), dest.rspec());
1398   u_int64_t imm64 = (u_int64_t)dest.target();
1399   movptr(r, imm64);
1400 }
1401 
1402 // Move a constant pointer into r.  In AArch64 mode the virtual
1403 // address space is 48 bits in size, so we only need three
1404 // instructions to create a patchable instruction sequence that can
1405 // reach anywhere.
1406 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1407 #ifndef PRODUCT
1408   {
1409     char buffer[64];
1410     snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64);
1411     block_comment(buffer);
1412   }
1413 #endif
1414   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1415   movz(r, imm64 & 0xffff);
1416   imm64 >>= 16;
1417   movk(r, imm64 & 0xffff, 16);
1418   imm64 >>= 16;
1419   movk(r, imm64 & 0xffff, 32);
1420 }
1421 
1422 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1423 {
1424 #ifndef PRODUCT
1425   {
1426     char buffer[64];
1427     snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64);
1428     block_comment(buffer);
1429   }
1430 #endif
1431   if (operand_valid_for_logical_immediate(false, imm64)) {
1432     orr(dst, zr, imm64);
1433   } else {
1434     // we can use a combination of MOVZ or MOVN with
1435     // MOVK to build up the constant
1436     u_int64_t imm_h[4];
1437     int zero_count = 0;
1438     int neg_count = 0;
1439     int i;
1440     for (i = 0; i < 4; i++) {
1441       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1442       if (imm_h[i] == 0) {
1443         zero_count++;
1444       } else if (imm_h[i] == 0xffffL) {
1445         neg_count++;
1446       }
1447     }
1448     if (zero_count == 4) {
1449       // one MOVZ will do
1450       movz(dst, 0);
1451     } else if (neg_count == 4) {
1452       // one MOVN will do
1453       movn(dst, 0);
1454     } else if (zero_count == 3) {
1455       for (i = 0; i < 4; i++) {
1456         if (imm_h[i] != 0L) {
1457           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1458           break;
1459         }
1460       }
1461     } else if (neg_count == 3) {
1462       // one MOVN will do
1463       for (int i = 0; i < 4; i++) {
1464         if (imm_h[i] != 0xffffL) {
1465           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1466           break;
1467         }
1468       }
1469     } else if (zero_count == 2) {
1470       // one MOVZ and one MOVK will do
1471       for (i = 0; i < 3; i++) {
1472         if (imm_h[i] != 0L) {
1473           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1474           i++;
1475           break;
1476         }
1477       }
1478       for (;i < 4; i++) {
1479         if (imm_h[i] != 0L) {
1480           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1481         }
1482       }
1483     } else if (neg_count == 2) {
1484       // one MOVN and one MOVK will do
1485       for (i = 0; i < 4; i++) {
1486         if (imm_h[i] != 0xffffL) {
1487           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1488           i++;
1489           break;
1490         }
1491       }
1492       for (;i < 4; i++) {
1493         if (imm_h[i] != 0xffffL) {
1494           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1495         }
1496       }
1497     } else if (zero_count == 1) {
1498       // one MOVZ and two MOVKs will do
1499       for (i = 0; i < 4; i++) {
1500         if (imm_h[i] != 0L) {
1501           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1502           i++;
1503           break;
1504         }
1505       }
1506       for (;i < 4; i++) {
1507         if (imm_h[i] != 0x0L) {
1508           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1509         }
1510       }
1511     } else if (neg_count == 1) {
1512       // one MOVN and two MOVKs will do
1513       for (i = 0; i < 4; i++) {
1514         if (imm_h[i] != 0xffffL) {
1515           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1516           i++;
1517           break;
1518         }
1519       }
1520       for (;i < 4; i++) {
1521         if (imm_h[i] != 0xffffL) {
1522           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1523         }
1524       }
1525     } else {
1526       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1527       movz(dst, (u_int32_t)imm_h[0], 0);
1528       for (i = 1; i < 4; i++) {
1529         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1530       }
1531     }
1532   }
1533 }
1534 
1535 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1536 {
1537 #ifndef PRODUCT
1538     {
1539       char buffer[64];
1540       snprintf(buffer, sizeof(buffer), "0x%"PRIX32, imm32);
1541       block_comment(buffer);
1542     }
1543 #endif
1544   if (operand_valid_for_logical_immediate(true, imm32)) {
1545     orrw(dst, zr, imm32);
1546   } else {
1547     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1548     // constant
1549     u_int32_t imm_h[2];
1550     imm_h[0] = imm32 & 0xffff;
1551     imm_h[1] = ((imm32 >> 16) & 0xffff);
1552     if (imm_h[0] == 0) {
1553       movzw(dst, imm_h[1], 16);
1554     } else if (imm_h[0] == 0xffff) {
1555       movnw(dst, imm_h[1] ^ 0xffff, 16);
1556     } else if (imm_h[1] == 0) {
1557       movzw(dst, imm_h[0], 0);
1558     } else if (imm_h[1] == 0xffff) {
1559       movnw(dst, imm_h[0] ^ 0xffff, 0);
1560     } else {
1561       // use a MOVZ and MOVK (makes it easier to debug)
1562       movzw(dst, imm_h[0], 0);
1563       movkw(dst, imm_h[1], 16);
1564     }
1565   }
1566 }
1567 
1568 // Form an address from base + offset in Rd.  Rd may or may
1569 // not actually be used: you must use the Address that is returned.
1570 // It is up to you to ensure that the shift provided matches the size
1571 // of your data.
1572 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1573   if (Address::offset_ok_for_immed(byte_offset, shift))
1574     // It fits; no need for any heroics
1575     return Address(base, byte_offset);
1576 
1577   // Don't do anything clever with negative or misaligned offsets
1578   unsigned mask = (1 << shift) - 1;
1579   if (byte_offset < 0 || byte_offset & mask) {
1580     mov(Rd, byte_offset);
1581     add(Rd, base, Rd);
1582     return Address(Rd);
1583   }
1584 
1585   // See if we can do this with two 12-bit offsets
1586   {
1587     unsigned long word_offset = byte_offset >> shift;
1588     unsigned long masked_offset = word_offset & 0xfff000;
1589     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1590         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1591       add(Rd, base, masked_offset << shift);
1592       word_offset -= masked_offset;
1593       return Address(Rd, word_offset << shift);
1594     }
1595   }
1596 
1597   // Do it the hard way
1598   mov(Rd, byte_offset);
1599   add(Rd, base, Rd);
1600   return Address(Rd);
1601 }
1602 
1603 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp) {
1604   Label retry_load;
1605   bind(retry_load);
1606   // flush and load exclusive from the memory location
1607   ldxrw(tmp, counter_addr);
1608   addw(tmp, tmp, 1);
1609   // if we store+flush with no intervening write tmp wil be zero
1610   stxrw(tmp, tmp, counter_addr);
1611   cbnzw(tmp, retry_load);
1612 }
1613 
1614 
1615 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1616                                     bool want_remainder, Register scratch)
1617 {
1618   // Full implementation of Java idiv and irem.  The function
1619   // returns the (pc) offset of the div instruction - may be needed
1620   // for implicit exceptions.
1621   //
1622   // constraint : ra/rb =/= scratch
1623   //         normal case
1624   //
1625   // input : ra: dividend
1626   //         rb: divisor
1627   //
1628   // result: either
1629   //         quotient  (= ra idiv rb)
1630   //         remainder (= ra irem rb)
1631 
1632   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1633 
1634   int idivl_offset = offset();
1635   if (! want_remainder) {
1636     sdivw(result, ra, rb);
1637   } else {
1638     sdivw(scratch, ra, rb);
1639     msubw(result, scratch, rb, ra);
1640   }
1641 
1642   return idivl_offset;
1643 }
1644 
1645 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1646                                     bool want_remainder, Register scratch)
1647 {
1648   // Full implementation of Java ldiv and lrem.  The function
1649   // returns the (pc) offset of the div instruction - may be needed
1650   // for implicit exceptions.
1651   //
1652   // constraint : ra/rb =/= scratch
1653   //         normal case
1654   //
1655   // input : ra: dividend
1656   //         rb: divisor
1657   //
1658   // result: either
1659   //         quotient  (= ra idiv rb)
1660   //         remainder (= ra irem rb)
1661 
1662   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1663 
1664   int idivq_offset = offset();
1665   if (! want_remainder) {
1666     sdiv(result, ra, rb);
1667   } else {
1668     sdiv(scratch, ra, rb);
1669     msub(result, scratch, rb, ra);
1670   }
1671 
1672   return idivq_offset;
1673 }
1674 
1675 // MacroAssembler routines found actually to be needed
1676 
1677 void MacroAssembler::push(Register src)
1678 {
1679   str(src, Address(pre(esp, -1 * wordSize)));
1680 }
1681 
1682 void MacroAssembler::pop(Register dst)
1683 {
1684   ldr(dst, Address(post(esp, 1 * wordSize)));
1685 }
1686 
1687 // Note: load_unsigned_short used to be called load_unsigned_word.
1688 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1689   int off = offset();
1690   ldrh(dst, src);
1691   return off;
1692 }
1693 
1694 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1695   int off = offset();
1696   ldrb(dst, src);
1697   return off;
1698 }
1699 
1700 int MacroAssembler::load_signed_short(Register dst, Address src) {
1701   int off = offset();
1702   ldrsh(dst, src);
1703   return off;
1704 }
1705 
1706 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1707   int off = offset();
1708   ldrsb(dst, src);
1709   return off;
1710 }
1711 
1712 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1713   int off = offset();
1714   ldrshw(dst, src);
1715   return off;
1716 }
1717 
1718 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1719   int off = offset();
1720   ldrsbw(dst, src);
1721   return off;
1722 }
1723 
1724 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1725   switch (size_in_bytes) {
1726   case  8:  ldr(dst, src); break;
1727   case  4:  ldrw(dst, src); break;
1728   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1729   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1730   default:  ShouldNotReachHere();
1731   }
1732 }
1733 
1734 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1735   switch (size_in_bytes) {
1736   case  8:  str(src, dst); break;
1737   case  4:  strw(src, dst); break;
1738   case  2:  strh(src, dst); break;
1739   case  1:  strb(src, dst); break;
1740   default:  ShouldNotReachHere();
1741   }
1742 }
1743 
1744 void MacroAssembler::decrementw(Register reg, int value)
1745 {
1746   if (value < 0)  { incrementw(reg, -value);      return; }
1747   if (value == 0) {                               return; }
1748   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1749   /* else */ {
1750     guarantee(reg != rscratch2, "invalid dst for register decrement");
1751     movw(rscratch2, (unsigned)value);
1752     subw(reg, reg, rscratch2);
1753   }
1754 }
1755 
1756 void MacroAssembler::decrement(Register reg, int value)
1757 {
1758   if (value < 0)  { increment(reg, -value);      return; }
1759   if (value == 0) {                              return; }
1760   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1761   /* else */ {
1762     assert(reg != rscratch2, "invalid dst for register decrement");
1763     mov(rscratch2, (unsigned long)value);
1764     sub(reg, reg, rscratch2);
1765   }
1766 }
1767 
1768 void MacroAssembler::decrementw(Address dst, int value)
1769 {
1770   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1771   ldrw(rscratch1, dst);
1772   decrementw(rscratch1, value);
1773   strw(rscratch1, dst);
1774 }
1775 
1776 void MacroAssembler::decrement(Address dst, int value)
1777 {
1778   assert(!dst.uses(rscratch1), "invalid address for decrement");
1779   ldr(rscratch1, dst);
1780   decrement(rscratch1, value);
1781   str(rscratch1, dst);
1782 }
1783 
1784 void MacroAssembler::incrementw(Register reg, int value)
1785 {
1786   if (value < 0)  { decrementw(reg, -value);      return; }
1787   if (value == 0) {                               return; }
1788   if (value < (1 << 12)) { addw(reg, reg, value); return; }
1789   /* else */ {
1790     assert(reg != rscratch2, "invalid dst for register increment");
1791     movw(rscratch2, (unsigned)value);
1792     addw(reg, reg, rscratch2);
1793   }
1794 }
1795 
1796 void MacroAssembler::increment(Register reg, int value)
1797 {
1798   if (value < 0)  { decrement(reg, -value);      return; }
1799   if (value == 0) {                              return; }
1800   if (value < (1 << 12)) { add(reg, reg, value); return; }
1801   /* else */ {
1802     assert(reg != rscratch2, "invalid dst for register increment");
1803     movw(rscratch2, (unsigned)value);
1804     add(reg, reg, rscratch2);
1805   }
1806 }
1807 
1808 void MacroAssembler::incrementw(Address dst, int value)
1809 {
1810   assert(!dst.uses(rscratch1), "invalid dst for address increment");
1811   ldrw(rscratch1, dst);
1812   incrementw(rscratch1, value);
1813   strw(rscratch1, dst);
1814 }
1815 
1816 void MacroAssembler::increment(Address dst, int value)
1817 {
1818   assert(!dst.uses(rscratch1), "invalid dst for address increment");
1819   ldr(rscratch1, dst);
1820   increment(rscratch1, value);
1821   str(rscratch1, dst);
1822 }
1823 
1824 
1825 void MacroAssembler::pusha() {
1826   push(0x7fffffff, sp);
1827 }
1828 
1829 void MacroAssembler::popa() {
1830   pop(0x7fffffff, sp);
1831 }
1832 
1833 // Push lots of registers in the bit set supplied.  Don't push sp.
1834 // Return the number of words pushed
1835 int MacroAssembler::push(unsigned int bitset, Register stack) {
1836   int words_pushed = 0;
1837 
1838   // Scan bitset to accumulate register pairs
1839   unsigned char regs[32];
1840   int count = 0;
1841   for (int reg = 0; reg <= 30; reg++) {
1842     if (1 & bitset)
1843       regs[count++] = reg;
1844     bitset >>= 1;
1845   }
1846   regs[count++] = zr->encoding_nocheck();
1847   count &= ~1;  // Only push an even nuber of regs
1848 
1849   if (count) {
1850     stp(as_Register(regs[0]), as_Register(regs[1]),
1851        Address(pre(stack, -count * wordSize)));
1852     words_pushed += 2;
1853   }
1854   for (int i = 2; i < count; i += 2) {
1855     stp(as_Register(regs[i]), as_Register(regs[i+1]),
1856        Address(stack, i * wordSize));
1857     words_pushed += 2;
1858   }
1859 
1860   assert(words_pushed == count, "oops, pushed != count");
1861 
1862   return count;
1863 }
1864 
1865 int MacroAssembler::pop(unsigned int bitset, Register stack) {
1866   int words_pushed = 0;
1867 
1868   // Scan bitset to accumulate register pairs
1869   unsigned char regs[32];
1870   int count = 0;
1871   for (int reg = 0; reg <= 30; reg++) {
1872     if (1 & bitset)
1873       regs[count++] = reg;
1874     bitset >>= 1;
1875   }
1876   regs[count++] = zr->encoding_nocheck();
1877   count &= ~1;
1878 
1879   for (int i = 2; i < count; i += 2) {
1880     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
1881        Address(stack, i * wordSize));
1882     words_pushed += 2;
1883   }
1884   if (count) {
1885     ldp(as_Register(regs[0]), as_Register(regs[1]),
1886        Address(post(stack, count * wordSize)));
1887     words_pushed += 2;
1888   }
1889 
1890   assert(words_pushed == count, "oops, pushed != count");
1891 
1892   return count;
1893 }
1894 #ifdef ASSERT
1895 void MacroAssembler::verify_heapbase(const char* msg) {
1896 #if 0
1897   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
1898   assert (Universe::heap() != NULL, "java heap should be initialized");
1899   if (CheckCompressedOops) {
1900     Label ok;
1901     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
1902     cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
1903     br(Assembler::EQ, ok);
1904     stop(msg);
1905     bind(ok);
1906     pop(1 << rscratch1->encoding(), sp);
1907   }
1908 #endif
1909 }
1910 #endif
1911 
1912 void MacroAssembler::stop(const char* msg) {
1913   address ip = pc();
1914   pusha();
1915   mov(c_rarg0, (address)msg);
1916   mov(c_rarg1, (address)ip);
1917   mov(c_rarg2, sp);
1918   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
1919   // call(c_rarg3);
1920   blrt(c_rarg3, 3, 0, 1);
1921   hlt(0);
1922 }
1923 
1924 // If a constant does not fit in an immediate field, generate some
1925 // number of MOV instructions and then perform the operation.
1926 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
1927                                            add_sub_imm_insn insn1,
1928                                            add_sub_reg_insn insn2) {
1929   assert(Rd != zr, "Rd = zr and not setting flags?");
1930   if (operand_valid_for_add_sub_immediate((int)imm)) {
1931     (this->*insn1)(Rd, Rn, imm);
1932   } else {
1933     if (uabs(imm) < (1 << 24)) {
1934        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
1935        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
1936     } else {
1937        assert_different_registers(Rd, Rn);
1938        mov(Rd, (uint64_t)imm);
1939        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
1940     }
1941   }
1942 }
1943 
1944 // Seperate vsn which sets the flags. Optimisations are more restricted
1945 // because we must set the flags correctly.
1946 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
1947                                            add_sub_imm_insn insn1,
1948                                            add_sub_reg_insn insn2) {
1949   if (operand_valid_for_add_sub_immediate((int)imm)) {
1950     (this->*insn1)(Rd, Rn, imm);
1951   } else {
1952     assert_different_registers(Rd, Rn);
1953     assert(Rd != zr, "overflow in immediate operand");
1954     mov(Rd, (uint64_t)imm);
1955     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
1956   }
1957 }
1958 
1959 
1960 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
1961   if (increment.is_register()) {
1962     add(Rd, Rn, increment.as_register());
1963   } else {
1964     add(Rd, Rn, increment.as_constant());
1965   }
1966 }
1967 
1968 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
1969   if (increment.is_register()) {
1970     addw(Rd, Rn, increment.as_register());
1971   } else {
1972     addw(Rd, Rn, increment.as_constant());
1973   }
1974 }
1975 
1976 void MacroAssembler::reinit_heapbase()
1977 {
1978   if (UseCompressedOops) {
1979     if (Universe::is_fully_initialized()) {
1980       mov(rheapbase, Universe::narrow_ptrs_base());
1981     } else {
1982       lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
1983       ldr(rheapbase, Address(rheapbase));
1984     }
1985   }
1986 }
1987 
1988 // this simulates the behaviour of the x86 cmpxchg instruction using a
1989 // load linked/store conditional pair. we use the acquire/release
1990 // versions of these instructions so that we flush pending writes as
1991 // per Java semantics.
1992 
1993 // n.b the x86 version assumes the old value to be compared against is
1994 // in rax and updates rax with the value located in memory if the
1995 // cmpxchg fails. we supply a register for the old value explicitly
1996 
1997 // the aarch64 load linked/store conditional instructions do not
1998 // accept an offset. so, unlike x86, we must provide a plain register
1999 // to identify the memory word to be compared/exchanged rather than a
2000 // register+offset Address.
2001 
2002 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2003                                 Label &succeed, Label *fail) {
2004   // oldv holds comparison value
2005   // newv holds value to write in exchange
2006   // addr identifies memory word to compare against/update
2007   // tmp returns 0/1 for success/failure
2008   Label retry_load, nope;
2009   
2010   bind(retry_load);
2011   // flush and load exclusive from the memory location
2012   // and fail if it is not what we expect
2013   ldaxr(tmp, addr);
2014   cmp(tmp, oldv);
2015   br(Assembler::NE, nope);
2016   // if we store+flush with no intervening write tmp wil be zero
2017   stlxr(tmp, newv, addr);
2018   cbzw(tmp, succeed);
2019   // retry so we only ever return after a load fails to compare
2020   // ensures we don't return a stale value after a failed write.
2021   b(retry_load);
2022   // if the memory word differs we return it in oldv and signal a fail
2023   bind(nope);
2024   membar(AnyAny);
2025   mov(oldv, tmp);
2026   if (fail)
2027     b(*fail);
2028 }
2029 
2030 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2031                                 Label &succeed, Label *fail) {
2032   // oldv holds comparison value
2033   // newv holds value to write in exchange
2034   // addr identifies memory word to compare against/update
2035   // tmp returns 0/1 for success/failure
2036   Label retry_load, nope;
2037   
2038   bind(retry_load);
2039   // flush and load exclusive from the memory location
2040   // and fail if it is not what we expect
2041   ldaxrw(tmp, addr);
2042   cmp(tmp, oldv);
2043   br(Assembler::NE, nope);
2044   // if we store+flush with no intervening write tmp wil be zero
2045   stlxrw(tmp, newv, addr);
2046   cbzw(tmp, succeed);
2047   // retry so we only ever return after a load fails to compare
2048   // ensures we don't return a stale value after a failed write.
2049   b(retry_load);
2050   // if the memory word differs we return it in oldv and signal a fail
2051   bind(nope);
2052   membar(AnyAny);
2053   mov(oldv, tmp);
2054   if (fail)
2055     b(*fail);
2056 }
2057 
2058 static bool different(Register a, RegisterOrConstant b, Register c) {
2059   if (b.is_constant())
2060     return a != c;
2061   else
2062     return a != b.as_register() && a != c && b.as_register() != c;
2063 }
2064 
2065 #define ATOMIC_OP(LDXR, OP, STXR)                                       \
2066 void MacroAssembler::atomic_##OP(Register prev, RegisterOrConstant incr, Register addr) { \
2067   Register result = rscratch2;                                          \
2068   if (prev->is_valid())                                                 \
2069     result = different(prev, incr, addr) ? prev : rscratch2;            \
2070                                                                         \
2071   Label retry_load;                                                     \
2072   bind(retry_load);                                                     \
2073   LDXR(result, addr);                                                   \
2074   OP(rscratch1, result, incr);                                          \
2075   STXR(rscratch1, rscratch1, addr);                                     \
2076   cbnzw(rscratch1, retry_load);                                         \
2077   if (prev->is_valid() && prev != result)                               \
2078     mov(prev, result);                                                  \
2079 }
2080 
2081 ATOMIC_OP(ldxr, add, stxr)
2082 ATOMIC_OP(ldxrw, addw, stxrw)
2083 
2084 #undef ATOMIC_OP
2085 
2086 #define ATOMIC_XCHG(OP, LDXR, STXR)                                     \
2087 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2088   Register result = rscratch2;                                          \
2089   if (prev->is_valid())                                                 \
2090     result = different(prev, newv, addr) ? prev : rscratch2;            \
2091                                                                         \
2092   Label retry_load;                                                     \
2093   bind(retry_load);                                                     \
2094   LDXR(result, addr);                                                   \
2095   STXR(rscratch1, newv, addr);                                          \
2096   cbnzw(rscratch1, retry_load);                                         \
2097   if (prev->is_valid() && prev != result)                               \
2098     mov(prev, result);                                                  \
2099 }
2100 
2101 ATOMIC_XCHG(xchg, ldxr, stxr)
2102 ATOMIC_XCHG(xchgw, ldxrw, stxrw)
2103 
2104 #undef ATOMIC_XCHG
2105 
2106 void MacroAssembler::incr_allocated_bytes(Register thread,
2107                                           Register var_size_in_bytes,
2108                                           int con_size_in_bytes,
2109                                           Register t1) {
2110   if (!thread->is_valid()) {
2111     thread = rthread;
2112   }
2113   assert(t1->is_valid(), "need temp reg");
2114 
2115   ldr(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
2116   if (var_size_in_bytes->is_valid()) {
2117     add(t1, t1, var_size_in_bytes);
2118   } else {
2119     add(t1, t1, con_size_in_bytes);
2120   }
2121   str(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
2122 }
2123 
2124 #ifndef PRODUCT
2125 extern "C" void findpc(intptr_t x);
2126 #endif
2127 
2128 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2129 {
2130   // In order to get locks to work, we need to fake a in_VM state
2131   if (ShowMessageBoxOnError ) {
2132     JavaThread* thread = JavaThread::current();
2133     JavaThreadState saved_state = thread->thread_state();
2134     thread->set_thread_state(_thread_in_vm);
2135 #ifndef PRODUCT
2136     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2137       ttyLocker ttyl;
2138       BytecodeCounter::print();
2139     }
2140 #endif
2141     if (os::message_box(msg, "Execution stopped, print registers?")) {
2142       ttyLocker ttyl;
2143       tty->print_cr(" pc = 0x%016lx", pc);
2144 #ifndef PRODUCT
2145       tty->cr();
2146       findpc(pc);
2147       tty->cr();
2148 #endif
2149       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2150       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2151       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2152       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2153       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2154       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2155       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2156       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2157       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2158       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2159       tty->print_cr("r10 = 0x%016lx", regs[10]);
2160       tty->print_cr("r11 = 0x%016lx", regs[11]);
2161       tty->print_cr("r12 = 0x%016lx", regs[12]);
2162       tty->print_cr("r13 = 0x%016lx", regs[13]);
2163       tty->print_cr("r14 = 0x%016lx", regs[14]);
2164       tty->print_cr("r15 = 0x%016lx", regs[15]);
2165       tty->print_cr("r16 = 0x%016lx", regs[16]);
2166       tty->print_cr("r17 = 0x%016lx", regs[17]);
2167       tty->print_cr("r18 = 0x%016lx", regs[18]);
2168       tty->print_cr("r19 = 0x%016lx", regs[19]);
2169       tty->print_cr("r20 = 0x%016lx", regs[20]);
2170       tty->print_cr("r21 = 0x%016lx", regs[21]);
2171       tty->print_cr("r22 = 0x%016lx", regs[22]);
2172       tty->print_cr("r23 = 0x%016lx", regs[23]);
2173       tty->print_cr("r24 = 0x%016lx", regs[24]);
2174       tty->print_cr("r25 = 0x%016lx", regs[25]);
2175       tty->print_cr("r26 = 0x%016lx", regs[26]);
2176       tty->print_cr("r27 = 0x%016lx", regs[27]);
2177       tty->print_cr("r28 = 0x%016lx", regs[28]);
2178       tty->print_cr("r30 = 0x%016lx", regs[30]);
2179       tty->print_cr("r31 = 0x%016lx", regs[31]);
2180       BREAKPOINT;
2181     }
2182     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2183   } else {
2184     ttyLocker ttyl;
2185     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2186                     msg);
2187     assert(false, err_msg("DEBUG MESSAGE: %s", msg));
2188   }
2189 }
2190 
2191 #ifdef BUILTIN_SIM
2192 // routine to generate an x86 prolog for a stub function which
2193 // bootstraps into the generated ARM code which directly follows the
2194 // stub
2195 //
2196 // the argument encodes the number of general and fp registers
2197 // passed by the caller and the callng convention (currently just
2198 // the number of general registers and assumes C argument passing)
2199 
2200 extern "C" {
2201 int aarch64_stub_prolog_size();
2202 void aarch64_stub_prolog();
2203 void aarch64_prolog();
2204 }
2205 
2206 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
2207                                    address *prolog_ptr)
2208 {
2209   int calltype = (((ret_type & 0x3) << 8) |
2210                   ((fp_arg_count & 0xf) << 4) |
2211                   (gp_arg_count & 0xf));
2212 
2213   // the addresses for the x86 to ARM entry code we need to use
2214   address start = pc();
2215   // printf("start = %lx\n", start);
2216   int byteCount =  aarch64_stub_prolog_size();
2217   // printf("byteCount = %x\n", byteCount);
2218   int instructionCount = (byteCount + 3)/ 4;
2219   // printf("instructionCount = %x\n", instructionCount);
2220   for (int i = 0; i < instructionCount; i++) {
2221     nop();
2222   }
2223 
2224   memcpy(start, (void*)aarch64_stub_prolog, byteCount);
2225 
2226   // write the address of the setup routine and the call format at the
2227   // end of into the copied code
2228   u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
2229   if (prolog_ptr)
2230     patch_end[-2] = (u_int64_t)prolog_ptr;
2231   patch_end[-1] = calltype;
2232 }
2233 #endif
2234 
2235 void MacroAssembler::push_CPU_state() {
2236     push(0x3fffffff, sp);         // integer registers except lr & sp
2237 
2238     for (int i = 30; i >= 0; i -= 2)
2239       stpd(as_FloatRegister(i), as_FloatRegister(i+1),
2240            Address(pre(sp, -2 * wordSize)));
2241 }
2242 
2243 void MacroAssembler::pop_CPU_state() {
2244   for (int i = 0; i < 32; i += 2)
2245     ldpd(as_FloatRegister(i), as_FloatRegister(i+1),
2246          Address(post(sp, 2 * wordSize)));
2247 
2248   pop(0x3fffffff, sp);         // integer registers except lr & sp
2249 }
2250 
2251 /**
2252  * Emits code to update CRC-32 with a byte value according to constants in table
2253  *
2254  * @param [in,out]crc   Register containing the crc.
2255  * @param [in]val       Register containing the byte to fold into the CRC.
2256  * @param [in]table     Register containing the table of crc constants.
2257  *
2258  * uint32_t crc;
2259  * val = crc_table[(val ^ crc) & 0xFF];
2260  * crc = val ^ (crc >> 8);
2261  *
2262  */
2263 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
2264   eor(val, val, crc);
2265   andr(val, val, 0xff);
2266   ldrw(val, Address(table, val, Address::lsl(2)));
2267   eor(crc, val, crc, Assembler::LSR, 8);
2268 }
2269 
2270 /**
2271  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
2272  *
2273  * @param [in,out]crc   Register containing the crc.
2274  * @param [in]v         Register containing the 32-bit to fold into the CRC.
2275  * @param [in]table0    Register containing table 0 of crc constants.
2276  * @param [in]table1    Register containing table 1 of crc constants.
2277  * @param [in]table2    Register containing table 2 of crc constants.
2278  * @param [in]table3    Register containing table 3 of crc constants.
2279  *
2280  * uint32_t crc;
2281  *   v = crc ^ v
2282  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
2283  *
2284  */
2285 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
2286         Register table0, Register table1, Register table2, Register table3,
2287         bool upper) {
2288   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
2289   uxtb(tmp, v);
2290   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
2291   ubfx(tmp, v, 8, 8);
2292   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
2293   eor(crc, crc, tmp);
2294   ubfx(tmp, v, 16, 8);
2295   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
2296   eor(crc, crc, tmp);
2297   ubfx(tmp, v, 24, 8);
2298   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
2299   eor(crc, crc, tmp);
2300 }
2301 
2302 /**
2303  * @param crc   register containing existing CRC (32-bit)
2304  * @param buf   register pointing to input byte buffer (byte*)
2305  * @param len   register containing number of bytes
2306  * @param table register that will contain address of CRC table
2307  * @param tmp   scratch register
2308  */
2309 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
2310         Register table0, Register table1, Register table2, Register table3,
2311         Register tmp, Register tmp2, Register tmp3) {
2312   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
2313   unsigned long offset;
2314 
2315     ornw(crc, zr, crc);
2316 
2317   if (UseCRC32) {
2318     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop;
2319 
2320       subs(len, len, 64);
2321       br(Assembler::GE, CRC_by64_loop);
2322       adds(len, len, 64-4);
2323       br(Assembler::GE, CRC_by4_loop);
2324       adds(len, len, 4);
2325       br(Assembler::GT, CRC_by1_loop);
2326       b(L_exit);
2327 
2328     BIND(CRC_by4_loop);
2329       ldrw(tmp, Address(post(buf, 4)));
2330       subs(len, len, 4);
2331       crc32w(crc, crc, tmp);
2332       br(Assembler::GE, CRC_by4_loop);
2333       adds(len, len, 4);
2334       br(Assembler::LE, L_exit);
2335     BIND(CRC_by1_loop);
2336       ldrb(tmp, Address(post(buf, 1)));
2337       subs(len, len, 1);
2338       crc32b(crc, crc, tmp);
2339       br(Assembler::GT, CRC_by1_loop);
2340       b(L_exit);
2341 
2342       align(CodeEntryAlignment);
2343     BIND(CRC_by64_loop);
2344       subs(len, len, 64);
2345       ldp(tmp, tmp3, Address(post(buf, 16)));
2346       crc32x(crc, crc, tmp);
2347       crc32x(crc, crc, tmp3);
2348       ldp(tmp, tmp3, Address(post(buf, 16)));
2349       crc32x(crc, crc, tmp);
2350       crc32x(crc, crc, tmp3);
2351       ldp(tmp, tmp3, Address(post(buf, 16)));
2352       crc32x(crc, crc, tmp);
2353       crc32x(crc, crc, tmp3);
2354       ldp(tmp, tmp3, Address(post(buf, 16)));
2355       crc32x(crc, crc, tmp);
2356       crc32x(crc, crc, tmp3);
2357       br(Assembler::GE, CRC_by64_loop);
2358       adds(len, len, 64-4);
2359       br(Assembler::GE, CRC_by4_loop);
2360       adds(len, len, 4);
2361       br(Assembler::GT, CRC_by1_loop);
2362     BIND(L_exit);
2363       ornw(crc, zr, crc);
2364       return;
2365   }
2366 
2367     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
2368     if (offset) add(table0, table0, offset);
2369     add(table1, table0, 1*256*sizeof(juint));
2370     add(table2, table0, 2*256*sizeof(juint));
2371     add(table3, table0, 3*256*sizeof(juint));
2372 
2373   if (UseNeon) {
2374       cmp(len, 64);
2375       br(Assembler::LT, L_by16);
2376       eor(v16, T16B, v16, v16);
2377 
2378     Label L_fold;
2379 
2380       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
2381 
2382       ld1(v0, v1, T2D, post(buf, 32));
2383       ld1r(v4, T2D, post(tmp, 8));
2384       ld1r(v5, T2D, post(tmp, 8));
2385       ld1r(v6, T2D, post(tmp, 8));
2386       ld1r(v7, T2D, post(tmp, 8));
2387       mov(v16, T4S, 0, crc);
2388 
2389       eor(v0, T16B, v0, v16);
2390       sub(len, len, 64);
2391 
2392     BIND(L_fold);
2393       pmull(v22, T8H, v0, v5, T8B);
2394       pmull(v20, T8H, v0, v7, T8B);
2395       pmull(v23, T8H, v0, v4, T8B);
2396       pmull(v21, T8H, v0, v6, T8B);
2397 
2398       pmull2(v18, T8H, v0, v5, T16B);
2399       pmull2(v16, T8H, v0, v7, T16B);
2400       pmull2(v19, T8H, v0, v4, T16B);
2401       pmull2(v17, T8H, v0, v6, T16B);
2402 
2403       uzp1(v24, v20, v22, T8H);
2404       uzp2(v25, v20, v22, T8H);
2405       eor(v20, T16B, v24, v25);
2406 
2407       uzp1(v26, v16, v18, T8H);
2408       uzp2(v27, v16, v18, T8H);
2409       eor(v16, T16B, v26, v27);
2410 
2411       ushll2(v22, T4S, v20, T8H, 8);
2412       ushll(v20, T4S, v20, T4H, 8);
2413 
2414       ushll2(v18, T4S, v16, T8H, 8);
2415       ushll(v16, T4S, v16, T4H, 8);
2416 
2417       eor(v22, T16B, v23, v22);
2418       eor(v18, T16B, v19, v18);
2419       eor(v20, T16B, v21, v20);
2420       eor(v16, T16B, v17, v16);
2421 
2422       uzp1(v17, v16, v20, T2D);
2423       uzp2(v21, v16, v20, T2D);
2424       eor(v17, T16B, v17, v21);
2425 
2426       ushll2(v20, T2D, v17, T4S, 16);
2427       ushll(v16, T2D, v17, T2S, 16);
2428 
2429       eor(v20, T16B, v20, v22);
2430       eor(v16, T16B, v16, v18);
2431 
2432       uzp1(v17, v20, v16, T2D);
2433       uzp2(v21, v20, v16, T2D);
2434       eor(v28, T16B, v17, v21);
2435 
2436       pmull(v22, T8H, v1, v5, T8B);
2437       pmull(v20, T8H, v1, v7, T8B);
2438       pmull(v23, T8H, v1, v4, T8B);
2439       pmull(v21, T8H, v1, v6, T8B);
2440 
2441       pmull2(v18, T8H, v1, v5, T16B);
2442       pmull2(v16, T8H, v1, v7, T16B);
2443       pmull2(v19, T8H, v1, v4, T16B);
2444       pmull2(v17, T8H, v1, v6, T16B);
2445 
2446       ld1(v0, v1, T2D, post(buf, 32));
2447 
2448       uzp1(v24, v20, v22, T8H);
2449       uzp2(v25, v20, v22, T8H);
2450       eor(v20, T16B, v24, v25);
2451 
2452       uzp1(v26, v16, v18, T8H);
2453       uzp2(v27, v16, v18, T8H);
2454       eor(v16, T16B, v26, v27);
2455 
2456       ushll2(v22, T4S, v20, T8H, 8);
2457       ushll(v20, T4S, v20, T4H, 8);
2458 
2459       ushll2(v18, T4S, v16, T8H, 8);
2460       ushll(v16, T4S, v16, T4H, 8);
2461 
2462       eor(v22, T16B, v23, v22);
2463       eor(v18, T16B, v19, v18);
2464       eor(v20, T16B, v21, v20);
2465       eor(v16, T16B, v17, v16);
2466 
2467       uzp1(v17, v16, v20, T2D);
2468       uzp2(v21, v16, v20, T2D);
2469       eor(v16, T16B, v17, v21);
2470 
2471       ushll2(v20, T2D, v16, T4S, 16);
2472       ushll(v16, T2D, v16, T2S, 16);
2473 
2474       eor(v20, T16B, v22, v20);
2475       eor(v16, T16B, v16, v18);
2476 
2477       uzp1(v17, v20, v16, T2D);
2478       uzp2(v21, v20, v16, T2D);
2479       eor(v20, T16B, v17, v21);
2480 
2481       shl(v16, v28, T2D, 1);
2482       shl(v17, v20, T2D, 1);
2483 
2484       eor(v0, T16B, v0, v16);
2485       eor(v1, T16B, v1, v17);
2486 
2487       subs(len, len, 32);
2488       br(Assembler::GE, L_fold);
2489 
2490       mov(crc, 0);
2491       mov(tmp, v0, T1D, 0);
2492       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
2493       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
2494       mov(tmp, v0, T1D, 1);
2495       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
2496       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
2497       mov(tmp, v1, T1D, 0);
2498       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
2499       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
2500       mov(tmp, v1, T1D, 1);
2501       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
2502       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
2503 
2504       add(len, len, 32);
2505   }
2506 
2507   BIND(L_by16);
2508     subs(len, len, 16);
2509     br(Assembler::GE, L_by16_loop);
2510     adds(len, len, 16-4);
2511     br(Assembler::GE, L_by4_loop);
2512     adds(len, len, 4);
2513     br(Assembler::GT, L_by1_loop);
2514     b(L_exit);
2515 
2516   BIND(L_by4_loop);
2517     ldrw(tmp, Address(post(buf, 4)));
2518     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
2519     subs(len, len, 4);
2520     br(Assembler::GE, L_by4_loop);
2521     adds(len, len, 4);
2522     br(Assembler::LE, L_exit);
2523   BIND(L_by1_loop);
2524     subs(len, len, 1);
2525     ldrb(tmp, Address(post(buf, 1)));
2526     update_byte_crc32(crc, tmp, table0);
2527     br(Assembler::GT, L_by1_loop);
2528     b(L_exit);
2529 
2530     align(CodeEntryAlignment);
2531   BIND(L_by16_loop);
2532     subs(len, len, 16);
2533     ldp(tmp, tmp3, Address(post(buf, 16)));
2534     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
2535     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
2536     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
2537     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
2538     br(Assembler::GE, L_by16_loop);
2539     adds(len, len, 16-4);
2540     br(Assembler::GE, L_by4_loop);
2541     adds(len, len, 4);
2542     br(Assembler::GT, L_by1_loop);
2543   BIND(L_exit);
2544     ornw(crc, zr, crc);
2545 }
2546 
2547 SkipIfEqual::SkipIfEqual(
2548     MacroAssembler* masm, const bool* flag_addr, bool value) {
2549   _masm = masm;
2550   unsigned long offset;
2551   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
2552   _masm->ldrb(rscratch1, Address(rscratch1, offset));
2553   _masm->cbzw(rscratch1, _label);
2554 }
2555 
2556 SkipIfEqual::~SkipIfEqual() {
2557   _masm->bind(_label);
2558 }
2559 
2560 void MacroAssembler::cmpptr(Register src1, Address src2) {
2561   unsigned long offset;
2562   adrp(rscratch1, src2, offset);
2563   ldr(rscratch1, Address(rscratch1, offset));
2564   cmp(src1, rscratch1);
2565 }
2566 
2567 void MacroAssembler::store_check(Register obj) {
2568   // Does a store check for the oop in register obj. The content of
2569   // register obj is destroyed afterwards.
2570   store_check_part_1(obj);
2571   store_check_part_2(obj);
2572 }
2573 
2574 void MacroAssembler::store_check(Register obj, Address dst) {
2575   store_check(obj);
2576 }
2577 
2578 
2579 // split the store check operation so that other instructions can be scheduled inbetween
2580 void MacroAssembler::store_check_part_1(Register obj) {
2581   BarrierSet* bs = Universe::heap()->barrier_set();
2582   assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
2583   lsr(obj, obj, CardTableModRefBS::card_shift);
2584 }
2585 
2586 void MacroAssembler::store_check_part_2(Register obj) {
2587   BarrierSet* bs = Universe::heap()->barrier_set();
2588   assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
2589   CardTableModRefBS* ct = (CardTableModRefBS*)bs;
2590   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
2591 
2592   // The calculation for byte_map_base is as follows:
2593   // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
2594   // So this essentially converts an address to a displacement and
2595   // it will never need to be relocated.
2596 
2597   // FIXME: It's not likely that disp will fit into an offset so we
2598   // don't bother to check, but it could save an instruction.
2599   intptr_t disp = (intptr_t) ct->byte_map_base;
2600   mov(rscratch1, disp);
2601   strb(zr, Address(obj, rscratch1));
2602 }
2603 
2604 void MacroAssembler::load_klass(Register dst, Register src) {
2605   if (UseCompressedClassPointers) {
2606     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
2607     decode_klass_not_null(dst);
2608   } else {
2609     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
2610   }
2611 }
2612 
2613 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
2614   if (UseCompressedClassPointers) {
2615     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
2616     if (Universe::narrow_klass_base() == NULL) {
2617       cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
2618       return;
2619     } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
2620                && Universe::narrow_klass_shift() == 0) {
2621       // Only the bottom 32 bits matter
2622       cmpw(trial_klass, tmp);
2623       return;
2624     }
2625     decode_klass_not_null(tmp);
2626   } else {
2627     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
2628   }
2629   cmp(trial_klass, tmp);
2630 }
2631 
2632 void MacroAssembler::load_prototype_header(Register dst, Register src) {
2633   load_klass(dst, src);
2634   ldr(dst, Address(dst, Klass::prototype_header_offset()));
2635 }
2636 
2637 void MacroAssembler::store_klass(Register dst, Register src) {
2638   // FIXME: Should this be a store release?  concurrent gcs assumes
2639   // klass length is valid if klass field is not null.
2640   if (UseCompressedClassPointers) {
2641     encode_klass_not_null(src);
2642     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
2643   } else {
2644     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
2645   }
2646 }
2647 
2648 void MacroAssembler::store_klass_gap(Register dst, Register src) {
2649   if (UseCompressedClassPointers) {
2650     // Store to klass gap in destination
2651     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
2652   }
2653 }
2654 
2655 // Algorithm must match oop.inline.hpp encode_heap_oop.
2656 void MacroAssembler::encode_heap_oop(Register d, Register s) {
2657 #ifdef ASSERT
2658   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
2659 #endif
2660   verify_oop(s, "broken oop in encode_heap_oop");
2661   if (Universe::narrow_oop_base() == NULL) {
2662     if (Universe::narrow_oop_shift() != 0) {
2663       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
2664       lsr(d, s, LogMinObjAlignmentInBytes);
2665     } else {
2666       mov(d, s);
2667     }
2668   } else {
2669     subs(d, s, rheapbase);
2670     csel(d, d, zr, Assembler::HS);
2671     lsr(d, d, LogMinObjAlignmentInBytes);
2672 
2673     /*  Old algorithm: is this any worse?
2674     Label nonnull;
2675     cbnz(r, nonnull);
2676     sub(r, r, rheapbase);
2677     bind(nonnull);
2678     lsr(r, r, LogMinObjAlignmentInBytes);
2679     */
2680   }
2681 }
2682 
2683 void MacroAssembler::encode_heap_oop_not_null(Register r) {
2684 #ifdef ASSERT
2685   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
2686   if (CheckCompressedOops) {
2687     Label ok;
2688     cbnz(r, ok);
2689     stop("null oop passed to encode_heap_oop_not_null");
2690     bind(ok);
2691   }
2692 #endif
2693   verify_oop(r, "broken oop in encode_heap_oop_not_null");
2694   if (Universe::narrow_oop_base() != NULL) {
2695     sub(r, r, rheapbase);
2696   }
2697   if (Universe::narrow_oop_shift() != 0) {
2698     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
2699     lsr(r, r, LogMinObjAlignmentInBytes);
2700   }
2701 }
2702 
2703 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
2704 #ifdef ASSERT
2705   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
2706   if (CheckCompressedOops) {
2707     Label ok;
2708     cbnz(src, ok);
2709     stop("null oop passed to encode_heap_oop_not_null2");
2710     bind(ok);
2711   }
2712 #endif
2713   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
2714 
2715   Register data = src;
2716   if (Universe::narrow_oop_base() != NULL) {
2717     sub(dst, src, rheapbase);
2718     data = dst;
2719   }
2720   if (Universe::narrow_oop_shift() != 0) {
2721     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
2722     lsr(dst, data, LogMinObjAlignmentInBytes);
2723     data = dst;
2724   }
2725   if (data == src)
2726     mov(dst, src);
2727 }
2728 
2729 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
2730 #ifdef ASSERT
2731   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
2732 #endif
2733   if (Universe::narrow_oop_base() == NULL) {
2734     if (Universe::narrow_oop_shift() != 0 || d != s) {
2735       lsl(d, s, Universe::narrow_oop_shift());
2736     }
2737   } else {
2738     Label done;
2739     if (d != s)
2740       mov(d, s);
2741     cbz(s, done);
2742     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
2743     bind(done);
2744   }
2745   verify_oop(d, "broken oop in decode_heap_oop");
2746 }
2747 
2748 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
2749   assert (UseCompressedOops, "should only be used for compressed headers");
2750   assert (Universe::heap() != NULL, "java heap should be initialized");
2751   // Cannot assert, unverified entry point counts instructions (see .ad file)
2752   // vtableStubs also counts instructions in pd_code_size_limit.
2753   // Also do not verify_oop as this is called by verify_oop.
2754   if (Universe::narrow_oop_shift() != 0) {
2755     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
2756     if (Universe::narrow_oop_base() != NULL) {
2757       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
2758     } else {
2759       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
2760     }
2761   } else {
2762     assert (Universe::narrow_oop_base() == NULL, "sanity");
2763   }
2764 }
2765 
2766 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
2767   assert (UseCompressedOops, "should only be used for compressed headers");
2768   assert (Universe::heap() != NULL, "java heap should be initialized");
2769   // Cannot assert, unverified entry point counts instructions (see .ad file)
2770   // vtableStubs also counts instructions in pd_code_size_limit.
2771   // Also do not verify_oop as this is called by verify_oop.
2772   if (Universe::narrow_oop_shift() != 0) {
2773     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
2774     if (Universe::narrow_oop_base() != NULL) {
2775       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
2776     } else {
2777       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
2778     }
2779   } else {
2780     assert (Universe::narrow_oop_base() == NULL, "sanity");
2781     if (dst != src) {
2782       mov(dst, src);
2783     }
2784   }
2785 }
2786 
2787 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
2788   if (Universe::narrow_klass_base() == NULL) {
2789     if (Universe::narrow_klass_shift() != 0) {
2790       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
2791       lsr(dst, src, LogKlassAlignmentInBytes);
2792     } else {
2793       if (dst != src) mov(dst, src);
2794     }
2795     return;
2796   }
2797 
2798   if (use_XOR_for_compressed_class_base) {
2799     if (Universe::narrow_klass_shift() != 0) {
2800       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
2801       lsr(dst, dst, LogKlassAlignmentInBytes);
2802     } else {
2803       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
2804     }
2805     return;
2806   }
2807 
2808   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
2809       && Universe::narrow_klass_shift() == 0) {
2810     movw(dst, src);
2811     return;
2812   }
2813 
2814 #ifdef ASSERT
2815   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
2816 #endif
2817 
2818   Register rbase = dst;
2819   if (dst == src) rbase = rheapbase;
2820   mov(rbase, (uint64_t)Universe::narrow_klass_base());
2821   sub(dst, src, rbase);
2822   if (Universe::narrow_klass_shift() != 0) {
2823     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
2824     lsr(dst, dst, LogKlassAlignmentInBytes);
2825   }
2826   if (dst == src) reinit_heapbase();
2827 }
2828 
2829 void MacroAssembler::encode_klass_not_null(Register r) {
2830   encode_klass_not_null(r, r);
2831 }
2832 
2833 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
2834   Register rbase = dst;
2835   assert (UseCompressedClassPointers, "should only be used for compressed headers");
2836 
2837   if (Universe::narrow_klass_base() == NULL) {
2838     if (Universe::narrow_klass_shift() != 0) {
2839       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
2840       lsl(dst, src, LogKlassAlignmentInBytes);
2841     } else {
2842       if (dst != src) mov(dst, src);
2843     }
2844     return;
2845   }
2846 
2847   if (use_XOR_for_compressed_class_base) {
2848     if (Universe::narrow_klass_shift() != 0) {
2849       lsl(dst, src, LogKlassAlignmentInBytes);
2850       eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
2851     } else {
2852       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
2853     }
2854     return;
2855   }
2856 
2857   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
2858       && Universe::narrow_klass_shift() == 0) {
2859     if (dst != src)
2860       movw(dst, src);
2861     movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
2862     return;
2863   }
2864 
2865   // Cannot assert, unverified entry point counts instructions (see .ad file)
2866   // vtableStubs also counts instructions in pd_code_size_limit.
2867   // Also do not verify_oop as this is called by verify_oop.
2868   if (dst == src) rbase = rheapbase;
2869   mov(rbase, (uint64_t)Universe::narrow_klass_base());
2870   if (Universe::narrow_klass_shift() != 0) {
2871     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
2872     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
2873   } else {
2874     add(dst, rbase, src);
2875   }
2876   if (dst == src) reinit_heapbase();
2877 }
2878 
2879 void  MacroAssembler::decode_klass_not_null(Register r) {
2880   decode_klass_not_null(r, r);
2881 }
2882 
2883 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
2884   assert (UseCompressedOops, "should only be used for compressed oops");
2885   assert (Universe::heap() != NULL, "java heap should be initialized");
2886   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
2887 
2888   int oop_index = oop_recorder()->find_index(obj);
2889   assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
2890 
2891   InstructionMark im(this);
2892   RelocationHolder rspec = oop_Relocation::spec(oop_index);
2893   code_section()->relocate(inst_mark(), rspec);
2894   movz(dst, 0xDEAD, 16);
2895   movk(dst, 0xBEEF);
2896 }
2897 
2898 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
2899   assert (UseCompressedClassPointers, "should only be used for compressed headers");
2900   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
2901   int index = oop_recorder()->find_index(k);
2902   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
2903 
2904   InstructionMark im(this);
2905   RelocationHolder rspec = metadata_Relocation::spec(index);
2906   code_section()->relocate(inst_mark(), rspec);
2907   narrowKlass nk = Klass::encode_klass(k);
2908   movz(dst, (nk >> 16), 16);
2909   movk(dst, nk & 0xffff);
2910 }
2911 
2912 void MacroAssembler::load_heap_oop(Register dst, Address src)
2913 {
2914   if (UseCompressedOops) {
2915     ldrw(dst, src);
2916     decode_heap_oop(dst);
2917   } else {
2918     ldr(dst, src);
2919   }
2920 }
2921 
2922 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src)
2923 {
2924   if (UseCompressedOops) {
2925     ldrw(dst, src);
2926     decode_heap_oop_not_null(dst);
2927   } else {
2928     ldr(dst, src);
2929   }
2930 }
2931 
2932 void MacroAssembler::store_heap_oop(Address dst, Register src) {
2933   if (UseCompressedOops) {
2934     assert(!dst.uses(src), "not enough registers");
2935     encode_heap_oop(src);
2936     strw(src, dst);
2937   } else
2938     str(src, dst);
2939 }
2940 
2941 // Used for storing NULLs.
2942 void MacroAssembler::store_heap_oop_null(Address dst) {
2943   if (UseCompressedOops) {
2944     strw(zr, dst);
2945   } else
2946     str(zr, dst);
2947 }
2948 
2949 #if INCLUDE_ALL_GCS
2950 void MacroAssembler::g1_write_barrier_pre(Register obj,
2951                                           Register pre_val,
2952                                           Register thread,
2953                                           Register tmp,
2954                                           bool tosca_live,
2955                                           bool expand_call) {
2956   // If expand_call is true then we expand the call_VM_leaf macro
2957   // directly to skip generating the check by
2958   // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
2959 
2960 #ifdef _LP64
2961   assert(thread == rthread, "must be");
2962 #endif // _LP64
2963 
2964   Label done;
2965   Label runtime;
2966 
2967   assert(pre_val != noreg, "check this code");
2968 
2969   if (obj != noreg)
2970     assert_different_registers(obj, pre_val, tmp);
2971 
2972   Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
2973                                        PtrQueue::byte_offset_of_active()));
2974   Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
2975                                        PtrQueue::byte_offset_of_index()));
2976   Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
2977                                        PtrQueue::byte_offset_of_buf()));
2978 
2979 
2980   // Is marking active?
2981   if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
2982     ldrw(tmp, in_progress);
2983   } else {
2984     assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
2985     ldrb(tmp, in_progress);
2986   }
2987   cbzw(tmp, done);
2988 
2989   // Do we need to load the previous value?
2990   if (obj != noreg) {
2991     load_heap_oop(pre_val, Address(obj, 0));
2992   }
2993 
2994   // Is the previous value null?
2995   cbz(pre_val, done);
2996 
2997   // Can we store original value in the thread's buffer?
2998   // Is index == 0?
2999   // (The index field is typed as size_t.)
3000 
3001   ldr(tmp, index);                      // tmp := *index_adr
3002   cbz(tmp, runtime);                    // tmp == 0?
3003                                         // If yes, goto runtime
3004 
3005   sub(tmp, tmp, wordSize);              // tmp := tmp - wordSize
3006   str(tmp, index);                      // *index_adr := tmp
3007   ldr(rscratch1, buffer);
3008   add(tmp, tmp, rscratch1);             // tmp := tmp + *buffer_adr
3009 
3010   // Record the previous value
3011   str(pre_val, Address(tmp, 0));
3012   b(done);
3013 
3014   bind(runtime);
3015   // save the live input values
3016   push(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
3017 
3018   // Calling the runtime using the regular call_VM_leaf mechanism generates
3019   // code (generated by InterpreterMacroAssember::call_VM_leaf_base)
3020   // that checks that the *(rfp+frame::interpreter_frame_last_sp) == NULL.
3021   //
3022   // If we care generating the pre-barrier without a frame (e.g. in the
3023   // intrinsified Reference.get() routine) then ebp might be pointing to
3024   // the caller frame and so this check will most likely fail at runtime.
3025   //
3026   // Expanding the call directly bypasses the generation of the check.
3027   // So when we do not have have a full interpreter frame on the stack
3028   // expand_call should be passed true.
3029 
3030   if (expand_call) {
3031     LP64_ONLY( assert(pre_val != c_rarg1, "smashed arg"); )
3032     pass_arg1(this, thread);
3033     pass_arg0(this, pre_val);
3034     MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
3035   } else {
3036     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
3037   }
3038 
3039   pop(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
3040 
3041   bind(done);
3042 }
3043 
3044 void MacroAssembler::g1_write_barrier_post(Register store_addr,
3045                                            Register new_val,
3046                                            Register thread,
3047                                            Register tmp,
3048                                            Register tmp2) {
3049 #ifdef _LP64
3050   assert(thread == rthread, "must be");
3051 #endif // _LP64
3052 
3053   Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
3054                                        PtrQueue::byte_offset_of_index()));
3055   Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
3056                                        PtrQueue::byte_offset_of_buf()));
3057 
3058   BarrierSet* bs = Universe::heap()->barrier_set();
3059   CardTableModRefBS* ct = (CardTableModRefBS*)bs;
3060   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
3061 
3062   Label done;
3063   Label runtime;
3064 
3065   // Does store cross heap regions?
3066 
3067   eor(tmp, store_addr, new_val);
3068   lsr(tmp, tmp, HeapRegion::LogOfHRGrainBytes);
3069   cbz(tmp, done);
3070 
3071   // crosses regions, storing NULL?
3072 
3073   cbz(new_val, done);
3074 
3075   // storing region crossing non-NULL, is card already dirty?
3076 
3077   ExternalAddress cardtable((address) ct->byte_map_base);
3078   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
3079   const Register card_addr = tmp;
3080 
3081   lsr(card_addr, store_addr, CardTableModRefBS::card_shift);
3082 
3083   unsigned long offset;
3084   adrp(tmp2, cardtable, offset);
3085 
3086   // get the address of the card
3087   add(card_addr, card_addr, tmp2);
3088   ldrb(tmp2, Address(card_addr, offset));
3089   cmpw(tmp2, (int)G1SATBCardTableModRefBS::g1_young_card_val());
3090   br(Assembler::EQ, done);
3091 
3092   assert((int)CardTableModRefBS::dirty_card_val() == 0, "must be 0");
3093 
3094   membar(Assembler::StoreLoad);
3095 
3096   ldrb(tmp2, Address(card_addr, offset));
3097   cbzw(tmp2, done);
3098 
3099   // storing a region crossing, non-NULL oop, card is clean.
3100   // dirty card and log.
3101 
3102   strb(zr, Address(card_addr, offset));
3103 
3104   ldr(rscratch1, queue_index);
3105   cbz(rscratch1, runtime);
3106   sub(rscratch1, rscratch1, wordSize);
3107   str(rscratch1, queue_index);
3108 
3109   ldr(tmp2, buffer);
3110   str(card_addr, Address(tmp2, rscratch1));
3111   b(done);
3112 
3113   bind(runtime);
3114   // save the live input values
3115   push(store_addr->bit(true) | new_val->bit(true), sp);
3116   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
3117   pop(store_addr->bit(true) | new_val->bit(true), sp);
3118 
3119   bind(done);
3120 }
3121 
3122 #endif // INCLUDE_ALL_GCS
3123 
3124 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
3125   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
3126   int index = oop_recorder()->allocate_metadata_index(obj);
3127   RelocationHolder rspec = metadata_Relocation::spec(index);
3128   return Address((address)obj, rspec);
3129 }
3130 
3131 // Move an oop into a register.  immediate is true if we want
3132 // immediate instrcutions, i.e. we are not going to patch this
3133 // instruction while the code is being executed by another thread.  In
3134 // that case we can use move immediates rather than the constant pool.
3135 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
3136   int oop_index;
3137   if (obj == NULL) {
3138     oop_index = oop_recorder()->allocate_oop_index(obj);
3139   } else {
3140     oop_index = oop_recorder()->find_index(obj);
3141     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3142   }
3143   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3144   if (! immediate) {
3145     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
3146     ldr_constant(dst, Address(dummy, rspec));
3147   } else
3148     mov(dst, Address((address)obj, rspec));
3149 }
3150 
3151 // Move a metadata address into a register.
3152 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
3153   int oop_index;
3154   if (obj == NULL) {
3155     oop_index = oop_recorder()->allocate_metadata_index(obj);
3156   } else {
3157     oop_index = oop_recorder()->find_index(obj);
3158   }
3159   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
3160   mov(dst, Address((address)obj, rspec));
3161 }
3162 
3163 Address MacroAssembler::constant_oop_address(jobject obj) {
3164   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
3165   assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
3166   int oop_index = oop_recorder()->find_index(obj);
3167   return Address((address)obj, oop_Relocation::spec(oop_index));
3168 }
3169 
3170 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3171 void MacroAssembler::tlab_allocate(Register obj,
3172                                    Register var_size_in_bytes,
3173                                    int con_size_in_bytes,
3174                                    Register t1,
3175                                    Register t2,
3176                                    Label& slow_case) {
3177   assert_different_registers(obj, t2);
3178   assert_different_registers(obj, var_size_in_bytes);
3179   Register end = t2;
3180 
3181   // verify_tlab();
3182 
3183   ldr(obj, Address(rthread, JavaThread::tlab_top_offset()));
3184   if (var_size_in_bytes == noreg) {
3185     lea(end, Address(obj, con_size_in_bytes));
3186   } else {
3187     lea(end, Address(obj, var_size_in_bytes));
3188   }
3189   ldr(rscratch1, Address(rthread, JavaThread::tlab_end_offset()));
3190   cmp(end, rscratch1);
3191   br(Assembler::HI, slow_case);
3192 
3193   // update the tlab top pointer
3194   str(end, Address(rthread, JavaThread::tlab_top_offset()));
3195 
3196   // recover var_size_in_bytes if necessary
3197   if (var_size_in_bytes == end) {
3198     sub(var_size_in_bytes, var_size_in_bytes, obj);
3199   }
3200   // verify_tlab();
3201 }
3202 
3203 // Preserves r19, and r3.
3204 Register MacroAssembler::tlab_refill(Label& retry,
3205                                      Label& try_eden,
3206                                      Label& slow_case) {
3207   Register top = r0;
3208   Register t1  = r2;
3209   Register t2  = r4;
3210   assert_different_registers(top, rthread, t1, t2, /* preserve: */ r19, r3);
3211   Label do_refill, discard_tlab;
3212 
3213   if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
3214     // No allocation in the shared eden.
3215     b(slow_case);
3216   }
3217 
3218   ldr(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
3219   ldr(t1,  Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
3220 
3221   // calculate amount of free space
3222   sub(t1, t1, top);
3223   lsr(t1, t1, LogHeapWordSize);
3224 
3225   // Retain tlab and allocate object in shared space if
3226   // the amount free in the tlab is too large to discard.
3227 
3228   ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
3229   cmp(t1, rscratch1);
3230   br(Assembler::LE, discard_tlab);
3231 
3232   // Retain
3233   // ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
3234   mov(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
3235   add(rscratch1, rscratch1, t2);
3236   str(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
3237 
3238   if (TLABStats) {
3239     // increment number of slow_allocations
3240     addmw(Address(rthread, in_bytes(JavaThread::tlab_slow_allocations_offset())),
3241          1, rscratch1);
3242   }
3243   b(try_eden);
3244 
3245   bind(discard_tlab);
3246   if (TLABStats) {
3247     // increment number of refills
3248     addmw(Address(rthread, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1,
3249          rscratch1);
3250     // accumulate wastage -- t1 is amount free in tlab
3251     addmw(Address(rthread, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1,
3252          rscratch1);
3253   }
3254 
3255   // if tlab is currently allocated (top or end != null) then
3256   // fill [top, end + alignment_reserve) with array object
3257   cbz(top, do_refill);
3258 
3259   // set up the mark word
3260   mov(rscratch1, (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
3261   str(rscratch1, Address(top, oopDesc::mark_offset_in_bytes()));
3262   // set the length to the remaining space
3263   sub(t1, t1, typeArrayOopDesc::header_size(T_INT));
3264   add(t1, t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
3265   lsl(t1, t1, log2_intptr(HeapWordSize/sizeof(jint)));
3266   strw(t1, Address(top, arrayOopDesc::length_offset_in_bytes()));
3267   // set klass to intArrayKlass
3268   {
3269     unsigned long offset;
3270     // dubious reloc why not an oop reloc?
3271     adrp(rscratch1, ExternalAddress((address)Universe::intArrayKlassObj_addr()),
3272          offset);
3273     ldr(t1, Address(rscratch1, offset));
3274   }
3275   // store klass last.  concurrent gcs assumes klass length is valid if
3276   // klass field is not null.
3277   store_klass(top, t1);
3278 
3279   mov(t1, top);
3280   ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
3281   sub(t1, t1, rscratch1);
3282   incr_allocated_bytes(rthread, t1, 0, rscratch1);
3283 
3284   // refill the tlab with an eden allocation
3285   bind(do_refill);
3286   ldr(t1, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
3287   lsl(t1, t1, LogHeapWordSize);
3288   // allocate new tlab, address returned in top
3289   eden_allocate(top, t1, 0, t2, slow_case);
3290 
3291   // Check that t1 was preserved in eden_allocate.
3292 #ifdef ASSERT
3293   if (UseTLAB) {
3294     Label ok;
3295     Register tsize = r4;
3296     assert_different_registers(tsize, rthread, t1);
3297     str(tsize, Address(pre(sp, -16)));
3298     ldr(tsize, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
3299     lsl(tsize, tsize, LogHeapWordSize);
3300     cmp(t1, tsize);
3301     br(Assembler::EQ, ok);
3302     STOP("assert(t1 != tlab size)");
3303     should_not_reach_here();
3304 
3305     bind(ok);
3306     ldr(tsize, Address(post(sp, 16)));
3307   }
3308 #endif
3309   str(top, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
3310   str(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
3311   add(top, top, t1);
3312   sub(top, top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
3313   str(top, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
3314   verify_tlab();
3315   b(retry);
3316 
3317   return rthread; // for use by caller
3318 }
3319 
3320 // Defines obj, preserves var_size_in_bytes
3321 void MacroAssembler::eden_allocate(Register obj,
3322                                    Register var_size_in_bytes,
3323                                    int con_size_in_bytes,
3324                                    Register t1,
3325                                    Label& slow_case) {
3326   assert_different_registers(obj, var_size_in_bytes, t1);
3327   if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
3328     b(slow_case);
3329   } else {
3330     Register end = t1;
3331     Register heap_end = rscratch2;
3332     Label retry;
3333     bind(retry);
3334     {
3335       unsigned long offset;
3336       adrp(rscratch1, ExternalAddress((address) Universe::heap()->end_addr()), offset);
3337       ldr(heap_end, Address(rscratch1, offset));
3338     }
3339 
3340     ExternalAddress heap_top((address) Universe::heap()->top_addr());
3341 
3342     // Get the current top of the heap
3343     {
3344       unsigned long offset;
3345       adrp(rscratch1, heap_top, offset);
3346       // Use add() here after ARDP, rather than lea().
3347       // lea() does not generate anything if its offset is zero.
3348       // However, relocs expect to find either an ADD or a load/store
3349       // insn after an ADRP.  add() always generates an ADD insn, even
3350       // for add(Rn, Rn, 0).
3351       add(rscratch1, rscratch1, offset);
3352       ldaxr(obj, rscratch1);
3353     }
3354 
3355     // Adjust it my the size of our new object
3356     if (var_size_in_bytes == noreg) {
3357       lea(end, Address(obj, con_size_in_bytes));
3358     } else {
3359       lea(end, Address(obj, var_size_in_bytes));
3360     }
3361 
3362     // if end < obj then we wrapped around high memory
3363     cmp(end, obj);
3364     br(Assembler::LO, slow_case);
3365 
3366     cmp(end, heap_end);
3367     br(Assembler::HI, slow_case);
3368 
3369     // If heap_top hasn't been changed by some other thread, update it.
3370     stlxr(rscratch1, end, rscratch1);
3371     cbnzw(rscratch1, retry);
3372   }
3373 }
3374 
3375 void MacroAssembler::verify_tlab() {
3376 #ifdef ASSERT
3377   if (UseTLAB && VerifyOops) {
3378     Label next, ok;
3379 
3380     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
3381 
3382     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
3383     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
3384     cmp(rscratch2, rscratch1);
3385     br(Assembler::HS, next);
3386     STOP("assert(top >= start)");
3387     should_not_reach_here();
3388 
3389     bind(next);
3390     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
3391     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
3392     cmp(rscratch2, rscratch1);
3393     br(Assembler::HS, ok);
3394     STOP("assert(top <= end)");
3395     should_not_reach_here();
3396 
3397     bind(ok);
3398     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
3399   }
3400 #endif
3401 }
3402 
3403 // Writes to stack successive pages until offset reached to check for
3404 // stack overflow + shadow pages.  This clobbers tmp.
3405 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
3406   assert_different_registers(tmp, size, rscratch1);
3407   mov(tmp, sp);
3408   // Bang stack for total size given plus shadow page size.
3409   // Bang one page at a time because large size can bang beyond yellow and
3410   // red zones.
3411   Label loop;
3412   mov(rscratch1, os::vm_page_size());
3413   bind(loop);
3414   lea(tmp, Address(tmp, -os::vm_page_size()));
3415   subsw(size, size, rscratch1);
3416   str(size, Address(tmp));
3417   br(Assembler::GT, loop);
3418 
3419   // Bang down shadow pages too.
3420   // At this point, (tmp-0) is the last address touched, so don't
3421   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
3422   // was post-decremented.)  Skip this address by starting at i=1, and
3423   // touch a few more pages below.  N.B.  It is important to touch all
3424   // the way down to and including i=StackShadowPages.
3425   for (int i = 0; i< StackShadowPages-1; i++) {
3426     // this could be any sized move but this is can be a debugging crumb
3427     // so the bigger the better.
3428     lea(tmp, Address(tmp, -os::vm_page_size()));
3429     str(size, Address(tmp));
3430   }
3431 }
3432 
3433 
3434 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
3435   unsigned long off;
3436   adrp(r, Address(page, rtype), off);
3437   InstructionMark im(this);
3438   code_section()->relocate(inst_mark(), rtype);
3439   ldrw(zr, Address(r, off));
3440   return inst_mark();
3441 }
3442 
3443 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
3444   InstructionMark im(this);
3445   code_section()->relocate(inst_mark(), rtype);
3446   ldrw(zr, Address(r, 0));
3447   return inst_mark();
3448 }
3449 
3450 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
3451   relocInfo::relocType rtype = dest.rspec().reloc()->type();
3452   if (uabs(pc() - dest.target()) >= (1LL << 32)) {
3453     guarantee(rtype == relocInfo::none
3454               || rtype == relocInfo::external_word_type
3455               || rtype == relocInfo::poll_type
3456               || rtype == relocInfo::poll_return_type,
3457               "can only use a fixed address with an ADRP");
3458     // Out of range.  This doesn't happen very often, but we have to
3459     // handle it
3460     mov(reg1, dest);
3461     byte_offset = 0;
3462   } else {
3463     InstructionMark im(this);
3464     code_section()->relocate(inst_mark(), dest.rspec());
3465     byte_offset = (uint64_t)dest.target() & 0xfff;
3466     _adrp(reg1, dest.target());
3467   }
3468 }
3469 
3470   bool MacroAssembler::use_acq_rel_for_volatile_fields() {
3471 #ifdef PRODUCT
3472     return false;
3473 #else
3474     return UseAcqRelForVolatileFields;
3475 #endif
3476   }
3477 
3478 void MacroAssembler::build_frame(int framesize) {
3479   if (framesize == 0) {
3480     // Is this even possible?
3481     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
3482   } else if (framesize < ((1 << 9) + 2 * wordSize)) {
3483     sub(sp, sp, framesize);
3484     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
3485   } else {
3486     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
3487     if (framesize < ((1 << 12) + 2 * wordSize))
3488       sub(sp, sp, framesize - 2 * wordSize);
3489     else {
3490       mov(rscratch1, framesize - 2 * wordSize);
3491       sub(sp, sp, rscratch1);
3492     }
3493   }
3494 }
3495 
3496 void MacroAssembler::remove_frame(int framesize) {
3497   if (framesize == 0) {
3498     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
3499   } else if (framesize < ((1 << 9) + 2 * wordSize)) {
3500     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
3501     add(sp, sp, framesize);
3502   } else {
3503     if (framesize < ((1 << 12) + 2 * wordSize))
3504       add(sp, sp, framesize - 2 * wordSize);
3505     else {
3506       mov(rscratch1, framesize - 2 * wordSize);
3507       add(sp, sp, rscratch1);
3508     }
3509     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
3510   }
3511 }
3512 
3513 
3514 // Search for str1 in str2 and return index or -1
3515 void MacroAssembler::string_indexof(Register str2, Register str1,
3516                                     Register cnt2, Register cnt1,
3517                                     Register tmp1, Register tmp2,
3518                                     Register tmp3, Register tmp4,
3519                                     int icnt1, Register result) {
3520   Label BM, LINEARSEARCH, DONE, NOMATCH, MATCH;
3521 
3522   Register ch1 = rscratch1;
3523   Register ch2 = rscratch2;
3524   Register cnt1tmp = tmp1;
3525   Register cnt2tmp = tmp2;
3526   Register cnt1_neg = cnt1;
3527   Register cnt2_neg = cnt2;
3528   Register result_tmp = tmp4;
3529 
3530   // Note, inline_string_indexOf() generates checks:
3531   // if (substr.count > string.count) return -1;
3532   // if (substr.count == 0) return 0;
3533 
3534 // We have two strings, a source string in str2, cnt2 and a pattern string
3535 // in str1, cnt1. Find the 1st occurence of pattern in source or return -1.
3536 
3537 // For larger pattern and source we use a simplified Boyer Moore algorithm.
3538 // With a small pattern and source we use linear scan.
3539 
3540   if (icnt1 == -1) {
3541     cmp(cnt1, 256);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
3542     ccmp(cnt1, 8, 0b0000, LO);  // Can't handle skip >= 256 because we use
3543     br(LO, LINEARSEARCH);       // a byte array.
3544     cmp(cnt1, cnt2, LSR, 2);    // Source must be 4 * pattern for BM
3545     br(HS, LINEARSEARCH);
3546   }
3547 
3548 // The Boyer Moore alogorithm is based on the description here:-
3549 //
3550 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
3551 //
3552 // This describes and algorithm with 2 shift rules. The 'Bad Character' rule
3553 // and the 'Good Suffix' rule.
3554 //
3555 // These rules are essentially heuristics for how far we can shift the
3556 // pattern along the search string.
3557 //
3558 // The implementation here uses the 'Bad Character' rule only because of the
3559 // complexity of initialisation for the 'Good Suffix' rule.
3560 //
3561 // This is also known as the Boyer-Moore-Horspool algorithm:-
3562 //
3563 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
3564 //
3565 // #define ASIZE 128
3566 //
3567 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
3568 //       int i, j;
3569 //       unsigned c;
3570 //       unsigned char bc[ASIZE];
3571 //    
3572 //       /* Preprocessing */
3573 //       for (i = 0; i < ASIZE; ++i)
3574 //          bc[i] = 0;
3575 //       for (i = 0; i < m - 1; ) {
3576 //          c = x[i];
3577 //          ++i;
3578 //          if (c < ASIZE) bc[c] = i;
3579 //       }
3580 //    
3581 //       /* Searching */
3582 //       j = 0;
3583 //       while (j <= n - m) {
3584 //          c = y[i+j];
3585 //          if (x[m-1] == c)
3586 //            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
3587 //          if (i < 0) return j;
3588 //          if (c < ASIZE)
3589 //            j = j - bc[y[j+m-1]] + m;
3590 //          else
3591 //            j += 1; // Advance by 1 only if char >= ASIZE
3592 //       }
3593 //    }
3594 
3595   if (icnt1 == -1) {
3596     BIND(BM);
3597 
3598     Label ZLOOP, BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP;
3599     Label BMADV, BMMATCH, BMCHECKEND;
3600 
3601     Register cnt1end = tmp2;
3602     Register str2end = cnt2;
3603     Register skipch = tmp2;
3604 
3605     // Restrict ASIZE to 128 to reduce stack space/initialisation.
3606     // The presence of chars >= ASIZE in the target string does not affect
3607     // performance, but we must be careful not to initialise them in the stack
3608     // array.
3609     // The presence of chars >= ASIZE in the source string may adversely affect
3610     // performance since we can only advance by one when we encounter one.
3611 
3612       stp(zr, zr, pre(sp, -128));
3613       for (int i = 1; i < 8; i++)
3614           stp(zr, zr, Address(sp, i*16));
3615 
3616       mov(cnt1tmp, 0);
3617       sub(cnt1end, cnt1, 1);
3618     BIND(BCLOOP);
3619       ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
3620       cmp(ch1, 128);
3621       add(cnt1tmp, cnt1tmp, 1);
3622       br(HS, BCSKIP);
3623       strb(cnt1tmp, Address(sp, ch1));
3624     BIND(BCSKIP);
3625       cmp(cnt1tmp, cnt1end);
3626       br(LT, BCLOOP);
3627 
3628       mov(result_tmp, str2);
3629 
3630       sub(cnt2, cnt2, cnt1);
3631       add(str2end, str2, cnt2, LSL, 1);
3632     BIND(BMLOOPSTR2);
3633       sub(cnt1tmp, cnt1, 1);
3634       ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
3635       ldrh(skipch, Address(str2, cnt1tmp, Address::lsl(1)));
3636       cmp(ch1, skipch);
3637       br(NE, BMSKIP);
3638       subs(cnt1tmp, cnt1tmp, 1);
3639       br(LT, BMMATCH);
3640     BIND(BMLOOPSTR1);
3641       ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
3642       ldrh(ch2, Address(str2, cnt1tmp, Address::lsl(1)));
3643       cmp(ch1, ch2);
3644       br(NE, BMSKIP);
3645       subs(cnt1tmp, cnt1tmp, 1);
3646       br(GE, BMLOOPSTR1);
3647     BIND(BMMATCH);
3648       sub(result_tmp, str2, result_tmp);
3649       lsr(result, result_tmp, 1);
3650       add(sp, sp, 128);
3651       b(DONE);
3652     BIND(BMADV);
3653       add(str2, str2, 2);
3654       b(BMCHECKEND);
3655     BIND(BMSKIP);
3656       cmp(skipch, 128);
3657       br(HS, BMADV);
3658       ldrb(ch2, Address(sp, skipch));
3659       add(str2, str2, cnt1, LSL, 1);
3660       sub(str2, str2, ch2, LSL, 1);
3661     BIND(BMCHECKEND);
3662       cmp(str2, str2end);
3663       br(LE, BMLOOPSTR2);
3664       add(sp, sp, 128);
3665       b(NOMATCH);
3666   }
3667 
3668   BIND(LINEARSEARCH);
3669   {
3670     Label DO1, DO2, DO3;
3671 
3672     Register str2tmp = tmp2;
3673     Register first = tmp3;
3674 
3675     if (icnt1 == -1)
3676     {
3677         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT, LAST_WORD;
3678 
3679         cmp(cnt1, 4);
3680         br(LT, DOSHORT);
3681 
3682         sub(cnt2, cnt2, cnt1);
3683         sub(cnt1, cnt1, 4);
3684         mov(result_tmp, cnt2);
3685 
3686         lea(str1, Address(str1, cnt1, Address::uxtw(1)));
3687         lea(str2, Address(str2, cnt2, Address::uxtw(1)));
3688         sub(cnt1_neg, zr, cnt1, LSL, 1);
3689         sub(cnt2_neg, zr, cnt2, LSL, 1);
3690         ldr(first, Address(str1, cnt1_neg));
3691 
3692       BIND(FIRST_LOOP);
3693         ldr(ch2, Address(str2, cnt2_neg));
3694         cmp(first, ch2);
3695         br(EQ, STR1_LOOP);
3696       BIND(STR2_NEXT);
3697         adds(cnt2_neg, cnt2_neg, 2);
3698         br(LE, FIRST_LOOP);
3699         b(NOMATCH);
3700 
3701       BIND(STR1_LOOP);
3702         adds(cnt1tmp, cnt1_neg, 8);
3703         add(cnt2tmp, cnt2_neg, 8);
3704         br(GE, LAST_WORD);
3705 
3706       BIND(STR1_NEXT);
3707         ldr(ch1, Address(str1, cnt1tmp));
3708         ldr(ch2, Address(str2, cnt2tmp));
3709         cmp(ch1, ch2);
3710         br(NE, STR2_NEXT);
3711         adds(cnt1tmp, cnt1tmp, 8);
3712         add(cnt2tmp, cnt2tmp, 8);
3713         br(LT, STR1_NEXT);
3714 
3715       BIND(LAST_WORD);
3716         ldr(ch1, Address(str1));
3717         sub(str2tmp, str2, cnt1_neg);         // adjust to corresponding
3718         ldr(ch2, Address(str2tmp, cnt2_neg)); // word in str2
3719         cmp(ch1, ch2);
3720         br(NE, STR2_NEXT);
3721         b(MATCH);
3722 
3723       BIND(DOSHORT);
3724         cmp(cnt1, 2);
3725         br(LT, DO1);
3726         br(GT, DO3);
3727     }
3728 
3729     if (icnt1 == 4) {
3730       Label CH1_LOOP;
3731 
3732         ldr(ch1, str1);
3733         sub(cnt2, cnt2, 4);
3734         mov(result_tmp, cnt2);
3735         lea(str2, Address(str2, cnt2, Address::uxtw(1)));
3736         sub(cnt2_neg, zr, cnt2, LSL, 1);
3737 
3738       BIND(CH1_LOOP);
3739         ldr(ch2, Address(str2, cnt2_neg));
3740         cmp(ch1, ch2);
3741         br(EQ, MATCH);
3742         adds(cnt2_neg, cnt2_neg, 2);
3743         br(LE, CH1_LOOP);
3744         b(NOMATCH);
3745     }
3746 
3747     if (icnt1 == -1 || icnt1 == 2) {
3748       Label CH1_LOOP;
3749 
3750       BIND(DO2);
3751         ldrw(ch1, str1);
3752         sub(cnt2, cnt2, 2);
3753         mov(result_tmp, cnt2);
3754         lea(str2, Address(str2, cnt2, Address::uxtw(1)));
3755         sub(cnt2_neg, zr, cnt2, LSL, 1);
3756 
3757       BIND(CH1_LOOP);
3758         ldrw(ch2, Address(str2, cnt2_neg));
3759         cmp(ch1, ch2);
3760         br(EQ, MATCH);
3761         adds(cnt2_neg, cnt2_neg, 2);
3762         br(LE, CH1_LOOP);
3763         b(NOMATCH);
3764     }
3765 
3766     if (icnt1 == -1 || icnt1 == 3) {
3767       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
3768 
3769       BIND(DO3);
3770         ldrw(first, str1);
3771         ldrh(ch1, Address(str1, 4));
3772 
3773         sub(cnt2, cnt2, 3);
3774         mov(result_tmp, cnt2);
3775         lea(str2, Address(str2, cnt2, Address::uxtw(1)));
3776         sub(cnt2_neg, zr, cnt2, LSL, 1);
3777 
3778       BIND(FIRST_LOOP);
3779         ldrw(ch2, Address(str2, cnt2_neg));
3780         cmpw(first, ch2);
3781         br(EQ, STR1_LOOP);
3782       BIND(STR2_NEXT);
3783         adds(cnt2_neg, cnt2_neg, 2);
3784         br(LE, FIRST_LOOP);
3785         b(NOMATCH);
3786 
3787       BIND(STR1_LOOP);
3788         add(cnt2tmp, cnt2_neg, 4);
3789         ldrh(ch2, Address(str2, cnt2tmp));
3790         cmp(ch1, ch2);
3791         br(NE, STR2_NEXT);
3792         b(MATCH);
3793     }
3794 
3795     if (icnt1 == -1 || icnt1 == 1) {
3796       Label CH1_LOOP, HAS_ZERO;
3797       Label DO1_SHORT, DO1_LOOP;
3798 
3799       BIND(DO1);
3800         ldrh(ch1, str1);
3801         cmp(cnt2, 4);
3802         br(LT, DO1_SHORT);
3803 
3804         orr(ch1, ch1, ch1, LSL, 16);
3805         orr(ch1, ch1, ch1, LSL, 32);
3806 
3807         sub(cnt2, cnt2, 4);
3808         mov(result_tmp, cnt2);
3809         lea(str2, Address(str2, cnt2, Address::uxtw(1)));
3810         sub(cnt2_neg, zr, cnt2, LSL, 1);
3811 
3812         mov(tmp3, 0x0001000100010001);
3813       BIND(CH1_LOOP);
3814         ldr(ch2, Address(str2, cnt2_neg));
3815         eor(ch2, ch1, ch2);
3816         sub(tmp1, ch2, tmp3);
3817         orr(tmp2, ch2, 0x7fff7fff7fff7fff);
3818         bics(tmp1, tmp1, tmp2);
3819         br(NE, HAS_ZERO);
3820         adds(cnt2_neg, cnt2_neg, 8);
3821         br(LT, CH1_LOOP);
3822 
3823         cmp(cnt2_neg, 8);
3824         mov(cnt2_neg, 0);
3825         br(LT, CH1_LOOP);
3826         b(NOMATCH);
3827 
3828       BIND(HAS_ZERO);
3829         rev(tmp1, tmp1);
3830         clz(tmp1, tmp1);
3831         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
3832         b(MATCH);
3833 
3834       BIND(DO1_SHORT);
3835         mov(result_tmp, cnt2);
3836         lea(str2, Address(str2, cnt2, Address::uxtw(1)));
3837         sub(cnt2_neg, zr, cnt2, LSL, 1);
3838       BIND(DO1_LOOP);
3839         ldrh(ch2, Address(str2, cnt2_neg));
3840         cmpw(ch1, ch2);
3841         br(EQ, MATCH);
3842         adds(cnt2_neg, cnt2_neg, 2);
3843         br(LT, DO1_LOOP);
3844     }
3845   }
3846   BIND(NOMATCH);
3847     mov(result, -1);
3848     b(DONE);
3849   BIND(MATCH);
3850     add(result, result_tmp, cnt2_neg, ASR, 1);
3851   BIND(DONE);
3852 }
3853 
3854 // Compare strings.
3855 void MacroAssembler::string_compare(Register str1, Register str2,
3856                                     Register cnt1, Register cnt2, Register result,
3857                                     Register tmp1) {
3858   Label LENGTH_DIFF, DONE, SHORT_LOOP, SHORT_STRING,
3859     NEXT_WORD, DIFFERENCE;
3860 
3861   BLOCK_COMMENT("string_compare {");
3862 
3863   // Compute the minimum of the string lengths and save the difference.
3864   subsw(tmp1, cnt1, cnt2);
3865   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
3866 
3867   // A very short string
3868   cmpw(cnt2, 4);
3869   br(Assembler::LT, SHORT_STRING);
3870 
3871   // Check if the strings start at the same location.
3872   cmp(str1, str2);
3873   br(Assembler::EQ, LENGTH_DIFF);
3874 
3875   // Compare longwords
3876   {
3877     subw(cnt2, cnt2, 4); // The last longword is a special case
3878 
3879     // Move both string pointers to the last longword of their
3880     // strings, negate the remaining count, and convert it to bytes.
3881     lea(str1, Address(str1, cnt2, Address::uxtw(1)));
3882     lea(str2, Address(str2, cnt2, Address::uxtw(1)));
3883     sub(cnt2, zr, cnt2, LSL, 1);
3884 
3885     // Loop, loading longwords and comparing them into rscratch2.
3886     bind(NEXT_WORD);
3887     ldr(result, Address(str1, cnt2));
3888     ldr(cnt1, Address(str2, cnt2));
3889     adds(cnt2, cnt2, wordSize);
3890     eor(rscratch2, result, cnt1);
3891     cbnz(rscratch2, DIFFERENCE);
3892     br(Assembler::LT, NEXT_WORD);
3893 
3894     // Last longword.  In the case where length == 4 we compare the
3895     // same longword twice, but that's still faster than another
3896     // conditional branch.
3897 
3898     ldr(result, Address(str1));
3899     ldr(cnt1, Address(str2));
3900     eor(rscratch2, result, cnt1);
3901     cbz(rscratch2, LENGTH_DIFF);
3902 
3903     // Find the first different characters in the longwords and
3904     // compute their difference.
3905     bind(DIFFERENCE);
3906     rev(rscratch2, rscratch2);
3907     clz(rscratch2, rscratch2);
3908     andr(rscratch2, rscratch2, -16);
3909     lsrv(result, result, rscratch2);
3910     uxthw(result, result);
3911     lsrv(cnt1, cnt1, rscratch2);
3912     uxthw(cnt1, cnt1);
3913     subw(result, result, cnt1);
3914     b(DONE);
3915   }
3916 
3917   bind(SHORT_STRING);
3918   // Is the minimum length zero?
3919   cbz(cnt2, LENGTH_DIFF);
3920 
3921   bind(SHORT_LOOP);
3922   load_unsigned_short(result, Address(post(str1, 2)));
3923   load_unsigned_short(cnt1, Address(post(str2, 2)));
3924   subw(result, result, cnt1);
3925   cbnz(result, DONE);
3926   sub(cnt2, cnt2, 1);
3927   cbnz(cnt2, SHORT_LOOP);
3928 
3929   // Strings are equal up to min length.  Return the length difference.
3930   bind(LENGTH_DIFF);
3931   mov(result, tmp1);
3932 
3933   // That's it
3934   bind(DONE);
3935 
3936   BLOCK_COMMENT("} string_compare");
3937 }
3938 
3939 
3940 void MacroAssembler::string_equals(Register str1, Register str2,
3941                                    Register cnt, Register result,
3942                                    Register tmp1) {
3943   Label SAME_CHARS, DONE, SHORT_LOOP, SHORT_STRING,
3944     NEXT_WORD;
3945 
3946   const Register tmp2 = rscratch1;
3947   assert_different_registers(str1, str2, cnt, result, tmp1, tmp2, rscratch2);
3948 
3949   BLOCK_COMMENT("string_equals {");
3950 
3951   // Start by assuming that the strings are not equal.
3952   mov(result, zr);
3953 
3954   // A very short string
3955   cmpw(cnt, 4);
3956   br(Assembler::LT, SHORT_STRING);
3957 
3958   // Check if the strings start at the same location.
3959   cmp(str1, str2);
3960   br(Assembler::EQ, SAME_CHARS);
3961 
3962   // Compare longwords
3963   {
3964     subw(cnt, cnt, 4); // The last longword is a special case
3965 
3966     // Move both string pointers to the last longword of their
3967     // strings, negate the remaining count, and convert it to bytes.
3968     lea(str1, Address(str1, cnt, Address::uxtw(1)));
3969     lea(str2, Address(str2, cnt, Address::uxtw(1)));
3970     sub(cnt, zr, cnt, LSL, 1);
3971 
3972     // Loop, loading longwords and comparing them into rscratch2.
3973     bind(NEXT_WORD);
3974     ldr(tmp1, Address(str1, cnt));
3975     ldr(tmp2, Address(str2, cnt));
3976     adds(cnt, cnt, wordSize);
3977     eor(rscratch2, tmp1, tmp2);
3978     cbnz(rscratch2, DONE);
3979     br(Assembler::LT, NEXT_WORD);
3980 
3981     // Last longword.  In the case where length == 4 we compare the
3982     // same longword twice, but that's still faster than another
3983     // conditional branch.
3984 
3985     ldr(tmp1, Address(str1));
3986     ldr(tmp2, Address(str2));
3987     eor(rscratch2, tmp1, tmp2);
3988     cbz(rscratch2, SAME_CHARS);
3989     b(DONE);
3990   }
3991 
3992   bind(SHORT_STRING);
3993   // Is the length zero?
3994   cbz(cnt, SAME_CHARS);
3995 
3996   bind(SHORT_LOOP);
3997   load_unsigned_short(tmp1, Address(post(str1, 2)));
3998   load_unsigned_short(tmp2, Address(post(str2, 2)));
3999   subw(tmp1, tmp1, tmp2);
4000   cbnz(tmp1, DONE);
4001   sub(cnt, cnt, 1);
4002   cbnz(cnt, SHORT_LOOP);
4003 
4004   // Strings are equal.
4005   bind(SAME_CHARS);
4006   mov(result, true);
4007 
4008   // That's it
4009   bind(DONE);
4010 
4011   BLOCK_COMMENT("} string_equals");
4012 }
4013 
4014 // Compare char[] arrays aligned to 4 bytes
4015 void MacroAssembler::char_arrays_equals(Register ary1, Register ary2,
4016                                         Register result, Register tmp1)
4017 {
4018   Register cnt1 = rscratch1;
4019   Register cnt2 = rscratch2;
4020   Register tmp2 = rscratch2;
4021 
4022   Label SAME, DIFFER, NEXT, TAIL03, TAIL01;
4023 
4024   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4025   int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);
4026 
4027   BLOCK_COMMENT("char_arrays_equals  {");
4028 
4029     // different until proven equal
4030     mov(result, false);
4031 
4032     // same array?
4033     cmp(ary1, ary2);
4034     br(Assembler::EQ, SAME);
4035 
4036     // ne if either null
4037     cbz(ary1, DIFFER);
4038     cbz(ary2, DIFFER);
4039 
4040     // lengths ne?
4041     ldrw(cnt1, Address(ary1, length_offset));
4042     ldrw(cnt2, Address(ary2, length_offset));
4043     cmp(cnt1, cnt2);
4044     br(Assembler::NE, DIFFER);
4045 
4046     lea(ary1, Address(ary1, base_offset));
4047     lea(ary2, Address(ary2, base_offset));
4048 
4049     subs(cnt1, cnt1, 4);
4050     br(LT, TAIL03);
4051 
4052   BIND(NEXT);
4053     ldr(tmp1, Address(post(ary1, 8)));
4054     ldr(tmp2, Address(post(ary2, 8)));
4055     subs(cnt1, cnt1, 4);
4056     eor(tmp1, tmp1, tmp2);
4057     cbnz(tmp1, DIFFER);
4058     br(GE, NEXT);
4059 
4060   BIND(TAIL03);  // 0-3 chars left, cnt1 = #chars left - 4
4061     tst(cnt1, 0b10);
4062     br(EQ, TAIL01);
4063     ldrw(tmp1, Address(post(ary1, 4)));
4064     ldrw(tmp2, Address(post(ary2, 4)));
4065     cmp(tmp1, tmp2);
4066     br(NE, DIFFER);
4067   BIND(TAIL01);  // 0-1 chars left
4068     tst(cnt1, 0b01);
4069     br(EQ, SAME);
4070     ldrh(tmp1, ary1);
4071     ldrh(tmp2, ary2);
4072     cmp(tmp1, tmp2);
4073     br(NE, DIFFER);
4074 
4075   BIND(SAME);
4076     mov(result, true);
4077   BIND(DIFFER); // result already set
4078   
4079   BLOCK_COMMENT("} char_arrays_equals");
4080 }
4081 
4082 // encode char[] to byte[] in ISO_8859_1
4083 void MacroAssembler::encode_iso_array(Register src, Register dst,
4084                       Register len, Register result,
4085                       FloatRegister Vtmp1, FloatRegister Vtmp2,
4086                       FloatRegister Vtmp3, FloatRegister Vtmp4)
4087 {
4088     Label DONE, NEXT_32, LOOP_8, NEXT_8, LOOP_1, NEXT_1;
4089     Register tmp1 = rscratch1;
4090 
4091       mov(result, len); // Save initial len
4092 
4093 #ifndef BUILTIN_SIM
4094       subs(len, len, 32);
4095       br(LT, LOOP_8);
4096 
4097 // The following code uses the SIMD 'uqxtn' and 'uqxtn2' instructions
4098 // to convert chars to bytes. These set the 'QC' bit in the FPSR if
4099 // any char could not fit in a byte, so clear the FPSR so we can test it.
4100       clear_fpsr();
4101 
4102     BIND(NEXT_32);
4103       ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
4104       uqxtn(Vtmp1, T8B, Vtmp1, T8H);  // uqxtn  - write bottom half
4105       uqxtn(Vtmp1, T16B, Vtmp2, T8H); // uqxtn2 - write top half
4106       uqxtn(Vtmp2, T8B, Vtmp3, T8H);
4107       uqxtn(Vtmp2, T16B, Vtmp4, T8H); // uqxtn2
4108       get_fpsr(tmp1);
4109       cbnzw(tmp1, LOOP_8);
4110       st1(Vtmp1, Vtmp2, T16B, post(dst, 32));
4111       subs(len, len, 32);
4112       add(src, src, 64);
4113       br(GE, NEXT_32);
4114 
4115     BIND(LOOP_8);
4116       adds(len, len, 32-8);
4117       br(LT, LOOP_1);
4118       clear_fpsr(); // QC may be set from loop above, clear again
4119     BIND(NEXT_8);
4120       ld1(Vtmp1, T8H, src);
4121       uqxtn(Vtmp1, T8B, Vtmp1, T8H);
4122       get_fpsr(tmp1);
4123       cbnzw(tmp1, LOOP_1);
4124       st1(Vtmp1, T8B, post(dst, 8));
4125       subs(len, len, 8);
4126       add(src, src, 16);
4127       br(GE, NEXT_8);
4128 
4129     BIND(LOOP_1);
4130       adds(len, len, 8);
4131       br(LE, DONE);
4132 #else
4133       cbz(len, DONE);
4134 #endif
4135     BIND(NEXT_1);
4136       ldrh(tmp1, Address(post(src, 2)));
4137       tst(tmp1, 0xff00);
4138       br(NE, DONE);
4139       strb(tmp1, Address(post(dst, 1)));
4140       subs(len, len, 1);
4141       br(GT, NEXT_1);
4142 
4143     BIND(DONE);
4144       sub(result, result, len); // Return index where we stopped
4145 }