1 /*
   2 /*
   3  * Copyright (c) 2013, Red Hat Inc.
   4  * Copyright (c) 1997, 2012, Oracle and/or its affiliates.
   5  * All rights reserved.
   6  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   7  *
   8  * This code is free software; you can redistribute it and/or modify it
   9  * under the terms of the GNU General Public License version 2 only, as
  10  * published by the Free Software Foundation.
  11  *
  12  * This code is distributed in the hope that it will be useful, but WITHOUT
  13  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  14  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  15  * version 2 for more details (a copy is included in the LICENSE file that
  16  * accompanied this code).
  17  *
  18  * You should have received a copy of the GNU General Public License version
  19  * 2 along with this work; if not, write to the Free Software Foundation,
  20  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  21  *
  22  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  23  * or visit www.oracle.com if you need additional information or have any
  24  * questions.
  25  *
  26  */
  27 
  28 #include <sys/types.h>
  29 
  30 #include "precompiled.hpp"
  31 #include "asm/assembler.hpp"
  32 #include "asm/assembler.inline.hpp"
  33 #include "interpreter/interpreter.hpp"
  34 
  35 #include "compiler/disassembler.hpp"
  36 #include "memory/resourceArea.hpp"
  37 #include "runtime/biasedLocking.hpp"
  38 #include "runtime/interfaceSupport.hpp"
  39 #include "runtime/sharedRuntime.hpp"
  40 
  41 // #include "gc_interface/collectedHeap.inline.hpp"
  42 // #include "interpreter/interpreter.hpp"
  43 // #include "memory/cardTableModRefBS.hpp"
  44 // #include "prims/methodHandles.hpp"
  45 // #include "runtime/biasedLocking.hpp"
  46 // #include "runtime/interfaceSupport.hpp"
  47 // #include "runtime/objectMonitor.hpp"
  48 // #include "runtime/os.hpp"
  49 // #include "runtime/sharedRuntime.hpp"
  50 // #include "runtime/stubRoutines.hpp"
  51 
  52 #if INCLUDE_ALL_GCS
  53 #include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
  54 #include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
  55 #include "gc_implementation/g1/heapRegion.hpp"
  56 #endif
  57 
  58 #ifdef PRODUCT
  59 #define BLOCK_COMMENT(str) /* nothing */
  60 #define STOP(error) stop(error)
  61 #else
  62 #define BLOCK_COMMENT(str) block_comment(str)
  63 #define STOP(error) block_comment(error); stop(error)
  64 #endif
  65 
  66 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  67 
  68 // Patch any kind of instruction; there may be several instructions.
  69 // Return the total length (in bytes) of the instructions.
// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
//
// `branch` points at a previously emitted PC-relative instruction (or
// a multi-instruction sequence: adrp+ldr/add, or movz+movk+movk) and
// `target` is the new destination address.  The opcode bits are
// decoded to find which immediate field(s) to rewrite.
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  // Branch/literal immediates are encoded as word (4-byte) offsets.
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target-branch;
    // Bit 31 distinguishes adrp (page-relative) from adr (byte-relative).
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      // adrp: recompute the offset in 4K pages.
      u_int64_t dest = (u_int64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 3 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 2 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str or add instruction. Otherwise we could accidentally end
      // up treating a type 3 relocation as a type 1 or 2 just because it happened
      // to be followed by a random unrelated ldr/str or add instruction.
      //
      // In the case of a type 3 relocation, we know that these are only generated
      // for the safepoint polling page, or for the card type byte map base so we
      // assert as much and of course that the offset is 0.
      //
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        // The ldr/str immediate field is scaled by the access size.
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                    21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else {
        assert((jbyte *)target ==
                ((CardTableModRefBS*)(Universe::heap()->barrier_set()))->byte_map_base ||
               target == StubRoutines::crc_table_addr() ||
               (address)target == os::get_polling_page(),
               "adrp must be polling page or byte map base");
        assert(offset_lo == 0, "offset must be 0 for polling page or byte map base");
      }
    }
    // Split the offset into the adr/adrp immlo (bits 30:29) and
    // immhi (bits 23:5) fields.
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    u_int64_t dest = (u_int64_t)target;
    // Move wide constant: movz followed by two movk, 16 bits each.
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(pd_call_destination(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // ldr to the zero register (polling-page load): nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}
 165 
// Patch the oop constant encoded in the mov sequence at insn_addr so
// it refers to `o`.  Returns the length in bytes of the patched
// instruction sequence.
int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP: first instruction takes the high 16 bits of
    // the compressed oop, the following movk takes the low 16 bits.
    narrowOop n = oopDesc::encode_heap_oop((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP: movz + movk + movk, 16 bits at a time, low half
    // first (48-bit address; cf. the assert in pd_patch_instruction_size).
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}
 191 
// Decode the PC-relative instruction (or instruction sequence) at
// insn_addr and return the address it refers to.  `insn` is the first
// instruction word, already fetched by the caller.  Inverse of
// pd_patch_instruction_size above.
address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
   } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing: reassemble the immediate from immlo
    // (bits 30:29) and immhi (bits 23:5).
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    // Bit 31 set means adrp (page-relative), clear means adr.
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   [ 2 - adrp    Rx, target_page         ] Not handled
      //   [    add     Ry, Rx, #offset_in_page  ]
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the case of type 1 we check that the register is the same and
      // return the target_page + the offset within the page.
      //
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only. The only cases this is generated is for
      // the safepoint polling page or for the card table byte map base so
      // we assert as much.
      //
      // Note: Strangely, we do not handle 'type 2' relocation (adrp followed
      // by add) which is handled in pd_patch_instruction above.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        // The ldr/str immediate is scaled by the access size.
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else {
        assert((jbyte *)target_page ==
                ((CardTableModRefBS*)(Universe::heap()->barrier_set()))->byte_map_base ||
               (address)target_page == os::get_polling_page(),
               "adrp must be polling page or byte map base");
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    u_int32_t *insns = (u_int32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(u_int64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (u_int64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (u_int64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // ldr to the zero register (polling-page load): no target address.
    return 0;
  } else {
    ShouldNotReachHere();
  }
  // Common exit for the simple branch/literal forms above.
  return address(((uint64_t)insn_addr + (offset << 2)));
}
 271 
// Emit a memory serialization point.  On AArch64 a full-system data
// synchronization barrier suffices; the thread and tmp registers are
// unused here (they exist for other ports' register-window schemes).
void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  dsb(Assembler::SY);
}
 275 
 276 
// Clear the last-Java-frame anchor in the current thread.  The sp
// slot is always zeroed; clear_fp / clear_pc select whether the saved
// frame pointer and pc slots are zeroed too.
void MacroAssembler::reset_last_Java_frame(bool clear_fp,
                                           bool clear_pc) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));
  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  if (clear_pc) {
    str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
  }
}
 291 
 292 // Calls to C land
 293 //
 294 // When entering C land, the rfp, & resp of the last Java frame have to be recorded
 295 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
 296 // has to be reset to 0. This is required to allow proper stack traversal.
// Record the last Java frame (sp, fp, pc) in the current thread's
// frame anchor.  Any of the registers may be invalid (noreg), in
// which case that slot is defaulted or skipped as noted below.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
      str(last_java_pc, Address(rthread,
                                JavaThread::frame_anchor_offset()
                                + JavaFrameAnchor::last_Java_pc_offset()));
    }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    // NOTE(review): sp is copied through scratch first — presumably
    // because sp cannot be used directly as the store source; confirm.
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    // Default to the expression stack pointer.
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}
 323 
// Variant taking a literal code address for the last Java pc.  The pc
// is materialized pc-relatively (adr) into scratch, stored into the
// frame anchor, then the register variant finishes sp/fp.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  if (last_java_pc != NULL) {
    adr(scratch, last_java_pc);
  } else {
    // FIXME: This is almost never correct.  We should delete all
    // cases of set_last_Java_frame with last_java_pc=NULL and use the
    // correct return address instead.
    adr(scratch, pc());
  }

  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  // pc already stored above, so pass noreg for it.
  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}
 343 
// Variant taking a Label for the last Java pc.  If the label is
// already bound its address is used directly; otherwise a patch entry
// is recorded so the pc is filled in when the label is bound.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    // NULL pc here is later patched via the label (see FIXME above).
    set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
  }
}
 356 
 357 int MacroAssembler::biased_locking_enter(Register lock_reg,
 358                                          Register obj_reg,
 359                                          Register swap_reg,
 360                                          Register tmp_reg,
 361                                          bool swap_reg_contains_mark,
 362                                          Label& done,
 363                                          Label* slow_case,
 364                                          BiasedLockingCounters* counters) {
 365   assert(UseBiasedLocking, "why call this otherwise?");
 366   assert_different_registers(lock_reg, obj_reg, swap_reg);
 367 
 368   if (PrintBiasedLockingStatistics && counters == NULL)
 369     counters = BiasedLocking::counters();
 370 
 371   bool need_tmp_reg = false;
 372   if (tmp_reg == noreg) {
 373     tmp_reg = rscratch2;
 374   }
 375   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1);
 376   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
 377   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
 378   Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
 379   Address saved_mark_addr(lock_reg, 0);
 380 
 381   // Biased locking
 382   // See whether the lock is currently biased toward our thread and
 383   // whether the epoch is still valid
 384   // Note that the runtime guarantees sufficient alignment of JavaThread
 385   // pointers to allow age to be placed into low bits
 386   // First check to see whether biasing is even enabled for this object
 387   Label cas_label;
 388   int null_check_offset = -1;
 389   if (!swap_reg_contains_mark) {
 390     null_check_offset = offset();
 391     ldr(swap_reg, mark_addr);
 392   }
 393   andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
 394   cmp(tmp_reg, markOopDesc::biased_lock_pattern);
 395   br(Assembler::NE, cas_label);
 396   // The bias pattern is present in the object's header. Need to check
 397   // whether the bias owner and the epoch are both still current.
 398   load_prototype_header(tmp_reg, obj_reg);
 399   orr(tmp_reg, tmp_reg, rthread);
 400   eor(tmp_reg, swap_reg, tmp_reg);
 401   andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
 402   if (counters != NULL) {
 403     Label around;
 404     cbnz(tmp_reg, around);
 405     atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1);
 406     b(done);
 407     bind(around);
 408   } else {
 409     cbz(tmp_reg, done);
 410   }
 411 
 412   Label try_revoke_bias;
 413   Label try_rebias;
 414 
 415   // At this point we know that the header has the bias pattern and
 416   // that we are not the bias owner in the current epoch. We need to
 417   // figure out more details about the state of the header in order to
 418   // know what operations can be legally performed on the object's
 419   // header.
 420 
 421   // If the low three bits in the xor result aren't clear, that means
 422   // the prototype header is no longer biased and we have to revoke
 423   // the bias on this object.
 424   andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
 425   cbnz(rscratch1, try_revoke_bias);
 426 
 427   // Biasing is still enabled for this data type. See whether the
 428   // epoch of the current bias is still valid, meaning that the epoch
 429   // bits of the mark word are equal to the epoch bits of the
 430   // prototype header. (Note that the prototype header's epoch bits
 431   // only change at a safepoint.) If not, attempt to rebias the object
 432   // toward the current thread. Note that we must be absolutely sure
 433   // that the current epoch is invalid in order to do this because
 434   // otherwise the manipulations it performs on the mark word are
 435   // illegal.
 436   andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
 437   cbnz(rscratch1, try_rebias);
 438 
 439   // The epoch of the current bias is still valid but we know nothing
 440   // about the owner; it might be set or it might be clear. Try to
 441   // acquire the bias of the object using an atomic operation. If this
 442   // fails we will go in to the runtime to revoke the object's bias.
 443   // Note that we first construct the presumed unbiased header so we
 444   // don't accidentally blow away another thread's valid bias.
 445   {
 446     Label here;
 447     mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
 448     andr(swap_reg, swap_reg, rscratch1);
 449     orr(tmp_reg, swap_reg, rthread);
 450     cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
 451     // If the biasing toward our thread failed, this means that
 452     // another thread succeeded in biasing it toward itself and we
 453     // need to revoke that bias. The revocation will occur in the
 454     // interpreter runtime in the slow case.
 455     bind(here);
 456     if (counters != NULL) {
 457       atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
 458                   tmp_reg, rscratch1);
 459     }
 460   }
 461   b(done);
 462 
 463   bind(try_rebias);
 464   // At this point we know the epoch has expired, meaning that the
 465   // current "bias owner", if any, is actually invalid. Under these
 466   // circumstances _only_, we are allowed to use the current header's
 467   // value as the comparison value when doing the cas to acquire the
 468   // bias in the current epoch. In other words, we allow transfer of
 469   // the bias from one thread to another directly in this situation.
 470   //
 471   // FIXME: due to a lack of registers we currently blow away the age
 472   // bits in this situation. Should attempt to preserve them.
 473   {
 474     Label here;
 475     load_prototype_header(tmp_reg, obj_reg);
 476     orr(tmp_reg, rthread, tmp_reg);
 477     cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
 478     // If the biasing toward our thread failed, then another thread
 479     // succeeded in biasing it toward itself and we need to revoke that
 480     // bias. The revocation will occur in the runtime in the slow case.
 481     bind(here);
 482     if (counters != NULL) {
 483       atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
 484                   tmp_reg, rscratch1);
 485     }
 486   }
 487   b(done);
 488 
 489   bind(try_revoke_bias);
 490   // The prototype mark in the klass doesn't have the bias bit set any
 491   // more, indicating that objects of this data type are not supposed
 492   // to be biased any more. We are going to try to reset the mark of
 493   // this object to the prototype value and fall through to the
 494   // CAS-based locking scheme. Note that if our CAS fails, it means
 495   // that another thread raced us for the privilege of revoking the
 496   // bias of this particular object, so it's okay to continue in the
 497   // normal locking code.
 498   //
 499   // FIXME: due to a lack of registers we currently blow away the age
 500   // bits in this situation. Should attempt to preserve them.
 501   {
 502     Label here, nope;
 503     load_prototype_header(tmp_reg, obj_reg);
 504     cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
 505     bind(here);
 506 
 507     // Fall through to the normal CAS-based lock, because no matter what
 508     // the result of the above CAS, some thread must have succeeded in
 509     // removing the bias bit from the object's header.
 510     if (counters != NULL) {
 511       atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
 512                   rscratch1);
 513     }
 514     bind(nope);
 515   }
 516 
 517   bind(cas_label);
 518 
 519   return null_check_offset;
 520 }
 521 
// Emit the biased-locking fast path for unlock: if the mark word
// still carries the biased pattern, unlocking is a no-op and control
// branches to `done`; otherwise fall through to the caller's normal
// unlock sequence.
void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}
 536 
 537 
// added to make this compile

// Out-of-line definition for the `noreg` sentinel (the invalid
// register value tested via is_valid() throughout this file).
REGISTER_DEFINITION(Register, noreg);
 541 
 542 static void pass_arg0(MacroAssembler* masm, Register arg) {
 543   if (c_rarg0 != arg ) {
 544     masm->mov(c_rarg0, arg);
 545   }
 546 }
 547 
 548 static void pass_arg1(MacroAssembler* masm, Register arg) {
 549   if (c_rarg1 != arg ) {
 550     masm->mov(c_rarg1, arg);
 551   }
 552 }
 553 
 554 static void pass_arg2(MacroAssembler* masm, Register arg) {
 555   if (c_rarg2 != arg ) {
 556     masm->mov(c_rarg2, arg);
 557   }
 558 }
 559 
 560 static void pass_arg3(MacroAssembler* masm, Register arg) {
 561   if (c_rarg3 != arg ) {
 562     masm->mov(c_rarg3, arg);
 563   }
 564 }
 565 
// Common implementation behind the call_VM family: set the last Java
// frame, call `entry_point` in the VM with the current thread as the
// implicit first C argument, reset the frame anchor, then optionally
// forward any pending exception and fetch the oop result.
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
   // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0   , "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  // The label marks the return pc for the frame anchor; it is bound
  // inside call_VM_leaf_base.
  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true, true);

   // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    // A pending exception: jump to the shared forward-exception stub.
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}
 629 
// Thin wrapper: call_VM_base with the default thread and last_java_sp
// registers (both noreg; defaulted inside call_VM_base).
void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}
 633 
 634 void MacroAssembler::call(Address entry) {
 635   if (true // reachable(entry)
 636       ) {
 637     bl(entry);
 638   } else {
 639     lea(rscratch1, entry);
 640     blr(rscratch1);
 641   }
 642 }
 643 
// Emit an inline-cache (virtual) call to `entry`.  The cached-oop
// value is materialized in rscratch2 — initially Universe's
// non-oop sentinel word — and the site is tagged with a virtual_call
// relocation so it can be patched later.
void MacroAssembler::ic_call(address entry) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc());
  // movptr presumably emits a fixed-length movz/movk sequence (see
  // movptr()) so the constant remains patchable in place.
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  call(Address(entry, rh));
}
 652 
 653 // Implementation of call_VM versions
 654 
// call_VM, no explicit arguments (the thread is passed implicitly).
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}
 660 
// call_VM, one argument.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}
 668 
// call_VM, two arguments.  Arguments are shuffled last-to-first; the
// assert guards against pass_arg2 clobbering arg_1's register.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}
 679 
// call_VM, three arguments.  Arguments are shuffled last-to-first;
// the asserts guard against a later pass_arg clobbering an earlier
// argument's register.
void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}
 696 
// call_VM with an explicit last_java_sp, no explicit arguments.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}
 704 
// call_VM with explicit last_java_sp and one Java argument.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}
 713 
// call_VM with explicit last_java_sp and two Java arguments,
// marshalled last-to-first (see two-argument call_VM above for the
// clobbering rationale behind the assert).
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}
 726 
// call_VM with explicit last_java_sp and three Java arguments,
// marshalled last-to-first; asserts guard against pass_argN
// clobbering a not-yet-marshalled argument register.
void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}
 742 
 743 
// Fetch the oop result left by the VM in the thread-local vm_result
// slot, clear the slot (str zr), and verify the oop when VerifyOops
// is enabled.
void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}
 749 
// Fetch the metadata result from the thread-local vm_result_2 slot
// and clear the slot.  No oop verification: this is metadata, not an
// oop.
void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}
 754 
 755 void MacroAssembler::align(int modulus) {
 756   while (offset() % modulus != 0) nop();
 757 }
 758 
 759 // these are no-ops overridden by InterpreterMacroAssembler
 760 
// Intentionally empty here; InterpreterMacroAssembler overrides it.
void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }
 762 
// Intentionally empty here; InterpreterMacroAssembler overrides it.
void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
 764 
 765 
// Return a delayed value: if it has already been computed (non-zero
// at *delayed_value_addr) return it as a constant plus offset;
// otherwise emit code loading it into tmp at runtime.
RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}
 781 
 782 
// Forward the notification to Assembler::notify.  The bytecode_start
// case once bracketed the call with last_Java_frame bookkeeping; that
// code is preserved commented out from the port.
void MacroAssembler:: notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler:: notify(type);
    // reset_last_Java_frame(true, false);
  }
  else
    Assembler:: notify(type);
}
 792 
 793 // Look up the method for a megamorphic invokeinterface call.
 794 // The target method is determined by <intf_klass, itable_index>.
 795 // The receiver klass is in recv_klass.
 796 // On success, the result will be in method_result, and execution falls through.
 797 // On failure, execution transfers to the given label.
// See the block comment above: itable lookup for a megamorphic
// invokeinterface.  Scans the itableOffsetEntry list after the vtable
// for intf_klass; on a hit loads the method from the scaled
// itable_index slot, on a null entry branches to L_no_such_interface.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface) {
  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size() * wordSize;
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, InstanceKlass::vtable_length_offset() * wordSize));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  // scan_temp = recv_klass + vtable_length * 8 (lsl 3 == scale by wordSize)
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);
  if (HeapWordsPerLong > 1) {
    // Round up to align_object_offset boundary
    // see code for instanceKlass::start_of_itable!
    round_to(scan_temp, BytesPerLong);
  }

  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
  // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
  lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
  if (itentry_off)
    add(recv_klass, recv_klass, itentry_off);

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // The scan loop is peeled once: the first (peel==1) copy branches to
  // found_method on a hit; the second copy inverts the test so a hit
  // falls straight through to found_method.
  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  ldr(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
  ldr(method_result, Address(recv_klass, scan_temp));
}
 869 
 870 // virtual method calling
// Load the Method* for vtable_index of recv_klass into method_result.
// A constant index is folded into the load's displacement; a register
// index is scaled by wordSize with an lea first.
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = InstanceKlass::vtable_start_offset() * wordSize;
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result, Address(recv_klass, vtable_offset_in_bytes));
  }
}
 889 
// Convenience wrapper over the fast/slow subtype checks: branches to
// L_success if sub_klass is a subtype of super_klass, otherwise falls
// through (via the locally bound L_failure).
void MacroAssembler::check_klass_subtype(Register sub_klass,
                           Register super_klass,
                           Register temp_reg,
                           Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}
 899 
 900 
// Fast path of the subtype check: identity test, then a probe of the
// supertype display / secondary-super cache at super_check_offset.
// Exactly one of L_success/L_failure/L_slow_path may be NULL, meaning
// "fall through".  Undecided cases branch to L_slow_path.
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                        RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    // 32-bit load: super_check_offset is an int field.
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // compare against the displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    cmp(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}
 995 
 996 // These two are taken from x86, but they look generally useful
 997 
// scans count pointer sized words at [addr] for occurrence of value,
// generic
// Emit a loop scanning 'count' pointer-sized words starting at [addr]
// for 'value'.  Exits with flags set by the last cmp: EQ if found.
// Clobbers addr (post-incremented), count, and scratch.
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}
1012 
// scans count 4 byte words at [addr] for occurrence of value,
// generic
// 32-bit variant of repne_scan: loads with ldrw and compares with
// cmpw.  Exits with flags set by the last compare: EQ if found.
// NOTE(review): the post-increment step is wordSize (8) while the
// load is 4 bytes, so only every other 32-bit word is examined —
// confirm callers really scan pointer-stride entries.
void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldrw(scratch, post(addr, wordSize));
  cmpw(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}
1027 
// Slow path of the subtype check: linear scan of sub_klass's
// secondary-supers array for super_klass.  On a hit the result is
// cached in the secondary-super cache before branching to L_success
// (or falling through); a miss branches to L_failure.  At most one of
// L_success/L_failure may be NULL, meaning "fall through".
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  // Get super_klass value into r0 (even if it was in r5 or r2).
  // Spill any of r0/r2/r5 that the caller is not lending us as temps.
  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r5))    pushed_registers += r5;

  if (super_klass != r0 || UseCompressedOops) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

#ifndef PRODUCT
  // Bump the partial-subtype counter for diagnostics.
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r5, secondary_supers_addr);
  // Load the array length.  (Positive movl does right thing on LP64.)
  ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r5, r5, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, zr); // Clear Z flag; SP is never zero
  // Scan R2 words at [R5] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r5, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  br(Assembler::NE, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}
1109 
1110 
// When VerifyOops is enabled, emit a call to the verify_oop stub for
// the oop in 'reg', tagged with message 's'.  Preserves r0, rscratch1,
// rscratch2 and lr around the call by pushing them in two pairs.
void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    // Build the diagnostic string in the code cache (code_string) so
    // it outlives this ResourceMark.
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  mov(r0, reg);
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop");
}
1140 
// When VerifyOops is enabled, verify the oop stored at 'addr' (not in
// a register).  Same register save/restore discipline as verify_oop.
void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
  stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    // The two stp's above moved sp down by 4 words; compensate.
    lea(r0, addr);
    ldr(r0, Address(r0, 4 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  blr(rscratch2);

  ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
  ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));

  BLOCK_COMMENT("} verify_oop_addr");
}
1176 
// Form the address of an interpreter expression-stack argument slot.
// A constant slot folds into a displacement off esp; a register slot
// is scaled into rscratch1 first (so the result clobbers rscratch1).
Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  if (arg_slot.is_constant()) {
    return Address(esp, arg_slot.as_constant() * stackElementSize
                   + offset);
  } else {
    add(rscratch1, esp, arg_slot.as_register(),
        ext::uxtx, exact_log2(stackElementSize));
    return Address(rscratch1, offset);
  }
}
1195 
// Leaf call with integer-only arguments and an integral return type;
// delegates to call_VM_leaf_base1 with 0 FP arguments.
void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments,
                                       Label *retaddr) {
  call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
}
1201 
// Emit a VM leaf call: spills rscratch1/rmethod around a blrt to
// entry_point, optionally binding *retaddr immediately after the
// call, and issues maybe_isb() afterwards.
void MacroAssembler::call_VM_leaf_base1(address entry_point,
                                        int number_of_gp_arguments,
                                        int number_of_fp_arguments,
                                        ret_type type,
                                        Label *retaddr) {
  Label E, L;

  stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));

  // We add 1 to number_of_arguments because the thread in arg0 is
  // not counted
  mov(rscratch1, entry_point);
  blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
  if (retaddr)
    bind(*retaddr);

  ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
  maybe_isb();
}
1221 
// Leaf call with a caller-supplied argument count (args already in place).
void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}
1225 
// Leaf call with one argument, marshalled via pass_arg0.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf_base(entry_point, 1);
}
1230 
// Leaf call with two arguments, marshalled first-to-last.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  call_VM_leaf_base(entry_point, 2);
}
1236 
// Leaf call with three arguments, marshalled first-to-last.
void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
                                  Register arg_1, Register arg_2) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  pass_arg2(this, arg_2);
  call_VM_leaf_base(entry_point, 3);
}
1244 
// Leaf call that explicitly uses MacroAssembler's (not a subclass's)
// call_VM_leaf_base; one argument.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}
1249 
// Two-argument super leaf call; arguments marshalled last-to-first,
// with the assert guarding against pass_arg1 clobbering arg_0.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}
1257 
// Three-argument super leaf call; arguments marshalled last-to-first,
// asserts guard against clobbering not-yet-marshalled arguments.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}
1267 
// Four-argument super leaf call; arguments marshalled last-to-first,
// asserts guard against clobbering not-yet-marshalled arguments.
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert(arg_0 != c_rarg3, "smashed arg");
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}
1281 
1282 void MacroAssembler::null_check(Register reg, int offset) {
1283   if (needs_explicit_null_check(offset)) {
1284     // provoke OS NULL exception if reg = NULL by
1285     // accessing M[reg] w/o changing any registers
1286     // NOTE: this is plenty to provoke a segv
1287     ldr(zr, Address(reg));
1288   } else {
1289     // nothing to do, (later) access of M[reg + offset]
1290     // will provoke OS NULL exception if reg = NULL
1291   }
1292 }
1293 
1294 // MacroAssembler protected routines needed to implement
1295 // public methods
1296 
// Move the target of an Address into r as a relocatable 48-bit
// constant, recording dest's relocation spec at the instruction mark.
void MacroAssembler::mov(Register r, Address dest) {
  InstructionMark im(this);
  code_section()->relocate(inst_mark(), dest.rspec());
  u_int64_t imm64 = (u_int64_t)dest.target();
  movptr(r, imm64);
}
1303 
1304 // Move a constant pointer into r.  In AArch64 mode the virtual
1305 // address space is 48 bits in size, so we only need three
1306 // instructions to create a patchable instruction sequence that can
1307 // reach anywhere.
1308 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1309 #ifndef PRODUCT
1310   {
1311     char buffer[64];
1312     snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64);
1313     block_comment(buffer);
1314   }
1315 #endif
1316   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1317   movz(r, imm64 & 0xffff);
1318   imm64 >>= 16;
1319   movk(r, imm64 & 0xffff, 16);
1320   imm64 >>= 16;
1321   movk(r, imm64 & 0xffff, 32);
1322 }
1323 
1324 // Macro to mov replicated immediate to vector register.
1325 //  Vd will get the following values for different arrangements in T
1326 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1327 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1328 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1329 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1330 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1331 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1332 //   T1D/T2D: invalid
// See the table above for per-arrangement replication semantics.
// Chooses between building the value with MOVI+ORRIs or (when the
// bitwise complement has fewer non-zero bytes) MVNI+BICs, emitting
// one instruction per non-zero byte of the chosen form.
void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
  assert(T != T1D && T != T2D, "invalid arrangement");
  if (T == T8B || T == T16B) {
    assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
    movi(Vd, T, imm32 & 0xff, 0);
    return;
  }
  u_int32_t nimm32 = ~imm32;
  if (T == T4H || T == T8H) {
    assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
    imm32 &= 0xffff;
    nimm32 &= 0xffff;
  }
  // Count the non-zero bytes of the value and of its complement to
  // pick the cheaper encoding.
  u_int32_t x = imm32;
  int movi_cnt = 0;
  int movn_cnt = 0;
  while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
  x = nimm32;
  while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
  if (movn_cnt < movi_cnt) imm32 = nimm32;
  unsigned lsl = 0;
  // Skip leading zero bytes, then emit the first instruction...
  while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
  if (movn_cnt < movi_cnt)
    mvni(Vd, T, imm32 & 0xff, lsl);
  else
    movi(Vd, T, imm32 & 0xff, lsl);
  imm32 >>= 8; lsl += 8;
  // ...and patch in the remaining non-zero bytes one at a time.
  while (imm32) {
    while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
    if (movn_cnt < movi_cnt)
      bici(Vd, T, imm32 & 0xff, lsl);
    else
      orri(Vd, T, imm32 & 0xff, lsl);
    lsl += 8; imm32 >>= 8;
  }
}
1369 
1370 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1371 {
1372 #ifndef PRODUCT
1373   {
1374     char buffer[64];
1375     snprintf(buffer, sizeof(buffer), "0x%"PRIX64, imm64);
1376     block_comment(buffer);
1377   }
1378 #endif
1379   if (operand_valid_for_logical_immediate(false, imm64)) {
1380     orr(dst, zr, imm64);
1381   } else {
1382     // we can use a combination of MOVZ or MOVN with
1383     // MOVK to build up the constant
1384     u_int64_t imm_h[4];
1385     int zero_count = 0;
1386     int neg_count = 0;
1387     int i;
1388     for (i = 0; i < 4; i++) {
1389       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1390       if (imm_h[i] == 0) {
1391         zero_count++;
1392       } else if (imm_h[i] == 0xffffL) {
1393         neg_count++;
1394       }
1395     }
1396     if (zero_count == 4) {
1397       // one MOVZ will do
1398       movz(dst, 0);
1399     } else if (neg_count == 4) {
1400       // one MOVN will do
1401       movn(dst, 0);
1402     } else if (zero_count == 3) {
1403       for (i = 0; i < 4; i++) {
1404         if (imm_h[i] != 0L) {
1405           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1406           break;
1407         }
1408       }
1409     } else if (neg_count == 3) {
1410       // one MOVN will do
1411       for (int i = 0; i < 4; i++) {
1412         if (imm_h[i] != 0xffffL) {
1413           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1414           break;
1415         }
1416       }
1417     } else if (zero_count == 2) {
1418       // one MOVZ and one MOVK will do
1419       for (i = 0; i < 3; i++) {
1420         if (imm_h[i] != 0L) {
1421           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1422           i++;
1423           break;
1424         }
1425       }
1426       for (;i < 4; i++) {
1427         if (imm_h[i] != 0L) {
1428           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1429         }
1430       }
1431     } else if (neg_count == 2) {
1432       // one MOVN and one MOVK will do
1433       for (i = 0; i < 4; i++) {
1434         if (imm_h[i] != 0xffffL) {
1435           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1436           i++;
1437           break;
1438         }
1439       }
1440       for (;i < 4; i++) {
1441         if (imm_h[i] != 0xffffL) {
1442           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1443         }
1444       }
1445     } else if (zero_count == 1) {
1446       // one MOVZ and two MOVKs will do
1447       for (i = 0; i < 4; i++) {
1448         if (imm_h[i] != 0L) {
1449           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1450           i++;
1451           break;
1452         }
1453       }
1454       for (;i < 4; i++) {
1455         if (imm_h[i] != 0x0L) {
1456           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1457         }
1458       }
1459     } else if (neg_count == 1) {
1460       // one MOVN and two MOVKs will do
1461       for (i = 0; i < 4; i++) {
1462         if (imm_h[i] != 0xffffL) {
1463           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1464           i++;
1465           break;
1466         }
1467       }
1468       for (;i < 4; i++) {
1469         if (imm_h[i] != 0xffffL) {
1470           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1471         }
1472       }
1473     } else {
1474       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1475       movz(dst, (u_int32_t)imm_h[0], 0);
1476       for (i = 1; i < 4; i++) {
1477         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1478       }
1479     }
1480   }
1481 }
1482 
1483 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1484 {
1485 #ifndef PRODUCT
1486     {
1487       char buffer[64];
1488       snprintf(buffer, sizeof(buffer), "0x%"PRIX32, imm32);
1489       block_comment(buffer);
1490     }
1491 #endif
1492   if (operand_valid_for_logical_immediate(true, imm32)) {
1493     orrw(dst, zr, imm32);
1494   } else {
1495     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1496     // constant
1497     u_int32_t imm_h[2];
1498     imm_h[0] = imm32 & 0xffff;
1499     imm_h[1] = ((imm32 >> 16) & 0xffff);
1500     if (imm_h[0] == 0) {
1501       movzw(dst, imm_h[1], 16);
1502     } else if (imm_h[0] == 0xffff) {
1503       movnw(dst, imm_h[1] ^ 0xffff, 16);
1504     } else if (imm_h[1] == 0) {
1505       movzw(dst, imm_h[0], 0);
1506     } else if (imm_h[1] == 0xffff) {
1507       movnw(dst, imm_h[0] ^ 0xffff, 0);
1508     } else {
1509       // use a MOVZ and MOVK (makes it easier to debug)
1510       movzw(dst, imm_h[0], 0);
1511       movkw(dst, imm_h[1], 16);
1512     }
1513   }
1514 }
1515 
// Form an address from base + offset in Rd.  Rd may or may
// not actually be used: you must use the Address that is returned.
// It is up to you to ensure that the shift provided matches the size
// of your data.
Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
  if (Address::offset_ok_for_immed(byte_offset, shift))
    // It fits; no need for any heroics
    return Address(base, byte_offset);

  // Don't do anything clever with negative or misaligned offsets
  unsigned mask = (1 << shift) - 1;
  if (byte_offset < 0 || byte_offset & mask) {
    // materialize the full offset in Rd and add it to base
    mov(Rd, byte_offset);
    add(Rd, base, Rd);
    return Address(Rd);
  }

  // See if we can do this with two 12-bit offsets
  {
    unsigned long word_offset = byte_offset >> shift;
    // high 12 bits of the word offset, handled by an ADD immediate
    unsigned long masked_offset = word_offset & 0xfff000;
    if (Address::offset_ok_for_immed(word_offset - masked_offset)
        && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
      // ADD folds the high part into Rd; the low part stays an
      // immediate offset on the returned Address
      add(Rd, base, masked_offset << shift);
      word_offset -= masked_offset;
      return Address(Rd, word_offset << shift);
    }
  }

  // Do it the hard way: full immediate in Rd, register-offset address
  mov(Rd, byte_offset);
  add(Rd, base, Rd);
  return Address(Rd);
}
1550 
// Atomically increment the 32-bit word at counter_addr using a
// load-exclusive/store-exclusive retry loop.  tmp is clobbered.
void MacroAssembler::atomic_incw(Register counter_addr, Register tmp) {
  Label retry_load;
  bind(retry_load);
  // flush and load exclusive from the memory location
  ldxrw(tmp, counter_addr);
  addw(tmp, tmp, 1);
  // if we store+flush with no intervening write tmp will be zero
  stxrw(tmp, tmp, counter_addr);
  // non-zero status means the exclusive store failed: retry
  cbnzw(tmp, retry_load);
}
1561 
1562 
1563 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1564                                     bool want_remainder, Register scratch)
1565 {
1566   // Full implementation of Java idiv and irem.  The function
1567   // returns the (pc) offset of the div instruction - may be needed
1568   // for implicit exceptions.
1569   //
1570   // constraint : ra/rb =/= scratch
1571   //         normal case
1572   //
1573   // input : ra: dividend
1574   //         rb: divisor
1575   //
1576   // result: either
1577   //         quotient  (= ra idiv rb)
1578   //         remainder (= ra irem rb)
1579 
1580   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1581 
1582   int idivl_offset = offset();
1583   if (! want_remainder) {
1584     sdivw(result, ra, rb);
1585   } else {
1586     sdivw(scratch, ra, rb);
1587     Assembler::msubw(result, scratch, rb, ra);
1588   }
1589 
1590   return idivl_offset;
1591 }
1592 
1593 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1594                                     bool want_remainder, Register scratch)
1595 {
1596   // Full implementation of Java ldiv and lrem.  The function
1597   // returns the (pc) offset of the div instruction - may be needed
1598   // for implicit exceptions.
1599   //
1600   // constraint : ra/rb =/= scratch
1601   //         normal case
1602   //
1603   // input : ra: dividend
1604   //         rb: divisor
1605   //
1606   // result: either
1607   //         quotient  (= ra idiv rb)
1608   //         remainder (= ra irem rb)
1609 
1610   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1611 
1612   int idivq_offset = offset();
1613   if (! want_remainder) {
1614     sdiv(result, ra, rb);
1615   } else {
1616     sdiv(scratch, ra, rb);
1617     Assembler::msub(result, scratch, rb, ra);
1618   }
1619 
1620   return idivq_offset;
1621 }
1622 
1623 // MacroAssembler routines found actually to be needed
1624 
1625 void MacroAssembler::push(Register src)
1626 {
1627   str(src, Address(pre(esp, -1 * wordSize)));
1628 }
1629 
1630 void MacroAssembler::pop(Register dst)
1631 {
1632   ldr(dst, Address(post(esp, 1 * wordSize)));
1633 }
1634 
1635 // Note: load_unsigned_short used to be called load_unsigned_word.
1636 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1637   int off = offset();
1638   ldrh(dst, src);
1639   return off;
1640 }
1641 
1642 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1643   int off = offset();
1644   ldrb(dst, src);
1645   return off;
1646 }
1647 
1648 int MacroAssembler::load_signed_short(Register dst, Address src) {
1649   int off = offset();
1650   ldrsh(dst, src);
1651   return off;
1652 }
1653 
1654 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1655   int off = offset();
1656   ldrsb(dst, src);
1657   return off;
1658 }
1659 
1660 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1661   int off = offset();
1662   ldrshw(dst, src);
1663   return off;
1664 }
1665 
1666 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1667   int off = offset();
1668   ldrsbw(dst, src);
1669   return off;
1670 }
1671 
1672 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1673   switch (size_in_bytes) {
1674   case  8:  ldr(dst, src); break;
1675   case  4:  ldrw(dst, src); break;
1676   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1677   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1678   default:  ShouldNotReachHere();
1679   }
1680 }
1681 
1682 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1683   switch (size_in_bytes) {
1684   case  8:  str(src, dst); break;
1685   case  4:  strw(src, dst); break;
1686   case  2:  strh(src, dst); break;
1687   case  1:  strb(src, dst); break;
1688   default:  ShouldNotReachHere();
1689   }
1690 }
1691 
1692 void MacroAssembler::decrementw(Register reg, int value)
1693 {
1694   if (value < 0)  { incrementw(reg, -value);      return; }
1695   if (value == 0) {                               return; }
1696   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1697   /* else */ {
1698     guarantee(reg != rscratch2, "invalid dst for register decrement");
1699     movw(rscratch2, (unsigned)value);
1700     subw(reg, reg, rscratch2);
1701   }
1702 }
1703 
1704 void MacroAssembler::decrement(Register reg, int value)
1705 {
1706   if (value < 0)  { increment(reg, -value);      return; }
1707   if (value == 0) {                              return; }
1708   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1709   /* else */ {
1710     assert(reg != rscratch2, "invalid dst for register decrement");
1711     mov(rscratch2, (unsigned long)value);
1712     sub(reg, reg, rscratch2);
1713   }
1714 }
1715 
// Decrement the 32-bit word in memory at dst by value.
// Read-modify-write through rscratch1, so dst must not use rscratch1.
void MacroAssembler::decrementw(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address decrement");
  ldrw(rscratch1, dst);
  decrementw(rscratch1, value);
  strw(rscratch1, dst);
}
1723 
// Decrement the 64-bit word in memory at dst by value.
// Read-modify-write through rscratch1, so dst must not use rscratch1.
void MacroAssembler::decrement(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid address for decrement");
  ldr(rscratch1, dst);
  decrement(rscratch1, value);
  str(rscratch1, dst);
}
1731 
1732 void MacroAssembler::incrementw(Register reg, int value)
1733 {
1734   if (value < 0)  { decrementw(reg, -value);      return; }
1735   if (value == 0) {                               return; }
1736   if (value < (1 << 12)) { addw(reg, reg, value); return; }
1737   /* else */ {
1738     assert(reg != rscratch2, "invalid dst for register increment");
1739     movw(rscratch2, (unsigned)value);
1740     addw(reg, reg, rscratch2);
1741   }
1742 }
1743 
1744 void MacroAssembler::increment(Register reg, int value)
1745 {
1746   if (value < 0)  { decrement(reg, -value);      return; }
1747   if (value == 0) {                              return; }
1748   if (value < (1 << 12)) { add(reg, reg, value); return; }
1749   /* else */ {
1750     assert(reg != rscratch2, "invalid dst for register increment");
1751     movw(rscratch2, (unsigned)value);
1752     add(reg, reg, rscratch2);
1753   }
1754 }
1755 
// Increment the 32-bit word in memory at dst by value.
// Read-modify-write through rscratch1, so dst must not use rscratch1.
void MacroAssembler::incrementw(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  ldrw(rscratch1, dst);
  incrementw(rscratch1, value);
  strw(rscratch1, dst);
}
1763 
// Increment the 64-bit word in memory at dst by value.
// Read-modify-write through rscratch1, so dst must not use rscratch1.
void MacroAssembler::increment(Address dst, int value)
{
  assert(!dst.uses(rscratch1), "invalid dst for address increment");
  ldr(rscratch1, dst);
  increment(rscratch1, value);
  str(rscratch1, dst);
}
1771 
1772 
// Push general registers r0-r30 (bitmask 0x7fffffff) onto the system stack.
void MacroAssembler::pusha() {
  push(0x7fffffff, sp);
}
1776 
// Pop general registers r0-r30 (bitmask 0x7fffffff) from the system stack.
void MacroAssembler::popa() {
  pop(0x7fffffff, sp);
}
1780 
// Push lots of registers in the bit set supplied.  Don't push sp.
// Return the number of words pushed.  Registers are stored in pairs
// with stp; an odd register count is padded with zr so the stack stays
// 16-byte aligned.
int MacroAssembler::push(unsigned int bitset, Register stack) {
  int words_pushed = 0;

  // Scan bitset to accumulate register pairs
  unsigned char regs[32];
  int count = 0;
  for (int reg = 0; reg <= 30; reg++) {
    if (1 & bitset)
      regs[count++] = reg;
    bitset >>= 1;
  }
  // pad with zr, then round down: count is always even afterwards
  regs[count++] = zr->encoding_nocheck();
  count &= ~1;  // Only push an even number of regs

  if (count) {
    // first pair pre-decrements the stack by the whole frame
    stp(as_Register(regs[0]), as_Register(regs[1]),
       Address(pre(stack, -count * wordSize)));
    words_pushed += 2;
  }
  // remaining pairs are stored at fixed offsets within the new frame
  for (int i = 2; i < count; i += 2) {
    stp(as_Register(regs[i]), as_Register(regs[i+1]),
       Address(stack, i * wordSize));
    words_pushed += 2;
  }

  assert(words_pushed == count, "oops, pushed != count");

  return count;
}
1812 
// Pop the registers in the bit set supplied, mirroring push(bitset).
// Returns the number of words popped.  Inner pairs are loaded first;
// the final ldp post-increments the stack by the whole frame.
int MacroAssembler::pop(unsigned int bitset, Register stack) {
  int words_pushed = 0;

  // Scan bitset to accumulate register pairs
  unsigned char regs[32];
  int count = 0;
  for (int reg = 0; reg <= 30; reg++) {
    if (1 & bitset)
      regs[count++] = reg;
    bitset >>= 1;
  }
  // pad with zr, then round down: count is always even afterwards
  regs[count++] = zr->encoding_nocheck();
  count &= ~1;

  for (int i = 2; i < count; i += 2) {
    ldp(as_Register(regs[i]), as_Register(regs[i+1]),
       Address(stack, i * wordSize));
    words_pushed += 2;
  }
  if (count) {
    // last pair releases the whole frame via post-increment
    ldp(as_Register(regs[0]), as_Register(regs[1]),
       Address(post(stack, count * wordSize)));
    words_pushed += 2;
  }

  assert(words_pushed == count, "oops, pushed != count");

  return count;
}
1842 #ifdef ASSERT
// Debug-only check that rheapbase still holds the compressed-oops base.
// The body is currently disabled (#if 0), so this is a no-op stub.
void MacroAssembler::verify_heapbase(const char* msg) {
#if 0
  assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  if (CheckCompressedOops) {
    Label ok;
    push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
    cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
    br(Assembler::EQ, ok);
    stop(msg);
    bind(ok);
    pop(1 << rscratch1->encoding(), sp);
  }
#endif
}
1858 #endif
1859 
// Emit code that halts execution with a diagnostic message: saves all
// general registers, then calls debug64(msg, pc, regs) with the saved
// register block (sp after pusha) as the regs argument.
void MacroAssembler::stop(const char* msg) {
  address ip = pc();
  pusha();
  mov(c_rarg0, (address)msg);
  mov(c_rarg1, (address)ip);
  mov(c_rarg2, sp);             // saved registers live at sp after pusha
  mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
  // call(c_rarg3);
  blrt(c_rarg3, 3, 0, 1);
  hlt(0);                       // trap if debug64 ever returns
}
1871 
1872 // If a constant does not fit in an immediate field, generate some
1873 // number of MOV instructions and then perform the operation.
1874 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
1875                                            add_sub_imm_insn insn1,
1876                                            add_sub_reg_insn insn2) {
1877   assert(Rd != zr, "Rd = zr and not setting flags?");
1878   if (operand_valid_for_add_sub_immediate((int)imm)) {
1879     (this->*insn1)(Rd, Rn, imm);
1880   } else {
1881     if (uabs(imm) < (1 << 24)) {
1882        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
1883        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
1884     } else {
1885        assert_different_registers(Rd, Rn);
1886        mov(Rd, (uint64_t)imm);
1887        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
1888     }
1889   }
1890 }
1891 
// Separate version which sets the flags. Optimisations are more restricted
// because we must set the flags correctly.
1894 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
1895                                            add_sub_imm_insn insn1,
1896                                            add_sub_reg_insn insn2) {
1897   if (operand_valid_for_add_sub_immediate((int)imm)) {
1898     (this->*insn1)(Rd, Rn, imm);
1899   } else {
1900     assert_different_registers(Rd, Rn);
1901     assert(Rd != zr, "overflow in immediate operand");
1902     mov(Rd, (uint64_t)imm);
1903     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
1904   }
1905 }
1906 
1907 
1908 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
1909   if (increment.is_register()) {
1910     add(Rd, Rn, increment.as_register());
1911   } else {
1912     add(Rd, Rn, increment.as_constant());
1913   }
1914 }
1915 
1916 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
1917   if (increment.is_register()) {
1918     addw(Rd, Rn, increment.as_register());
1919   } else {
1920     addw(Rd, Rn, increment.as_constant());
1921   }
1922 }
1923 
// Reload rheapbase with the compressed-oops base.  Once the VM is
// fully initialized the base is a known constant; earlier we must
// load it indirectly through its address.
void MacroAssembler::reinit_heapbase()
{
  if (UseCompressedOops) {
    if (Universe::is_fully_initialized()) {
      mov(rheapbase, Universe::narrow_ptrs_base());
    } else {
      lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
      ldr(rheapbase, Address(rheapbase));
    }
  }
}
1935 
1936 // this simulates the behaviour of the x86 cmpxchg instruction using a
1937 // load linked/store conditional pair. we use the acquire/release
1938 // versions of these instructions so that we flush pending writes as
1939 // per Java semantics.
1940 
1941 // n.b the x86 version assumes the old value to be compared against is
1942 // in rax and updates rax with the value located in memory if the
1943 // cmpxchg fails. we supply a register for the old value explicitly
1944 
1945 // the aarch64 load linked/store conditional instructions do not
1946 // accept an offset. so, unlike x86, we must provide a plain register
1947 // to identify the memory word to be compared/exchanged rather than a
1948 // register+offset Address.
1949 
void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
                                Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  // tmp returns 0/1 for success/failure
  Label retry_load, nope;

  bind(retry_load);
  // flush and load exclusive from the memory location
  // and fail if it is not what we expect
  ldaxr(tmp, addr);
  cmp(tmp, oldv);
  br(Assembler::NE, nope);
  // if we store+flush with no intervening write tmp will be zero
  stlxr(tmp, newv, addr);
  cbzw(tmp, succeed);
  // retry so we only ever return after a load fails to compare
  // ensures we don't return a stale value after a failed write.
  b(retry_load);
  // if the memory word differs we return it in oldv and signal a fail
  bind(nope);
  membar(AnyAny);
  mov(oldv, tmp);
  if (fail)
    b(*fail);
}
1977 
// 32-bit variant of cmpxchgptr: same protocol, using the word-sized
// exclusive load/store instructions.
void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
                                Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  // tmp returns 0/1 for success/failure
  Label retry_load, nope;

  bind(retry_load);
  // flush and load exclusive from the memory location
  // and fail if it is not what we expect
  ldaxrw(tmp, addr);
  cmp(tmp, oldv);
  br(Assembler::NE, nope);
  // if we store+flush with no intervening write tmp will be zero
  stlxrw(tmp, newv, addr);
  cbzw(tmp, succeed);
  // retry so we only ever return after a load fails to compare
  // ensures we don't return a stale value after a failed write.
  b(retry_load);
  // if the memory word differs we return it in oldv and signal a fail
  bind(nope);
  membar(AnyAny);
  mov(oldv, tmp);
  if (fail)
    b(*fail);
}
2005 
2006 static bool different(Register a, RegisterOrConstant b, Register c) {
2007   if (b.is_constant())
2008     return a != c;
2009   else
2010     return a != b.as_register() && a != c && b.as_register() != c;
2011 }
2012 
// Define atomic_add / atomic_addw: atomically apply OP to the word at
// addr via a load-exclusive/store-exclusive retry loop.  When prev is a
// valid register, the value previously in memory is returned in it.
// rscratch1 is clobbered; rscratch2 is clobbered when it has to serve
// as the load destination.
#define ATOMIC_OP(LDXR, OP, STXR)                                       \
void MacroAssembler::atomic_##OP(Register prev, RegisterOrConstant incr, Register addr) { \
  Register result = rscratch2;                                          \
  if (prev->is_valid())                                                      \
    result = different(prev, incr, addr) ? prev : rscratch2;            \
                                                                        \
  Label retry_load;                                                     \
  bind(retry_load);                                                     \
  LDXR(result, addr);                                                   \
  OP(rscratch1, result, incr);                                          \
  STXR(rscratch1, rscratch1, addr);                                     \
  cbnzw(rscratch1, retry_load);                                         \
  if (prev->is_valid() && prev != result)                            \
    mov(prev, result);                                                  \
}

// 64-bit and 32-bit instantiations.
ATOMIC_OP(ldxr, add, stxr)
ATOMIC_OP(ldxrw, addw, stxrw)

#undef ATOMIC_OP
2033 
// Define atomic_xchg / atomic_xchgw: atomically store newv to the word
// at addr via a load-exclusive/store-exclusive retry loop.  When prev
// is a valid register, the value previously in memory is returned in
// it.  rscratch1 is clobbered; rscratch2 is clobbered when it has to
// serve as the load destination.
#define ATOMIC_XCHG(OP, LDXR, STXR)                                     \
void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
  Register result = rscratch2;                                          \
  if (prev->is_valid())                                                      \
    result = different(prev, newv, addr) ? prev : rscratch2;            \
                                                                        \
  Label retry_load;                                                     \
  bind(retry_load);                                                     \
  LDXR(result, addr);                                                   \
  STXR(rscratch1, newv, addr);                                          \
  cbnzw(rscratch1, retry_load);                                         \
  if (prev->is_valid() && prev != result)                            \
    mov(prev, result);                                                  \
}

// 64-bit and 32-bit instantiations.
ATOMIC_XCHG(xchg, ldxr, stxr)
ATOMIC_XCHG(xchgw, ldxrw, stxrw)

#undef ATOMIC_XCHG
2053 
2054 void MacroAssembler::incr_allocated_bytes(Register thread,
2055                                           Register var_size_in_bytes,
2056                                           int con_size_in_bytes,
2057                                           Register t1) {
2058   if (!thread->is_valid()) {
2059     thread = rthread;
2060   }
2061   assert(t1->is_valid(), "need temp reg");
2062 
2063   ldr(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
2064   if (var_size_in_bytes->is_valid()) {
2065     add(t1, t1, var_size_in_bytes);
2066   } else {
2067     add(t1, t1, con_size_in_bytes);
2068   }
2069   str(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
2070 }
2071 
2072 #ifndef PRODUCT
2073 extern "C" void findpc(intptr_t x);
2074 #endif
2075 
2076 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2077 {
2078   // In order to get locks to work, we need to fake a in_VM state
2079   if (ShowMessageBoxOnError ) {
2080     JavaThread* thread = JavaThread::current();
2081     JavaThreadState saved_state = thread->thread_state();
2082     thread->set_thread_state(_thread_in_vm);
2083 #ifndef PRODUCT
2084     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2085       ttyLocker ttyl;
2086       BytecodeCounter::print();
2087     }
2088 #endif
2089     if (os::message_box(msg, "Execution stopped, print registers?")) {
2090       ttyLocker ttyl;
2091       tty->print_cr(" pc = 0x%016lx", pc);
2092 #ifndef PRODUCT
2093       tty->cr();
2094       findpc(pc);
2095       tty->cr();
2096 #endif
2097       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2098       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2099       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2100       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2101       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2102       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2103       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2104       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2105       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2106       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2107       tty->print_cr("r10 = 0x%016lx", regs[10]);
2108       tty->print_cr("r11 = 0x%016lx", regs[11]);
2109       tty->print_cr("r12 = 0x%016lx", regs[12]);
2110       tty->print_cr("r13 = 0x%016lx", regs[13]);
2111       tty->print_cr("r14 = 0x%016lx", regs[14]);
2112       tty->print_cr("r15 = 0x%016lx", regs[15]);
2113       tty->print_cr("r16 = 0x%016lx", regs[16]);
2114       tty->print_cr("r17 = 0x%016lx", regs[17]);
2115       tty->print_cr("r18 = 0x%016lx", regs[18]);
2116       tty->print_cr("r19 = 0x%016lx", regs[19]);
2117       tty->print_cr("r20 = 0x%016lx", regs[20]);
2118       tty->print_cr("r21 = 0x%016lx", regs[21]);
2119       tty->print_cr("r22 = 0x%016lx", regs[22]);
2120       tty->print_cr("r23 = 0x%016lx", regs[23]);
2121       tty->print_cr("r24 = 0x%016lx", regs[24]);
2122       tty->print_cr("r25 = 0x%016lx", regs[25]);
2123       tty->print_cr("r26 = 0x%016lx", regs[26]);
2124       tty->print_cr("r27 = 0x%016lx", regs[27]);
2125       tty->print_cr("r28 = 0x%016lx", regs[28]);
2126       tty->print_cr("r30 = 0x%016lx", regs[30]);
2127       tty->print_cr("r31 = 0x%016lx", regs[31]);
2128       BREAKPOINT;
2129     }
2130     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2131   } else {
2132     ttyLocker ttyl;
2133     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2134                     msg);
2135     assert(false, err_msg("DEBUG MESSAGE: %s", msg));
2136   }
2137 }
2138 
2139 #ifdef BUILTIN_SIM
2140 // routine to generate an x86 prolog for a stub function which
2141 // bootstraps into the generated ARM code which directly follows the
2142 // stub
2143 //
2144 // the argument encodes the number of general and fp registers
2145 // passed by the caller and the callng convention (currently just
2146 // the number of general registers and assumes C argument passing)
2147 
2148 extern "C" {
2149 int aarch64_stub_prolog_size();
2150 void aarch64_stub_prolog();
2151 void aarch64_prolog();
2152 }
2153 
// Emit the x86 bootstrap prolog for a simulator stub: reserve space
// with nops, overwrite it with the pre-assembled aarch64_stub_prolog
// code, then patch the prolog pointer and call-format word into the
// last two 64-bit slots of the copied code.
void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
                                   address *prolog_ptr)
{
  // pack return type and argument counts into the call-format word
  int calltype = (((ret_type & 0x3) << 8) |
                  ((fp_arg_count & 0xf) << 4) |
                  (gp_arg_count & 0xf));

  // the addresses for the x86 to ARM entry code we need to use
  address start = pc();
  // printf("start = %lx\n", start);
  int byteCount =  aarch64_stub_prolog_size();
  // printf("byteCount = %x\n", byteCount);
  int instructionCount = (byteCount + 3)/ 4;
  // printf("instructionCount = %x\n", instructionCount);
  // emit nops to reserve space, rounded up to whole instructions
  for (int i = 0; i < instructionCount; i++) {
    nop();
  }

  // replace the reserved space with the prolog machine code
  memcpy(start, (void*)aarch64_stub_prolog, byteCount);

  // write the address of the setup routine and the call format at the
  // end of into the copied code
  u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
  if (prolog_ptr)
    patch_end[-2] = (u_int64_t)prolog_ptr;
  patch_end[-1] = calltype;
}
2181 #endif
2182 
// Save integer registers r0-r29 (mask 0x3fffffff — excludes lr and sp)
// followed by all 32 FP registers, pushed in pairs from v30/v31 down.
// pop_CPU_state() must restore in the exact reverse order.
void MacroAssembler::push_CPU_state() {
    push(0x3fffffff, sp);         // integer registers except lr & sp

    for (int i = 30; i >= 0; i -= 2)
      stpd(as_FloatRegister(i), as_FloatRegister(i+1),
           Address(pre(sp, -2 * wordSize)));
}
2190 
// Restore the state saved by push_CPU_state(): FP registers first
// (pairs from v0/v1 upward, mirroring the push order), then the
// integer registers.
void MacroAssembler::pop_CPU_state() {
  for (int i = 0; i < 32; i += 2)
    ldpd(as_FloatRegister(i), as_FloatRegister(i+1),
         Address(post(sp, 2 * wordSize)));

  pop(0x3fffffff, sp);         // integer registers except lr & sp
}
2198 
2199 /**
2200  * Helpers for multiply_to_len().
2201  */
// Compute final_dest_hi:dest_lo = dest_hi:dest_lo + src1 + src2,
// propagating the carry out of each low-word addition into the high
// word.  The instruction order is significant: each adc must directly
// follow the adds whose carry it consumes.
void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
                                     Register src1, Register src2) {
  adds(dest_lo, dest_lo, src1);
  adc(dest_hi, dest_hi, zr);
  adds(dest_lo, dest_lo, src2);
  adc(final_dest_hi, dest_hi, zr);
}
2209 
2210 // Generate an address from (r + r1 extend offset).  "size" is the
2211 // size of the operand.  The result may be in rscratch2.
2212 Address MacroAssembler::offsetted_address(Register r, Register r1,
2213                                           Address::extend ext, int offset, int size) {
2214   if (offset || (ext.shift() % size != 0)) {
2215     lea(rscratch2, Address(r, r1, ext));
2216     return Address(rscratch2, offset);
2217   } else {
2218     return Address(r, r1, ext);
2219   }
2220 }
2221 
2222 /**
2223  * Multiply 64 bit by 64 bit first loop.
2224  */
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //
  // The arrays hold 32-bit big-endian words; 64-bit values are loaded
  // as word pairs and byte-swapped with 32-bit rotates.

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  // odd xstart: only a single 32-bit word of x remains
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_one_x);

  lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
  ldr(x_xstart, Address(rscratch1));
  ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_first_loop_exit);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_one_y);
  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(y_idx, Address(rscratch1));
  ror(y_idx, y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);

  // AArch64 has a multiply-accumulate instruction that we can't use
  // here because it has no way to process carries, so we have to use
  // separate add and adc instructions.  Bah.
  umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
  mul(product, x_xstart, y_idx);
  adds(product, product, carry);
  adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product

  subw(kdx, kdx, 2);
  ror(product, product, 32); // back to big-endian
  str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));

  b(L_first_loop);

  bind(L_one_y);
  // odd idx: a single 32-bit word of y remains
  ldrw(y_idx, Address(y,  0));
  b(L_multiply);

  bind(L_one_x);
  ldrw(x_xstart, Address(x,  0));
  b(L_first_loop);

  bind(L_first_loop_exit);
}
2283 
2284 /**
2285  * Multiply 128 bit by 128. Unrolled inner loop.
2286  *
2287  */
void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
                                             Register carry, Register carry2,
                                             Register idx, Register jdx,
                                             Register yz_idx1, Register yz_idx2,
                                             Register tmp, Register tmp3, Register tmp4,
                                             Register tmp6, Register product_hi) {

  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
  //     jlong carry2  = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
  //     carry  = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx] = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry  = (jlong)(yz_idx1 >>> 64);
  //   }
  //
  // The main loop below is unrolled to process four 32-bit words (two
  // 64-bit values) of y and z per iteration; the big-endian word pairs
  // are byte-swapped with 32-bit rotates around each multiply.

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  // jdx = number of unrolled (4-word) iterations
  lsrw(jdx, idx, 2);

  bind(L_third_loop);

  subsw(jdx, jdx, 1);
  br(Assembler::MI, L_third_loop_exit);
  subw(idx, idx, 4);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));

  ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));

  lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  ror(yz_idx2, yz_idx2, 32);

  ldp(rscratch2, rscratch1, Address(tmp6, 0));

  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);

  ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
  ror(rscratch2, rscratch2, 32);

  mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
  umulh(carry2, product_hi, yz_idx2);

  // propagate sum of both multiplications into carry:tmp4:tmp3
  adds(tmp3, tmp3, carry);
  adc(tmp4, tmp4, zr);
  adds(tmp3, tmp3, rscratch1);
  adcs(tmp4, tmp4, tmp);
  adc(carry, carry2, zr);
  adds(tmp4, tmp4, rscratch2);
  adc(carry, carry, zr);

  ror(tmp3, tmp3, 32); // convert little-endian to big-endian
  ror(tmp4, tmp4, 32);
  stp(tmp4, tmp3, Address(tmp6, 0));

  b(L_third_loop);
  bind (L_third_loop_exit);

  // handle up to three remaining 32-bit words of y/z
  andw (idx, idx, 0x3);
  cbz(idx, L_post_third_loop_done);

  Label L_check_1;
  // two remaining words: one 64-bit multiply-accumulate
  subsw(idx, idx, 2);
  br(Assembler::MI, L_check_1);

  lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx1, Address(rscratch1, 0));
  ror(yz_idx1, yz_idx1, 32);
  mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
  umulh(tmp4, product_hi, yz_idx1);
  lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  ldr(yz_idx2, Address(rscratch1, 0));
  ror(yz_idx2, yz_idx2, 32);

  add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);

  ror(tmp3, tmp3, 32);
  str(tmp3, Address(rscratch1, 0));

  bind (L_check_1);

  // final odd 32-bit word, if any
  andw (idx, idx, 0x1);
  subsw(idx, idx, 1);
  br(Assembler::MI, L_post_third_loop_done);
  ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
  mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
  umulh(carry2, tmp4, product_hi);
  ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));

  add2_with_carry(carry2, tmp3, tmp4, carry);

  strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
  extr(carry, carry2, tmp3, 32);

  bind(L_post_third_loop_done);
}
2397 
2398 /**
 * Code for BigInteger::multiplyToLen() intrinsic.
2400  *
2401  * r0: x
2402  * r1: xlen
2403  * r2: y
2404  * r3: ylen
2405  * r4:  z
2406  * r5: zlen
2407  * r10: tmp1
2408  * r11: tmp2
2409  * r12: tmp3
2410  * r13: tmp4
2411  * r14: tmp5
2412  * r15: tmp6
2413  * r16: tmp7
2414  *
2415  */
// Multi-precision multiply: z[0..zlen-1] = x[0..xlen-1] * y[0..ylen-1],
// operating on arrays of 32-bit (big-endian within the array) digits.
// Register roles are documented in the block comment above.  Clobbers
// all temps plus rscratch1/rscratch2; saves/restores x, ylen, z, xstart
// on the stack around the inner-loop helper.
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                                     Register z, Register zlen,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                                     Register tmp5, Register tmp6, Register product_hi) {

  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product  = xlen;
  const Register x_xstart = zlen;  // reuse register

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  movw(idx, ylen);      // idx = ylen;
  movw(kdx, zlen);      // kdx = xlen+ylen;
  mov(carry, zr);       // carry = 0;

  Label L_done;

  // Empty x (xlen == 0): nothing to multiply.
  movw(xstart, xlen);
  subsw(xstart, xstart, 1);
  br(Assembler::MI, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  Label L_second_loop;
  cbzw(kdx, L_second_loop);

  Label L_carry;
  subw(kdx, kdx, 1);
  cbzw(kdx, L_carry);

  // Store both halves of the leftover 64-bit carry as two 32-bit digits.
  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
  lsr(carry, carry, 32);
  subw(kdx, kdx, 1);

  bind(L_carry);
  strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));

  // Second and third (nested) loops.
  //
  // for (int i = xstart-1; i >= 0; i--) { // Second loop
  //   carry = 0;
  //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
  //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
  //                    (z[k] & LONG_MASK) + carry;
  //     z[k] = (int)product;
  //     carry = product >>> 32;
  //   }
  //   z[i] = (int)carry;
  // }
  //
  // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi

  const Register jdx = tmp1;

  bind(L_second_loop);
  mov(carry, zr);                // carry = 0;
  movw(jdx, ylen);               // j = ystart+1

  subsw(xstart, xstart, 1);      // i = xstart-1;
  br(Assembler::MI, L_done);

  str(z, Address(pre(sp, -4 * wordSize)));  // reserve 4 slots: [z, ylen, x, xstart]

  Label L_last_x;
  lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
  subsw(xstart, xstart, 1);       // i = xstart-1;
  br(Assembler::MI, L_last_x);    // only a single (top) 32-bit digit of x left

  // Load the next 64-bit chunk of x as the multiplier.
  lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
  ldr(product_hi, Address(rscratch1));
  ror(product_hi, product_hi, 32);  // convert big-endian to little-endian

  Label L_third_loop_prologue;
  bind(L_third_loop_prologue);

  // Save remaining caller state into the reserved slots before the
  // inner loop clobbers the registers.
  str(ylen, Address(sp, wordSize));
  stp(x, xstart, Address(sp, 2 * wordSize));
  multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
                          tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
  ldp(z, ylen, Address(post(sp, 2 * wordSize)));
  ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen

  // Store the carry out of the inner loop as the next result digit(s).
  addw(tmp3, xlen, 1);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  subsw(tmp3, tmp3, 1);
  br(Assembler::MI, L_done);

  lsr(carry, carry, 32);
  strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
  b(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  ldrw(product_hi, Address(x,  0));  // only the low 32 bits of x remain
  b(L_third_loop_prologue);

  bind(L_done);
}
2533 
2534 /**
2535  * Emits code to update CRC-32 with a byte value according to constants in table
2536  *
2537  * @param [in,out]crc   Register containing the crc.
2538  * @param [in]val       Register containing the byte to fold into the CRC.
2539  * @param [in]table     Register containing the table of crc constants.
2540  *
2541  * uint32_t crc;
2542  * val = crc_table[(val ^ crc) & 0xFF];
2543  * crc = val ^ (crc >> 8);
2544  *
2545  */
// Fold one byte (in val) into crc via a single 256-entry table lookup.
// Clobbers val; see the block comment above for the C equivalent.
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  eor(val, val, crc);                               // val = val ^ crc
  andr(val, val, 0xff);                             // table index = low byte
  ldrw(val, Address(table, val, Address::lsl(2)));  // val = table[index] (juint entries)
  eor(crc, val, crc, Assembler::LSR, 8);            // crc = val ^ (crc >> 8)
}
2552 
2553 /**
2554  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
2555  *
2556  * @param [in,out]crc   Register containing the crc.
2557  * @param [in]v         Register containing the 32-bit to fold into the CRC.
2558  * @param [in]table0    Register containing table 0 of crc constants.
2559  * @param [in]table1    Register containing table 1 of crc constants.
2560  * @param [in]table2    Register containing table 2 of crc constants.
2561  * @param [in]table3    Register containing table 3 of crc constants.
2562  *
2563  * uint32_t crc;
2564  *   v = crc ^ v
2565  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
2566  *
2567  */
// Fold 32 bits of v into crc using the four slicing-by-4 tables.
// When 'upper' is true the high word of the 64-bit v is consumed,
// otherwise the low word.  Clobbers v and tmp.
void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
        Register table0, Register table1, Register table2, Register table3,
        bool upper) {
  // v = crc ^ (selected 32-bit half of v); LSL #0 selects the low word.
  eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
  uxtb(tmp, v);                                     // byte 0
  ldrw(crc, Address(table3, tmp, Address::lsl(2)));
  ubfx(tmp, v, 8, 8);                               // byte 1
  ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 16, 8);                              // byte 2
  ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
  ubfx(tmp, v, 24, 8);                              // byte 3 (most significant)
  ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
  eor(crc, crc, tmp);
}
2584 
2585 /**
2586  * @param crc   register containing existing CRC (32-bit)
2587  * @param buf   register pointing to input byte buffer (byte*)
2588  * @param len   register containing number of bytes
2589  * @param table register that will contain address of CRC table
2590  * @param tmp   scratch register
2591  */
// Compute CRC-32 over len bytes at buf, updating crc in place.
// Three strategies, chosen at code-generation time:
//   1. UseCRC32: hardware crc32b/w/x instructions, 64 bytes per iteration.
//   2. UseNeon:  pmull-based folding of 64-byte chunks, then table cleanup.
//   3. Fallback: slicing-by-4 table lookups (16/4/1 bytes per iteration).
// Clobbers buf, len, table0..3, tmp, tmp2, tmp3.
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
  unsigned long offset;

    // CRC-32 convention: pre-invert the crc (and post-invert at exit).
    ornw(crc, zr, crc);

  if (UseCRC32) {
    Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop;

      // Dispatch on length: 64+, 4..63, 1..3, or nothing left.
      subs(len, len, 64);
      br(Assembler::GE, CRC_by64_loop);
      adds(len, len, 64-4);
      br(Assembler::GE, CRC_by4_loop);
      adds(len, len, 4);
      br(Assembler::GT, CRC_by1_loop);
      b(L_exit);

    BIND(CRC_by4_loop);
      ldrw(tmp, Address(post(buf, 4)));
      subs(len, len, 4);
      crc32w(crc, crc, tmp);
      br(Assembler::GE, CRC_by4_loop);
      adds(len, len, 4);
      br(Assembler::LE, L_exit);
    BIND(CRC_by1_loop);
      ldrb(tmp, Address(post(buf, 1)));
      subs(len, len, 1);
      crc32b(crc, crc, tmp);
      br(Assembler::GT, CRC_by1_loop);
      b(L_exit);

      align(CodeEntryAlignment);
    BIND(CRC_by64_loop);
      // 64 bytes per iteration as four 16-byte load pairs.
      subs(len, len, 64);
      ldp(tmp, tmp3, Address(post(buf, 16)));
      crc32x(crc, crc, tmp);
      crc32x(crc, crc, tmp3);
      ldp(tmp, tmp3, Address(post(buf, 16)));
      crc32x(crc, crc, tmp);
      crc32x(crc, crc, tmp3);
      ldp(tmp, tmp3, Address(post(buf, 16)));
      crc32x(crc, crc, tmp);
      crc32x(crc, crc, tmp3);
      ldp(tmp, tmp3, Address(post(buf, 16)));
      crc32x(crc, crc, tmp);
      crc32x(crc, crc, tmp3);
      br(Assembler::GE, CRC_by64_loop);
      // Handle the sub-64-byte tail with the smaller loops.
      adds(len, len, 64-4);
      br(Assembler::GE, CRC_by4_loop);
      adds(len, len, 4);
      br(Assembler::GT, CRC_by1_loop);
    BIND(L_exit);
      ornw(crc, zr, crc);  // undo the pre-inversion
      return;
  }

    // Table-driven paths: table0..3 point at the four 256-entry
    // slicing-by-4 tables within the stub-allocated table blob.
    adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
    if (offset) add(table0, table0, offset);
    add(table1, table0, 1*256*sizeof(juint));
    add(table2, table0, 2*256*sizeof(juint));
    add(table3, table0, 3*256*sizeof(juint));

  if (UseNeon) {
      cmp(len, 64);
      br(Assembler::LT, L_by16);  // too short for the vector loop
      eor(v16, T16B, v16, v16);   // v16 = 0

    Label L_fold;

      add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants

      // Preload 32 bytes of input and the four folding constants.
      ld1(v0, v1, T2D, post(buf, 32));
      ld1r(v4, T2D, post(tmp, 8));
      ld1r(v5, T2D, post(tmp, 8));
      ld1r(v6, T2D, post(tmp, 8));
      ld1r(v7, T2D, post(tmp, 8));
      mov(v16, T4S, 0, crc);

      eor(v0, T16B, v0, v16);  // fold initial crc into the first lane
      sub(len, len, 64);

    BIND(L_fold);
      // Carry-less multiply v0 by the folding constants (low halves)...
      pmull(v22, T8H, v0, v5, T8B);
      pmull(v20, T8H, v0, v7, T8B);
      pmull(v23, T8H, v0, v4, T8B);
      pmull(v21, T8H, v0, v6, T8B);

      // ...and the high halves.
      pmull2(v18, T8H, v0, v5, T16B);
      pmull2(v16, T8H, v0, v7, T16B);
      pmull2(v19, T8H, v0, v4, T16B);
      pmull2(v17, T8H, v0, v6, T16B);

      // Recombine the byte-sliced partial products.
      uzp1(v24, v20, v22, T8H);
      uzp2(v25, v20, v22, T8H);
      eor(v20, T16B, v24, v25);

      uzp1(v26, v16, v18, T8H);
      uzp2(v27, v16, v18, T8H);
      eor(v16, T16B, v26, v27);

      ushll2(v22, T4S, v20, T8H, 8);
      ushll(v20, T4S, v20, T4H, 8);

      ushll2(v18, T4S, v16, T8H, 8);
      ushll(v16, T4S, v16, T4H, 8);

      eor(v22, T16B, v23, v22);
      eor(v18, T16B, v19, v18);
      eor(v20, T16B, v21, v20);
      eor(v16, T16B, v17, v16);

      uzp1(v17, v16, v20, T2D);
      uzp2(v21, v16, v20, T2D);
      eor(v17, T16B, v17, v21);

      ushll2(v20, T2D, v17, T4S, 16);
      ushll(v16, T2D, v17, T2S, 16);

      eor(v20, T16B, v20, v22);
      eor(v16, T16B, v16, v18);

      uzp1(v17, v20, v16, T2D);
      uzp2(v21, v20, v16, T2D);
      eor(v28, T16B, v17, v21);  // folded v0 -> v28

      // Same folding sequence for the second 16-byte register v1.
      pmull(v22, T8H, v1, v5, T8B);
      pmull(v20, T8H, v1, v7, T8B);
      pmull(v23, T8H, v1, v4, T8B);
      pmull(v21, T8H, v1, v6, T8B);

      pmull2(v18, T8H, v1, v5, T16B);
      pmull2(v16, T8H, v1, v7, T16B);
      pmull2(v19, T8H, v1, v4, T16B);
      pmull2(v17, T8H, v1, v6, T16B);

      // Load the next 32 input bytes while the multiplies complete.
      ld1(v0, v1, T2D, post(buf, 32));

      uzp1(v24, v20, v22, T8H);
      uzp2(v25, v20, v22, T8H);
      eor(v20, T16B, v24, v25);

      uzp1(v26, v16, v18, T8H);
      uzp2(v27, v16, v18, T8H);
      eor(v16, T16B, v26, v27);

      ushll2(v22, T4S, v20, T8H, 8);
      ushll(v20, T4S, v20, T4H, 8);

      ushll2(v18, T4S, v16, T8H, 8);
      ushll(v16, T4S, v16, T4H, 8);

      eor(v22, T16B, v23, v22);
      eor(v18, T16B, v19, v18);
      eor(v20, T16B, v21, v20);
      eor(v16, T16B, v17, v16);

      uzp1(v17, v16, v20, T2D);
      uzp2(v21, v16, v20, T2D);
      eor(v16, T16B, v17, v21);

      ushll2(v20, T2D, v16, T4S, 16);
      ushll(v16, T2D, v16, T2S, 16);

      eor(v20, T16B, v22, v20);
      eor(v16, T16B, v16, v18);

      uzp1(v17, v20, v16, T2D);
      uzp2(v21, v20, v16, T2D);
      eor(v20, T16B, v17, v21);  // folded v1 -> v20

      shl(v16, T2D, v28, 1);
      shl(v17, T2D, v20, 1);

      // XOR folded previous block into the freshly loaded data.
      eor(v0, T16B, v0, v16);
      eor(v1, T16B, v1, v17);

      subs(len, len, 32);
      br(Assembler::GE, L_fold);

      // Reduce the remaining 32 bytes in v0/v1 via the scalar tables.
      mov(crc, 0);
      mov(tmp, v0, T1D, 0);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v0, T1D, 1);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v1, T1D, 0);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
      mov(tmp, v1, T1D, 1);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
      update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);

      add(len, len, 32);  // restore the 32 bytes consumed above
  }

  BIND(L_by16);
    // Scalar table path: dispatch on remaining length.
    subs(len, len, 16);
    br(Assembler::GE, L_by16_loop);
    adds(len, len, 16-4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, L_by1_loop);
    b(L_exit);

  BIND(L_by4_loop);
    ldrw(tmp, Address(post(buf, 4)));
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
    subs(len, len, 4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::LE, L_exit);
  BIND(L_by1_loop);
    subs(len, len, 1);
    ldrb(tmp, Address(post(buf, 1)));
    update_byte_crc32(crc, tmp, table0);
    br(Assembler::GT, L_by1_loop);
    b(L_exit);

    align(CodeEntryAlignment);
  BIND(L_by16_loop);
    // 16 bytes per iteration: one load pair, four word folds.
    subs(len, len, 16);
    ldp(tmp, tmp3, Address(post(buf, 16)));
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
    update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
    update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
    br(Assembler::GE, L_by16_loop);
    adds(len, len, 16-4);
    br(Assembler::GE, L_by4_loop);
    adds(len, len, 4);
    br(Assembler::GT, L_by1_loop);
  BIND(L_exit);
    ornw(crc, zr, crc);  // undo the pre-inversion
}
2829 
2830 SkipIfEqual::SkipIfEqual(
2831     MacroAssembler* masm, const bool* flag_addr, bool value) {
2832   _masm = masm;
2833   unsigned long offset;
2834   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
2835   _masm->ldrb(rscratch1, Address(rscratch1, offset));
2836   _masm->cbzw(rscratch1, _label);
2837 }
2838 
// Bind the skip target: code emitted after this point is no longer
// guarded by the flag test emitted in the constructor.
SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}
2842 
2843 void MacroAssembler::cmpptr(Register src1, Address src2) {
2844   unsigned long offset;
2845   adrp(rscratch1, src2, offset);
2846   ldr(rscratch1, Address(rscratch1, offset));
2847   cmp(src1, rscratch1);
2848 }
2849 
void MacroAssembler::store_check(Register obj) {
  // Does a store check for the oop in register obj. The content of
  // register obj is destroyed afterwards.
  // Split into two parts so callers could schedule unrelated
  // instructions between the shift and the card store.
  store_check_part_1(obj);
  store_check_part_2(obj);
}
2856 
// Store check variant taking the destination address; dst is unused
// here — the card is derived from obj alone.  obj is destroyed.
void MacroAssembler::store_check(Register obj, Address dst) {
  store_check(obj);
}
2860 
2861 
// Split the store check operation so that other instructions can be scheduled in between.
// split the store check operation so that other instructions can be scheduled inbetween
// Part 1: turn the oop in obj into its card table index (destroys obj).
void MacroAssembler::store_check_part_1(Register obj) {
  BarrierSet* bs = Universe::heap()->barrier_set();
  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
  lsr(obj, obj, CardTableModRefBS::card_shift);
}
2868 
// Part 2: dirty the card whose index was computed by part 1 (in obj).
// Destroys obj and rscratch1.
void MacroAssembler::store_check_part_2(Register obj) {
  BarrierSet* bs = Universe::heap()->barrier_set();
  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
  CardTableModRefBS* ct = (CardTableModRefBS*)bs;
  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

  // The calculation for byte_map_base is as follows:
  // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
  // So this essentially converts an address to a displacement and
  // it will never need to be relocated.
  intptr_t disp = (intptr_t) ct->byte_map_base;
  mov(rscratch1, disp);
  add(rscratch1, rscratch1, obj);
  mov(obj, zr);
  // Zero the card with a store-release.  NOTE(review): stlrb gives
  // release ordering, presumably for concurrent card scanning —
  // confirm this is intended vs a plain strb.
  stlrb(obj, rscratch1);
}
2885 
// Load the klass pointer of the object in src into dst, decompressing
// it when compressed class pointers are enabled.
void MacroAssembler::load_klass(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));  // narrowKlass
    decode_klass_not_null(dst);
  } else {
    ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
  }
}
2894 
// Compare the klass of the object in 'oop' against trial_klass (an
// uncompressed Klass*), setting the condition flags.  Clobbers tmp.
// Fast paths avoid a full decode when the compressed-klass encoding
// permits a direct comparison.
void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
  if (UseCompressedClassPointers) {
    ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
    if (Universe::narrow_klass_base() == NULL) {
      // Zero base: decoding is just a shift, fold it into the compare.
      cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
      return;
    } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
               && Universe::narrow_klass_shift() == 0) {
      // Only the bottom 32 bits matter
      cmpw(trial_klass, tmp);
      return;
    }
    decode_klass_not_null(tmp);
  } else {
    ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
  }
  cmp(trial_klass, tmp);
}
2913 
// Load the prototype mark word of src's klass into dst (used by
// biased locking).  Clobbers dst.
void MacroAssembler::load_prototype_header(Register dst, Register src) {
  load_klass(dst, src);
  ldr(dst, Address(dst, Klass::prototype_header_offset()));
}
2918 
// Store the klass pointer in src into the object header at dst,
// compressing it first when compressed class pointers are enabled.
// Note: destroys src in the compressed case (encode is in place).
void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store release?  concurrent gcs assumes
  // klass length is valid if klass field is not null.
  if (UseCompressedClassPointers) {
    encode_klass_not_null(src);
    strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  } else {
    str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
  }
}
2929 
// Fill the 32-bit klass gap (present only with compressed class
// pointers) in the header of the object at dst with src.
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    // Store to klass gap in destination
    strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
  }
}
2936 
2937 // Algorithm must match oop.inline.hpp encode_heap_oop.
2938 void MacroAssembler::encode_heap_oop(Register d, Register s) {
2939 #ifdef ASSERT
2940   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
2941 #endif
2942   verify_oop(s, "broken oop in encode_heap_oop");
2943   if (Universe::narrow_oop_base() == NULL) {
2944     if (Universe::narrow_oop_shift() != 0) {
2945       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
2946       lsr(d, s, LogMinObjAlignmentInBytes);
2947     } else {
2948       mov(d, s);
2949     }
2950   } else {
2951     subs(d, s, rheapbase);
2952     csel(d, d, zr, Assembler::HS);
2953     lsr(d, d, LogMinObjAlignmentInBytes);
2954 
2955     /*  Old algorithm: is this any worse?
2956     Label nonnull;
2957     cbnz(r, nonnull);
2958     sub(r, r, rheapbase);
2959     bind(nonnull);
2960     lsr(r, r, LogMinObjAlignmentInBytes);
2961     */
2962   }
2963 }
2964 
// Compress the oop in r in place.  The oop must not be NULL (checked
// only under ASSERT+CheckCompressedOops), which lets us skip the
// NULL-preserving csel of encode_heap_oop.
void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(r, ok);
    stop("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop_not_null");
  if (Universe::narrow_oop_base() != NULL) {
    sub(r, r, rheapbase);
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    lsr(r, r, LogMinObjAlignmentInBytes);
  }
}
2984 
// Two-register variant: compress the non-NULL oop in src into dst,
// leaving src intact when dst != src.
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    cbnz(src, ok);
    stop("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop(src, "broken oop in encode_heap_oop_not_null2");

  // 'data' tracks where the partially-encoded value currently lives so
  // each step reads from the right register.
  Register data = src;
  if (Universe::narrow_oop_base() != NULL) {
    sub(dst, src, rheapbase);
    data = dst;
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    lsr(dst, data, LogMinObjAlignmentInBytes);
    data = dst;
  }
  // Identity encoding (no base, no shift): just copy.
  if (data == src)
    mov(dst, src);
}
3010 
// Decompress the (possibly 0 == NULL) narrow oop in s into d.
void  MacroAssembler::decode_heap_oop(Register d, Register s) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (Universe::narrow_oop_base() == NULL) {
    // Zero base: decoding is a shift; skip it entirely if it would be
    // a no-op move to the same register.
    if (Universe::narrow_oop_shift() != 0 || d != s) {
      lsl(d, s, Universe::narrow_oop_shift());
    }
  } else {
    // A zero narrow oop must stay NULL, so branch around the add.
    Label done;
    if (d != s)
      mov(d, s);
    cbz(s, done);
    add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
    bind(done);
  }
  verify_oop(d, "broken oop in decode_heap_oop");
}
3029 
// Decompress the non-NULL narrow oop in r in place (no NULL check
// needed, so no branch is emitted).
void  MacroAssembler::decode_heap_oop_not_null(Register r) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (Universe::narrow_oop_base() != NULL) {
      add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      // Base is zero: shifted-add against zr is just the shift.
      add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
  }
}
3047 
// Two-register variant: decompress the non-NULL narrow oop in src
// into dst, leaving src intact when dst != src.
void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (Universe::narrow_oop_base() != NULL) {
      add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    } else {
      add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
    // Identity encoding: plain register copy if needed.
    if (dst != src) {
      mov(dst, src);
    }
  }
}
3068 
// Compress the Klass* in src into dst.  Picks the cheapest encoding
// the current narrow-klass base/shift permits; the general case
// materializes the base in a register (borrowing rheapbase when
// dst == src, restoring it afterwards).
void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
  if (Universe::narrow_klass_base() == NULL) {
    // Zero base: shift only (or plain move).
    if (Universe::narrow_klass_shift() != 0) {
      assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
      lsr(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    return;
  }

  // Base chosen so that subtract can be replaced by XOR (the base bits
  // don't overlap the klass offset bits).
  if (use_XOR_for_compressed_class_base) {
    if (Universe::narrow_klass_shift() != 0) {
      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
      lsr(dst, dst, LogKlassAlignmentInBytes);
    } else {
      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
    }
    return;
  }

  // Base with zero low 32 bits and no shift: the encoded value is just
  // the low word of the pointer.
  if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
      && Universe::narrow_klass_shift() == 0) {
    movw(dst, src);
    return;
  }

#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
#endif

  // General case: subtract the materialized base, then shift.
  Register rbase = dst;
  if (dst == src) rbase = rheapbase;  // need a scratch; rheapbase is restored below
  mov(rbase, (uint64_t)Universe::narrow_klass_base());
  sub(dst, src, rbase);
  if (Universe::narrow_klass_shift() != 0) {
    assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
    lsr(dst, dst, LogKlassAlignmentInBytes);
  }
  if (dst == src) reinit_heapbase();
}
3110 
// In-place compression of the Klass* in r.
void MacroAssembler::encode_klass_not_null(Register r) {
  encode_klass_not_null(r, r);
}
3114 
// Decompress the narrowKlass in src into dst.  Mirrors the encoding
// strategies of encode_klass_not_null; the general case borrows
// rheapbase as scratch when dst == src and restores it afterwards.
void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
  Register rbase = dst;
  assert (UseCompressedClassPointers, "should only be used for compressed headers");

  if (Universe::narrow_klass_base() == NULL) {
    // Zero base: shift only (or plain move).
    if (Universe::narrow_klass_shift() != 0) {
      assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
      lsl(dst, src, LogKlassAlignmentInBytes);
    } else {
      if (dst != src) mov(dst, src);
    }
    return;
  }

  // XOR-compatible base: shift then XOR the base back in.
  if (use_XOR_for_compressed_class_base) {
    if (Universe::narrow_klass_shift() != 0) {
      lsl(dst, src, LogKlassAlignmentInBytes);
      eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
    } else {
      eor(dst, src, (uint64_t)Universe::narrow_klass_base());
    }
    return;
  }

  // Base with zero low 32 bits, no shift: rebuild the pointer by
  // inserting the base's high word above the 32-bit encoded value.
  if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
      && Universe::narrow_klass_shift() == 0) {
    if (dst != src)
      movw(dst, src);
    movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
    return;
  }

  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (dst == src) rbase = rheapbase;  // need a scratch; restored below
  mov(rbase, (uint64_t)Universe::narrow_klass_base());
  if (Universe::narrow_klass_shift() != 0) {
    assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
    add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
  } else {
    add(dst, rbase, src);
  }
  if (dst == src) reinit_heapbase();
}
3160 
// In-place decompression of the narrowKlass in r.
void  MacroAssembler::decode_klass_not_null(Register r) {
  decode_klass_not_null(r, r);
}
3164 
// Materialize a narrow-oop constant into dst.  Emits a movz/movk pair
// with placeholder immediates (0xDEAD/0xBEEF) that the oop relocation
// recorded here patches with the real compressed value.
void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed oops");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");

  int oop_index = oop_recorder()->find_index(obj);
  assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");

  InstructionMark im(this);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  code_section()->relocate(inst_mark(), rspec);
  movz(dst, 0xDEAD, 16);  // placeholder, patched via relocation
  movk(dst, 0xBEEF);
}
3179 
// Materialize the compressed form of Klass* k into dst.  Unlike
// set_narrow_oop, the encoded value is known now, so the real halves
// are emitted directly; the metadata relocation lets later patching
// find the instruction pair.
void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int index = oop_recorder()->find_index(k);
  assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");

  InstructionMark im(this);
  RelocationHolder rspec = metadata_Relocation::spec(index);
  code_section()->relocate(inst_mark(), rspec);
  narrowKlass nk = Klass::encode_klass(k);
  movz(dst, (nk >> 16), 16);  // high half first
  movk(dst, nk & 0xffff);     // then low half
}
3193 
3194 void MacroAssembler::load_heap_oop(Register dst, Address src)
3195 {
3196   if (UseCompressedOops) {
3197     ldrw(dst, src);
3198     decode_heap_oop(dst);
3199   } else {
3200     ldr(dst, src);
3201   }
3202 }
3203 
3204 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src)
3205 {
3206   if (UseCompressedOops) {
3207     ldrw(dst, src);
3208     decode_heap_oop_not_null(dst);
3209   } else {
3210     ldr(dst, src);
3211   }
3212 }
3213 
// Store the oop in src to dst, compressing it first when compressed
// oops are enabled.  Destroys src in the compressed case (encoding is
// in place), so dst must not use src as a base/index register.
void MacroAssembler::store_heap_oop(Address dst, Register src) {
  if (UseCompressedOops) {
    assert(!dst.uses(src), "not enough registers");
    encode_heap_oop(src);
    strw(src, dst);
  } else
    str(src, dst);
}
3222 
3223 // Used for storing NULLs.
3224 void MacroAssembler::store_heap_oop_null(Address dst) {
3225   if (UseCompressedOops) {
3226     strw(zr, dst);
3227   } else
3228     str(zr, dst);
3229 }
3230 
3231 #if INCLUDE_ALL_GCS
// G1 SATB (snapshot-at-the-beginning) pre-barrier.  While concurrent
// marking is active, records pre_val -- the oop about to be overwritten,
// loaded from [obj] when obj != noreg -- into the thread's SATB mark
// queue.  When the queue buffer is full it falls back to the runtime
// entry SharedRuntime::g1_wb_pre.  Clobbers tmp and rscratch1.
void MacroAssembler::g1_write_barrier_pre(Register obj,
                                          Register pre_val,
                                          Register thread,
                                          Register tmp,
                                          bool tosca_live,
                                          bool expand_call) {
  // If expand_call is true then we expand the call_VM_leaf macro
  // directly to skip generating the check by
  // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.

#ifdef _LP64
  assert(thread == rthread, "must be");
#endif // _LP64

  Label done;
  Label runtime;

  assert(pre_val != noreg, "check this code");

  if (obj != noreg)
    assert_different_registers(obj, pre_val, tmp);

  // Per-thread SATB queue fields: active flag, current index, buffer base.
  Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_active()));
  Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_index()));
  Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_buf()));


  // Is marking active?  The width of the active flag is build-dependent.
  if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
    ldrw(tmp, in_progress);
  } else {
    assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
    ldrb(tmp, in_progress);
  }
  cbzw(tmp, done);

  // Do we need to load the previous value?
  if (obj != noreg) {
    load_heap_oop(pre_val, Address(obj, 0));
  }

  // Is the previous value null?  NULLs need not be recorded.
  cbz(pre_val, done);

  // Can we store original value in the thread's buffer?
  // Is index == 0?
  // (The index field is typed as size_t.)

  ldr(tmp, index);                      // tmp := *index_adr
  cbz(tmp, runtime);                    // tmp == 0?
                                        // If yes, goto runtime

  // The queue fills downwards: decrement the index, then store pre_val
  // at buffer + new index.
  sub(tmp, tmp, wordSize);              // tmp := tmp - wordSize
  str(tmp, index);                      // *index_adr := tmp
  ldr(rscratch1, buffer);
  add(tmp, tmp, rscratch1);             // tmp := tmp + *buffer_adr

  // Record the previous value
  str(pre_val, Address(tmp, 0));
  b(done);

  bind(runtime);
  // save the live input values
  push(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);

  // Calling the runtime using the regular call_VM_leaf mechanism generates
  // code (generated by InterpreterMacroAssember::call_VM_leaf_base)
  // that checks that the *(rfp+frame::interpreter_frame_last_sp) == NULL.
  //
  // If we care generating the pre-barrier without a frame (e.g. in the
  // intrinsified Reference.get() routine) then ebp might be pointing to
  // the caller frame and so this check will most likely fail at runtime.
  //
  // Expanding the call directly bypasses the generation of the check.
  // So when we do not have have a full interpreter frame on the stack
  // expand_call should be passed true.

  if (expand_call) {
    LP64_ONLY( assert(pre_val != c_rarg1, "smashed arg"); )
    pass_arg1(this, thread);
    pass_arg0(this, pre_val);
    MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
  } else {
    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
  }

  // restore the live input values
  pop(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);

  bind(done);
}
3325 
3326 void MacroAssembler::g1_write_barrier_post(Register store_addr,
3327                                            Register new_val,
3328                                            Register thread,
3329                                            Register tmp,
3330                                            Register tmp2) {
3331 #ifdef _LP64
3332   assert(thread == rthread, "must be");
3333 #endif // _LP64
3334 
3335   Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
3336                                        PtrQueue::byte_offset_of_index()));
3337   Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
3338                                        PtrQueue::byte_offset_of_buf()));
3339 
3340   BarrierSet* bs = Universe::heap()->barrier_set();
3341   CardTableModRefBS* ct = (CardTableModRefBS*)bs;
3342   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
3343 
3344   Label done;
3345   Label runtime;
3346 
3347   // Does store cross heap regions?
3348 
3349   eor(tmp, store_addr, new_val);
3350   lsr(tmp, tmp, HeapRegion::LogOfHRGrainBytes);
3351   cbz(tmp, done);
3352 
3353   // crosses regions, storing NULL?
3354 
3355   cbz(new_val, done);
3356 
3357   // storing region crossing non-NULL, is card already dirty?
3358 
3359   ExternalAddress cardtable((address) ct->byte_map_base);
3360   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
3361   const Register card_addr = tmp;
3362 
3363   lsr(card_addr, store_addr, CardTableModRefBS::card_shift);
3364 
3365   unsigned long offset;
3366   adrp(tmp2, cardtable, offset);
3367 
3368   // get the address of the card
3369   add(card_addr, card_addr, tmp2);
3370   ldrb(tmp2, Address(card_addr, offset));
3371   cmpw(tmp2, (int)G1SATBCardTableModRefBS::g1_young_card_val());
3372   br(Assembler::EQ, done);
3373 
3374   assert((int)CardTableModRefBS::dirty_card_val() == 0, "must be 0");
3375 
3376   membar(Assembler::Assembler::StoreLoad);
3377 
3378   ldrb(tmp2, Address(card_addr, offset));
3379   cbzw(tmp2, done);
3380 
3381   // storing a region crossing, non-NULL oop, card is clean.
3382   // dirty card and log.
3383 
3384   strb(zr, Address(card_addr, offset));
3385 
3386   ldr(rscratch1, queue_index);
3387   cbz(rscratch1, runtime);
3388   sub(rscratch1, rscratch1, wordSize);
3389   str(rscratch1, queue_index);
3390 
3391   ldr(tmp2, buffer);
3392   str(card_addr, Address(tmp2, rscratch1));
3393   b(done);
3394 
3395   bind(runtime);
3396   // save the live input values
3397   push(store_addr->bit(true) | new_val->bit(true), sp);
3398   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
3399   pop(store_addr->bit(true) | new_val->bit(true), sp);
3400 
3401   bind(done);
3402 }
3403 
3404 #endif // INCLUDE_ALL_GCS
3405 
3406 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
3407   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
3408   int index = oop_recorder()->allocate_metadata_index(obj);
3409   RelocationHolder rspec = metadata_Relocation::spec(index);
3410   return Address((address)obj, rspec);
3411 }
3412 
// Move an oop into a register.  immediate is true if we want
// immediate instructions, i.e. we are not going to patch this
// instruction while the code is being executed by another thread.  In
// that case we can use move immediates rather than the constant pool.
void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
  int oop_index;
  if (obj == NULL) {
    // NULL still needs a recorder slot so the relocation stays valid.
    oop_index = oop_recorder()->allocate_oop_index(obj);
  } else {
    oop_index = oop_recorder()->find_index(obj);
    assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
  }
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  if (! immediate) {
    // Patchable path: load from the constant pool.  The target address is
    // a placeholder; ldr_constant only needs a nearby aligned address.
    address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
    ldr_constant(dst, Address(dummy, rspec));
  } else
    mov(dst, Address((address)obj, rspec));
}
3432 
3433 // Move a metadata address into a register.
3434 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
3435   int oop_index;
3436   if (obj == NULL) {
3437     oop_index = oop_recorder()->allocate_metadata_index(obj);
3438   } else {
3439     oop_index = oop_recorder()->find_index(obj);
3440   }
3441   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
3442   mov(dst, Address((address)obj, rspec));
3443 }
3444 
3445 Address MacroAssembler::constant_oop_address(jobject obj) {
3446   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
3447   assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
3448   int oop_index = oop_recorder()->find_index(obj);
3449   return Address((address)obj, oop_Relocation::spec(oop_index));
3450 }
3451 
// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
// Fast-path TLAB allocation: obj := current tlab top; branch to slow_case
// if top + size would exceed the tlab end, otherwise bump the top pointer.
// The size is either the constant con_size_in_bytes (var_size_in_bytes ==
// noreg) or the value in var_size_in_bytes.  Clobbers t2 and rscratch1.
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  assert_different_registers(obj, t2);
  assert_different_registers(obj, var_size_in_bytes);
  Register end = t2;

  // verify_tlab();

  ldr(obj, Address(rthread, JavaThread::tlab_top_offset()));
  // end := obj + size (constant or register form).
  if (var_size_in_bytes == noreg) {
    lea(end, Address(obj, con_size_in_bytes));
  } else {
    lea(end, Address(obj, var_size_in_bytes));
  }
  ldr(rscratch1, Address(rthread, JavaThread::tlab_end_offset()));
  cmp(end, rscratch1);
  br(Assembler::HI, slow_case);

  // update the tlab top pointer
  str(end, Address(rthread, JavaThread::tlab_top_offset()));

  // recover var_size_in_bytes if necessary (caller allowed t2 aliasing).
  if (var_size_in_bytes == end) {
    sub(var_size_in_bytes, var_size_in_bytes, obj);
  }
  // verify_tlab();
}
3484 
// Preserves r19, and r3.
// Slow-path TLAB refill.  Decides whether to keep the current TLAB (and
// allocate this object directly in eden, branching to try_eden) or to
// discard it: the old TLAB remainder is filled with a dummy int array so
// the heap stays parsable, then a fresh TLAB is carved out of eden and
// control returns to retry.  Branches to slow_case when inline contiguous
// allocation is unavailable or eden allocation fails.
// Uses r0 (top), r2 (t1), r4 (t2) plus rscratch1; returns rthread.
Register MacroAssembler::tlab_refill(Label& retry,
                                     Label& try_eden,
                                     Label& slow_case) {
  Register top = r0;
  Register t1  = r2;
  Register t2  = r4;
  assert_different_registers(top, rthread, t1, t2, /* preserve: */ r19, r3);
  Label do_refill, discard_tlab;

  if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
    // No allocation in the shared eden.
    b(slow_case);
  }

  ldr(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
  ldr(t1,  Address(rthread, in_bytes(JavaThread::tlab_end_offset())));

  // calculate amount of free space (in heap words)
  sub(t1, t1, top);
  lsr(t1, t1, LogHeapWordSize);

  // Retain tlab and allocate object in shared space if
  // the amount free in the tlab is too large to discard.

  ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
  cmp(t1, rscratch1);
  br(Assembler::LE, discard_tlab);

  // Retain: bump the refill waste limit so repeated retention becomes
  // progressively less likely.
  // ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
  mov(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
  add(rscratch1, rscratch1, t2);
  str(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));

  if (TLABStats) {
    // increment number of slow_allocations
    addmw(Address(rthread, in_bytes(JavaThread::tlab_slow_allocations_offset())),
         1, rscratch1);
  }
  b(try_eden);

  bind(discard_tlab);
  if (TLABStats) {
    // increment number of refills
    addmw(Address(rthread, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1,
         rscratch1);
    // accumulate wastage -- t1 is amount free in tlab
    addmw(Address(rthread, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1,
         rscratch1);
  }

  // if tlab is currently allocated (top or end != null) then
  // fill [top, end + alignment_reserve) with array object
  cbz(top, do_refill);

  // set up the mark word
  mov(rscratch1, (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
  str(rscratch1, Address(top, oopDesc::mark_offset_in_bytes()));
  // set the length to the remaining space
  sub(t1, t1, typeArrayOopDesc::header_size(T_INT));
  add(t1, t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
  // convert remaining heap words to a jint element count
  lsl(t1, t1, log2_intptr(HeapWordSize/sizeof(jint)));
  strw(t1, Address(top, arrayOopDesc::length_offset_in_bytes()));
  // set klass to intArrayKlass
  {
    unsigned long offset;
    // dubious reloc why not an oop reloc?
    adrp(rscratch1, ExternalAddress((address)Universe::intArrayKlassObj_addr()),
         offset);
    ldr(t1, Address(rscratch1, offset));
  }
  // store klass last.  concurrent gcs assumes klass length is valid if
  // klass field is not null.
  store_klass(top, t1);

  // account the filler object as allocated bytes (top - tlab_start)
  mov(t1, top);
  ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
  sub(t1, t1, rscratch1);
  incr_allocated_bytes(rthread, t1, 0, rscratch1);

  // refill the tlab with an eden allocation
  bind(do_refill);
  ldr(t1, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
  lsl(t1, t1, LogHeapWordSize);
  // allocate new tlab, address returned in top
  eden_allocate(top, t1, 0, t2, slow_case);

  // Check that t1 was preserved in eden_allocate.
#ifdef ASSERT
  if (UseTLAB) {
    Label ok;
    Register tsize = r4;
    assert_different_registers(tsize, rthread, t1);
    str(tsize, Address(pre(sp, -16)));
    ldr(tsize, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
    lsl(tsize, tsize, LogHeapWordSize);
    cmp(t1, tsize);
    br(Assembler::EQ, ok);
    STOP("assert(t1 != tlab size)");
    should_not_reach_here();

    bind(ok);
    ldr(tsize, Address(post(sp, 16)));
  }
#endif
  // Install the new TLAB: start = top, top = start, end = start + size
  // minus the alignment reserve kept for the eventual filler array.
  str(top, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
  str(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
  add(top, top, t1);
  sub(top, top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
  str(top, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
  verify_tlab();
  b(retry);

  return rthread; // for use by caller
}
3601 
3602 // Defines obj, preserves var_size_in_bytes
3603 void MacroAssembler::eden_allocate(Register obj,
3604                                    Register var_size_in_bytes,
3605                                    int con_size_in_bytes,
3606                                    Register t1,
3607                                    Label& slow_case) {
3608   assert_different_registers(obj, var_size_in_bytes, t1);
3609   if (CMSIncrementalMode || !Universe::heap()->supports_inline_contig_alloc()) {
3610     b(slow_case);
3611   } else {
3612     Register end = t1;
3613     Register heap_end = rscratch2;
3614     Label retry;
3615     bind(retry);
3616     {
3617       unsigned long offset;
3618       adrp(rscratch1, ExternalAddress((address) Universe::heap()->end_addr()), offset);
3619       ldr(heap_end, Address(rscratch1, offset));
3620     }
3621 
3622     ExternalAddress heap_top((address) Universe::heap()->top_addr());
3623 
3624     // Get the current top of the heap
3625     {
3626       unsigned long offset;
3627       adrp(rscratch1, heap_top, offset);
3628       // Use add() here after ARDP, rather than lea().
3629       // lea() does not generate anything if its offset is zero.
3630       // However, relocs expect to find either an ADD or a load/store
3631       // insn after an ADRP.  add() always generates an ADD insn, even
3632       // for add(Rn, Rn, 0).
3633       add(rscratch1, rscratch1, offset);
3634       ldaxr(obj, rscratch1);
3635     }
3636 
3637     // Adjust it my the size of our new object
3638     if (var_size_in_bytes == noreg) {
3639       lea(end, Address(obj, con_size_in_bytes));
3640     } else {
3641       lea(end, Address(obj, var_size_in_bytes));
3642     }
3643 
3644     // if end < obj then we wrapped around high memory
3645     cmp(end, obj);
3646     br(Assembler::LO, slow_case);
3647 
3648     cmp(end, heap_end);
3649     br(Assembler::HI, slow_case);
3650 
3651     // If heap_top hasn't been changed by some other thread, update it.
3652     stlxr(rscratch1, end, rscratch1);
3653     cbnzw(rscratch1, retry);
3654   }
3655 }
3656 
// Debug-only sanity check of the current thread's TLAB invariants:
// tlab_start <= tlab_top <= tlab_end.  Emits nothing in product builds
// or when UseTLAB/VerifyOops are off.  Saves and restores the scratch
// registers it uses on the stack.
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;

    stp(rscratch2, rscratch1, Address(pre(sp, -16)));

    // Check top >= start.
    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    // Check end >= top.
    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    cmp(rscratch2, rscratch1);
    br(Assembler::HS, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    ldp(rscratch2, rscratch1, Address(post(sp, 16)));
  }
#endif
}
3684 
// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp.
// size holds the total number of bytes to bang (it is consumed by the
// loop); rscratch1 is also clobbered.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  assert_different_registers(tmp, size, rscratch1);
  mov(tmp, sp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  mov(rscratch1, os::vm_page_size());
  bind(loop);
  lea(tmp, Address(tmp, -os::vm_page_size()));
  subsw(size, size, rscratch1);
  str(size, Address(tmp));
  br(Assembler::GT, loop);

  // Bang down shadow pages too.
  // The -1 because we already subtracted 1 page.
  for (int i = 0; i< StackShadowPages-1; i++) {
    // this could be any sized move but this is can be a debugging crumb
    // so the bigger the better.
    lea(tmp, Address(tmp, -os::vm_page_size()));
    str(size, Address(tmp));
  }
}
3710 
3711 
// Load the polling page address into r (via adrp + page offset) and emit
// a relocated read of it (ldrw into zr); a segfault on the protected page
// is how the VM interrupts running code at safepoints.  Returns the
// address of the load instruction.
address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
  unsigned long off;
  adrp(r, Address(page, rtype), off);
  InstructionMark im(this);
  // The relocation is attached to the ldrw below, not the adrp.
  code_section()->relocate(inst_mark(), rtype);
  ldrw(zr, Address(r, off));
  return inst_mark();
}
3720 
// Emit a relocated safepoint-poll read through r, which already holds the
// polling page address.  Returns the address of the load instruction.
address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
  InstructionMark im(this);
  code_section()->relocate(inst_mark(), rtype);
  ldrw(zr, Address(r, 0));
  return inst_mark();
}
3727 
// Form the page address of dest in reg1 and return the remaining
// low-12-bit byte offset through byte_offset, for use in a following
// load/store or add.  When dest is out of ADRP's +/-4GB range, falls back
// to a full mov of the address (byte_offset = 0); that is only legal for
// fixed (non-patchable) addresses, as the guarantee checks.
void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
  relocInfo::relocType rtype = dest.rspec().reloc()->type();
  if (uabs(pc() - dest.target()) >= (1LL << 32)) {
    guarantee(rtype == relocInfo::none
              || rtype == relocInfo::external_word_type
              || rtype == relocInfo::poll_type
              || rtype == relocInfo::poll_return_type,
              "can only use a fixed address with an ADRP");
    // Out of range.  This doesn't happen very often, but we have to
    // handle it
    mov(reg1, dest);
    byte_offset = 0;
  } else {
    InstructionMark im(this);
    code_section()->relocate(inst_mark(), dest.rspec());
    byte_offset = (uint64_t)dest.target() & 0xfff;
    _adrp(reg1, dest.target());
  }
}
3747 
3748   bool MacroAssembler::use_acq_rel_for_volatile_fields() {
3749 #ifdef PRODUCT
3750     return false;
3751 #else
3752     return UseAcqRelForVolatileFields;
3753 #endif
3754   }
3755 
// Build a stack frame of framesize bytes, saving rfp and lr at the top.
// Three emission strategies depending on whether framesize fits the
// signed-9-bit pre-index range, the 12-bit immediate range, or needs a
// scratch register.
void MacroAssembler::build_frame(int framesize) {
  if (framesize == 0) {
    // Is this even possible?
    stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
  } else if (framesize < ((1 << 9) + 2 * wordSize)) {
    // Small frame: drop sp once, then store rfp/lr at the frame top.
    sub(sp, sp, framesize);
    stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
  } else {
    // Large frame: save rfp/lr first, then extend sp below them.
    stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
    if (framesize < ((1 << 12) + 2 * wordSize))
      sub(sp, sp, framesize - 2 * wordSize);
    else {
      // Immediate too large for a single sub; go through rscratch1.
      mov(rscratch1, framesize - 2 * wordSize);
      sub(sp, sp, rscratch1);
    }
  }
}
3773 
// Tear down a frame built by build_frame(framesize), restoring rfp and
// lr and popping framesize bytes; mirrors build_frame's three cases.
void MacroAssembler::remove_frame(int framesize) {
  if (framesize == 0) {
    ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
  } else if (framesize < ((1 << 9) + 2 * wordSize)) {
    // Small frame: reload rfp/lr from the frame top, then pop it all.
    ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
    add(sp, sp, framesize);
  } else {
    // Large frame: pop the body first, then restore rfp/lr.
    if (framesize < ((1 << 12) + 2 * wordSize))
      add(sp, sp, framesize - 2 * wordSize);
    else {
      // Immediate too large for a single add; go through rscratch1.
      mov(rscratch1, framesize - 2 * wordSize);
      add(sp, sp, rscratch1);
    }
    ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
  }
}
3790 
// Search for str1 in str2 and return index or -1
// str2/cnt2 hold the source string and its char count, str1/cnt1 the
// pattern.  icnt1 is the pattern length when known at compile time, or
// -1 for the general case.  The result (first match index, or -1) is
// written to the result register.  Strings are UTF-16 (2 bytes/char).
void MacroAssembler::string_indexof(Register str2, Register str1,
                                    Register cnt2, Register cnt1,
                                    Register tmp1, Register tmp2,
                                    Register tmp3, Register tmp4,
                                    int icnt1, Register result) {
  Label BM, LINEARSEARCH, DONE, NOMATCH, MATCH;

  Register ch1 = rscratch1;
  Register ch2 = rscratch2;
  Register cnt1tmp = tmp1;
  Register cnt2tmp = tmp2;
  Register cnt1_neg = cnt1;
  Register cnt2_neg = cnt2;
  Register result_tmp = tmp4;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;

// We have two strings, a source string in str2, cnt2 and a pattern string
// in str1, cnt1. Find the 1st occurrence of pattern in source or return -1.

// For larger pattern and source we use a simplified Boyer Moore algorithm.
// With a small pattern and source we use linear scan.

  if (icnt1 == -1) {
    // Runtime dispatch between Boyer-Moore and linear scan.
    cmp(cnt1, 256);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
    ccmp(cnt1, 8, 0b0000, LO);  // Can't handle skip >= 256 because we use
    br(LO, LINEARSEARCH);       // a byte array.
    cmp(cnt1, cnt2, LSR, 2);    // Source must be 4 * pattern for BM
    br(HS, LINEARSEARCH);
  }

// The Boyer Moore algorithm is based on the description here:-
//
// http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
//
// This describes an algorithm with 2 shift rules. The 'Bad Character' rule
// and the 'Good Suffix' rule.
//
// These rules are essentially heuristics for how far we can shift the
// pattern along the search string.
//
// The implementation here uses the 'Bad Character' rule only because of the
// complexity of initialisation for the 'Good Suffix' rule.
//
// This is also known as the Boyer-Moore-Horspool algorithm:-
//
// http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
//
// #define ASIZE 128
//
//    int bm(unsigned char *x, int m, unsigned char *y, int n) {
//       int i, j;
//       unsigned c;
//       unsigned char bc[ASIZE];
//
//       /* Preprocessing */
//       for (i = 0; i < ASIZE; ++i)
//          bc[i] = 0;
//       for (i = 0; i < m - 1; ) {
//          c = x[i];
//          ++i;
//          if (c < ASIZE) bc[c] = i;
//       }
//
//       /* Searching */
//       j = 0;
//       while (j <= n - m) {
//          c = y[i+j];
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
//          if (c < ASIZE)
//            j = j - bc[y[j+m-1]] + m;
//          else
//            j += 1; // Advance by 1 only if char >= ASIZE
//       }
//    }

  if (icnt1 == -1) {
    BIND(BM);

    Label ZLOOP, BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP;
    Label BMADV, BMMATCH, BMCHECKEND;

    Register cnt1end = tmp2;
    Register str2end = cnt2;
    Register skipch = tmp2;

    // Restrict ASIZE to 128 to reduce stack space/initialisation.
    // The presence of chars >= ASIZE in the target string does not affect
    // performance, but we must be careful not to initialise them in the stack
    // array.
    // The presence of chars >= ASIZE in the source string may adversely affect
    // performance since we can only advance by one when we encounter one.

      // Zero the 128-byte bad-character table allocated on the stack.
      stp(zr, zr, pre(sp, -128));
      for (int i = 1; i < 8; i++)
          stp(zr, zr, Address(sp, i*16));

      // Preprocessing loop: bc[c] = index+1 of last occurrence of c in
      // pattern[0 .. cnt1-2], for chars < 128.
      mov(cnt1tmp, 0);
      sub(cnt1end, cnt1, 1);
    BIND(BCLOOP);
      ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
      cmp(ch1, 128);
      add(cnt1tmp, cnt1tmp, 1);
      br(HS, BCSKIP);
      strb(cnt1tmp, Address(sp, ch1));
    BIND(BCSKIP);
      cmp(cnt1tmp, cnt1end);
      br(LT, BCLOOP);

      mov(result_tmp, str2);

      sub(cnt2, cnt2, cnt1);
      add(str2end, str2, cnt2, LSL, 1);
    BIND(BMLOOPSTR2);
      // Compare last pattern char first; skipch drives the shift below.
      sub(cnt1tmp, cnt1, 1);
      ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
      ldrh(skipch, Address(str2, cnt1tmp, Address::lsl(1)));
      cmp(ch1, skipch);
      br(NE, BMSKIP);
      subs(cnt1tmp, cnt1tmp, 1);
      br(LT, BMMATCH);
    BIND(BMLOOPSTR1);
      // Compare remaining pattern chars right-to-left.
      ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
      ldrh(ch2, Address(str2, cnt1tmp, Address::lsl(1)));
      cmp(ch1, ch2);
      br(NE, BMSKIP);
      subs(cnt1tmp, cnt1tmp, 1);
      br(GE, BMLOOPSTR1);
    BIND(BMMATCH);
      // Match found: result = (current window start - source start) / 2.
      sub(result_tmp, str2, result_tmp);
      lsr(result, result_tmp, 1);
      add(sp, sp, 128);
      b(DONE);
    BIND(BMADV);
      // Mismatch char >= ASIZE: can only advance by one char.
      add(str2, str2, 2);
      b(BMCHECKEND);
    BIND(BMSKIP);
      cmp(skipch, 128);
      br(HS, BMADV);
      // Bad-character shift: advance by cnt1 - bc[skipch] chars.
      ldrb(ch2, Address(sp, skipch));
      add(str2, str2, cnt1, LSL, 1);
      sub(str2, str2, ch2, LSL, 1);
    BIND(BMCHECKEND);
      cmp(str2, str2end);
      br(LE, BMLOOPSTR2);
      // No match; release the bad-character table.
      add(sp, sp, 128);
      b(NOMATCH);
  }

  BIND(LINEARSEARCH);
  {
    Label DO1, DO2, DO3;

    Register str2tmp = tmp2;
    Register first = tmp3;

    if (icnt1 == -1)
    {
        Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT, LAST_WORD;

        // Patterns shorter than 4 chars get dedicated loops below.
        cmp(cnt1, 4);
        br(LT, DOSHORT);

        // Point str1/str2 at their last 4-char word and use negative
        // byte offsets counting up towards zero.
        sub(cnt2, cnt2, cnt1);
        sub(cnt1, cnt1, 4);
        mov(result_tmp, cnt2);

        lea(str1, Address(str1, cnt1, Address::uxtw(1)));
        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
        sub(cnt1_neg, zr, cnt1, LSL, 1);
        sub(cnt2_neg, zr, cnt2, LSL, 1);
        ldr(first, Address(str1, cnt1_neg));

      BIND(FIRST_LOOP);
        // Scan for the pattern's first 4 chars (one 64-bit compare).
        ldr(ch2, Address(str2, cnt2_neg));
        cmp(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, 2);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        // First word matched; compare the rest of the pattern a word
        // at a time.
        adds(cnt1tmp, cnt1_neg, 8);
        add(cnt2tmp, cnt2_neg, 8);
        br(GE, LAST_WORD);

      BIND(STR1_NEXT);
        ldr(ch1, Address(str1, cnt1tmp));
        ldr(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        adds(cnt1tmp, cnt1tmp, 8);
        add(cnt2tmp, cnt2tmp, 8);
        br(LT, STR1_NEXT);

      BIND(LAST_WORD);
        // Final (possibly overlapping) word of the pattern.
        ldr(ch1, Address(str1));
        sub(str2tmp, str2, cnt1_neg);         // adjust to corresponding
        ldr(ch2, Address(str2tmp, cnt2_neg)); // word in str2
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);

      BIND(DOSHORT);
        cmp(cnt1, 2);
        br(LT, DO1);
        br(GT, DO3);
    }

    if (icnt1 == 4) {
      // Fixed 4-char pattern: single 64-bit compare per position.
      Label CH1_LOOP;

        ldr(ch1, str1);
        sub(cnt2, cnt2, 4);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
        sub(cnt2_neg, zr, cnt2, LSL, 1);

      BIND(CH1_LOOP);
        ldr(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, 2);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if (icnt1 == -1 || icnt1 == 2) {
      // 2-char pattern: single 32-bit compare per position.
      Label CH1_LOOP;

      BIND(DO2);
        ldrw(ch1, str1);
        sub(cnt2, cnt2, 2);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
        sub(cnt2_neg, zr, cnt2, LSL, 1);

      BIND(CH1_LOOP);
        ldrw(ch2, Address(str2, cnt2_neg));
        cmp(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, 2);
        br(LE, CH1_LOOP);
        b(NOMATCH);
    }

    if (icnt1 == -1 || icnt1 == 3) {
      // 3-char pattern: 32-bit compare on the first two chars, then a
      // halfword compare on the third.
      Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;

      BIND(DO3);
        ldrw(first, str1);
        ldrh(ch1, Address(str1, 4));

        sub(cnt2, cnt2, 3);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
        sub(cnt2_neg, zr, cnt2, LSL, 1);

      BIND(FIRST_LOOP);
        ldrw(ch2, Address(str2, cnt2_neg));
        cmpw(first, ch2);
        br(EQ, STR1_LOOP);
      BIND(STR2_NEXT);
        adds(cnt2_neg, cnt2_neg, 2);
        br(LE, FIRST_LOOP);
        b(NOMATCH);

      BIND(STR1_LOOP);
        add(cnt2tmp, cnt2_neg, 4);
        ldrh(ch2, Address(str2, cnt2tmp));
        cmp(ch1, ch2);
        br(NE, STR2_NEXT);
        b(MATCH);
    }

    if (icnt1 == -1 || icnt1 == 1) {
      // 1-char pattern: SWAR scan, 4 chars per 64-bit load.
      Label CH1_LOOP, HAS_ZERO;
      Label DO1_SHORT, DO1_LOOP;

      BIND(DO1);
        ldrh(ch1, str1);
        cmp(cnt2, 4);
        br(LT, DO1_SHORT);

        // Replicate the search char into all four halfwords.
        orr(ch1, ch1, ch1, LSL, 16);
        orr(ch1, ch1, ch1, LSL, 32);

        sub(cnt2, cnt2, 4);
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
        sub(cnt2_neg, zr, cnt2, LSL, 1);

        mov(tmp3, 0x0001000100010001);
      BIND(CH1_LOOP);
        // XOR makes matching halfwords zero; the classic has-zero trick
        // (x - 0x0001...) & ~(x | 0x7fff...) flags them.
        ldr(ch2, Address(str2, cnt2_neg));
        eor(ch2, ch1, ch2);
        sub(tmp1, ch2, tmp3);
        orr(tmp2, ch2, 0x7fff7fff7fff7fff);
        bics(tmp1, tmp1, tmp2);
        br(NE, HAS_ZERO);
        adds(cnt2_neg, cnt2_neg, 8);
        br(LT, CH1_LOOP);

        // Handle the final (possibly overlapping) word.
        cmp(cnt2_neg, 8);
        mov(cnt2_neg, 0);
        br(LT, CH1_LOOP);
        b(NOMATCH);

      BIND(HAS_ZERO);
        // Locate the first matching halfword within the word.
        rev(tmp1, tmp1);
        clz(tmp1, tmp1);
        add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
        b(MATCH);

      BIND(DO1_SHORT);
        // Fewer than 4 source chars: simple halfword-at-a-time scan.
        mov(result_tmp, cnt2);
        lea(str2, Address(str2, cnt2, Address::uxtw(1)));
        sub(cnt2_neg, zr, cnt2, LSL, 1);
      BIND(DO1_LOOP);
        ldrh(ch2, Address(str2, cnt2_neg));
        cmpw(ch1, ch2);
        br(EQ, MATCH);
        adds(cnt2_neg, cnt2_neg, 2);
        br(LT, DO1_LOOP);
    }
  }
  BIND(NOMATCH);
    mov(result, -1);
    b(DONE);
  BIND(MATCH);
    // Convert the (negative) byte offset back to a char index.
    add(result, result_tmp, cnt2_neg, ASR, 1);
  BIND(DONE);
}
4130 
// Compare strings.
// Lexicographic comparison of two UTF-16 strings (as for Java's
// String.compareTo): str1/str2 point at the first char of each string
// and cnt1/cnt2 hold the char counts.  On exit, result is negative,
// zero or positive as str1 is less than, equal to or greater than str2.
// Clobbers: str1, str2, cnt1, cnt2, tmp1, rscratch2 and the flags.
void MacroAssembler::string_compare(Register str1, Register str2,
                                    Register cnt1, Register cnt2, Register result,
                                    Register tmp1) {
  Label LENGTH_DIFF, DONE, SHORT_LOOP, SHORT_STRING,
    NEXT_WORD, DIFFERENCE;

  BLOCK_COMMENT("string_compare {");

  // Compute the minimum of the string lengths and save the difference.
  subsw(tmp1, cnt1, cnt2);                // tmp1 = cnt1 - cnt2: the final result
                                          // if one string is a prefix of the other
  cselw(cnt2, cnt1, cnt2, Assembler::LE); // min

  // A very short string
  cmpw(cnt2, 4);
  br(Assembler::LT, SHORT_STRING);

  // Check if the strings start at the same location.
  cmp(str1, str2);
  br(Assembler::EQ, LENGTH_DIFF);

  // Compare longwords
  {
    subw(cnt2, cnt2, 4); // The last longword is a special case

    // Move both string pointers to the last longword of their
    // strings, negate the remaining count, and convert it to bytes.
    lea(str1, Address(str1, cnt2, Address::uxtw(1)));
    lea(str2, Address(str2, cnt2, Address::uxtw(1)));
    sub(cnt2, zr, cnt2, LSL, 1);   // cnt2 = negative byte offset back from
                                   // the last longword to the current one

    // Loop, loading longwords and comparing them into rscratch2.
    bind(NEXT_WORD);
    ldr(result, Address(str1, cnt2));   // 4 chars from str1
    ldr(cnt1, Address(str2, cnt2));     // 4 chars from str2 (cnt1 reused as temp)
    adds(cnt2, cnt2, wordSize);
    eor(rscratch2, result, cnt1);       // non-zero iff the longwords differ
    cbnz(rscratch2, DIFFERENCE);
    br(Assembler::LT, NEXT_WORD);       // loop while the byte offset is negative

    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.

    ldr(result, Address(str1));
    ldr(cnt1, Address(str2));
    eor(rscratch2, result, cnt1);
    cbz(rscratch2, LENGTH_DIFF);

    // Find the first different characters in the longwords and
    // compute their difference.
    bind(DIFFERENCE);
    rev(rscratch2, rscratch2);        // byte-reverse so the lowest-addressed
                                      // difference becomes the most significant
    clz(rscratch2, rscratch2);        // bit distance to the first differing byte
    andr(rscratch2, rscratch2, -16);  // round down to a 16-bit char boundary:
                                      // shift amount for the differing char
    lsrv(result, result, rscratch2);  // extract that char from str1's word...
    uxthw(result, result);
    lsrv(cnt1, cnt1, rscratch2);      // ...and from str2's word
    uxthw(cnt1, cnt1);
    subw(result, result, cnt1);       // result = char1 - char2
    b(DONE);
  }

  bind(SHORT_STRING);
  // Is the minimum length zero?
  cbz(cnt2, LENGTH_DIFF);

  bind(SHORT_LOOP);
  // Compare one char at a time, post-incrementing both pointers.
  load_unsigned_short(result, Address(post(str1, 2)));
  load_unsigned_short(cnt1, Address(post(str2, 2)));
  subw(result, result, cnt1);
  cbnz(result, DONE);               // chars differ: result already holds the difference
  sub(cnt2, cnt2, 1);
  cbnz(cnt2, SHORT_LOOP);

  // Strings are equal up to min length.  Return the length difference.
  bind(LENGTH_DIFF);
  mov(result, tmp1);

  // That's it
  bind(DONE);

  BLOCK_COMMENT("} string_compare");
}
4215 
4216 
// Set result to true (1) iff the cnt chars at str1 equal the cnt chars
// at str2, otherwise false (0).  str1/str2 point at the first char of
// each string; both strings are read for exactly cnt chars.
// Clobbers: str1, str2, cnt, tmp1, rscratch1, rscratch2 and the flags.
void MacroAssembler::string_equals(Register str1, Register str2,
                                   Register cnt, Register result,
                                   Register tmp1) {
  Label SAME_CHARS, DONE, SHORT_LOOP, SHORT_STRING,
    NEXT_WORD;

  const Register tmp2 = rscratch1;
  assert_different_registers(str1, str2, cnt, result, tmp1, tmp2, rscratch2);

  BLOCK_COMMENT("string_equals {");

  // Start by assuming that the strings are not equal.
  mov(result, zr);

  // A very short string
  cmpw(cnt, 4);
  br(Assembler::LT, SHORT_STRING);

  // Check if the strings start at the same location.
  cmp(str1, str2);
  br(Assembler::EQ, SAME_CHARS);

  // Compare longwords
  {
    subw(cnt, cnt, 4); // The last longword is a special case

    // Move both string pointers to the last longword of their
    // strings, negate the remaining count, and convert it to bytes.
    lea(str1, Address(str1, cnt, Address::uxtw(1)));
    lea(str2, Address(str2, cnt, Address::uxtw(1)));
    sub(cnt, zr, cnt, LSL, 1);   // cnt = negative byte offset back from
                                 // the last longword to the current one

    // Loop, loading longwords and comparing them into rscratch2.
    bind(NEXT_WORD);
    ldr(tmp1, Address(str1, cnt));   // 4 chars from str1
    ldr(tmp2, Address(str2, cnt));   // 4 chars from str2
    adds(cnt, cnt, wordSize);
    eor(rscratch2, tmp1, tmp2);      // non-zero iff the longwords differ
    cbnz(rscratch2, DONE);           // differ: result is already false
    br(Assembler::LT, NEXT_WORD);    // loop while the byte offset is negative

    // Last longword.  In the case where length == 4 we compare the
    // same longword twice, but that's still faster than another
    // conditional branch.

    ldr(tmp1, Address(str1));
    ldr(tmp2, Address(str2));
    eor(rscratch2, tmp1, tmp2);
    cbz(rscratch2, SAME_CHARS);
    b(DONE);                         // differ: result is already false
  }

  bind(SHORT_STRING);
  // Is the length zero?
  cbz(cnt, SAME_CHARS);

  bind(SHORT_LOOP);
  // Compare one char at a time, post-incrementing both pointers.
  load_unsigned_short(tmp1, Address(post(str1, 2)));
  load_unsigned_short(tmp2, Address(post(str2, 2)));
  subw(tmp1, tmp1, tmp2);
  cbnz(tmp1, DONE);                // differ: result is already false
  sub(cnt, cnt, 1);
  cbnz(cnt, SHORT_LOOP);

  // Strings are equal.
  bind(SAME_CHARS);
  mov(result, true);

  // That's it
  bind(DONE);

  BLOCK_COMMENT("} string_equals");
}
4290 
// Compare char[] arrays aligned to 4 bytes
// Set result to true (1) iff ary1 and ary2 are the same array, or are
// both non-null char arrays with equal lengths and equal contents;
// otherwise result is false (0).
// Clobbers: ary1, ary2, tmp1, rscratch1, rscratch2 and the flags.
void MacroAssembler::char_arrays_equals(Register ary1, Register ary2,
                                        Register result, Register tmp1)
{
  Register cnt1 = rscratch1;
  Register cnt2 = rscratch2;
  // tmp2 deliberately aliases cnt2: cnt2 is dead after the length
  // comparison below, before tmp2 is first used.
  Register tmp2 = rscratch2;

  Label SAME, DIFFER, NEXT, TAIL03, TAIL01;

  // Offsets of the length field and of element 0 within the array oop.
  int length_offset  = arrayOopDesc::length_offset_in_bytes();
  int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);

  BLOCK_COMMENT("char_arrays_equals  {");

    // different until proven equal
    mov(result, false);

    // same array?
    cmp(ary1, ary2);
    br(Assembler::EQ, SAME);

    // ne if either null
    cbz(ary1, DIFFER);
    cbz(ary2, DIFFER);

    // lengths ne?
    ldrw(cnt1, Address(ary1, length_offset));
    ldrw(cnt2, Address(ary2, length_offset));
    cmp(cnt1, cnt2);
    br(Assembler::NE, DIFFER);

    // Point both registers at the first element.
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));

    // Fewer than 4 chars?  Skip straight to the tail.
    subs(cnt1, cnt1, 4);
    br(LT, TAIL03);

  BIND(NEXT);
    // Main loop: compare 4 chars (8 bytes) per iteration,
    // post-incrementing both array pointers.
    ldr(tmp1, Address(post(ary1, 8)));
    ldr(tmp2, Address(post(ary2, 8)));
    subs(cnt1, cnt1, 4);
    eor(tmp1, tmp1, tmp2);   // non-zero iff the longwords differ
    cbnz(tmp1, DIFFER);
    br(GE, NEXT);

  BIND(TAIL03);  // 0-3 chars left, cnt1 = #chars left - 4
    // The low two bits of cnt1 equal those of #chars left, so test
    // them directly.  Two (or three) chars left?
    tst(cnt1, 0b10);
    br(EQ, TAIL01);
    ldrw(tmp1, Address(post(ary1, 4)));
    ldrw(tmp2, Address(post(ary2, 4)));
    cmp(tmp1, tmp2);
    br(NE, DIFFER);
  BIND(TAIL01);  // 0-1 chars left
    tst(cnt1, 0b01);
    br(EQ, SAME);
    ldrh(tmp1, ary1);
    ldrh(tmp2, ary2);
    cmp(tmp1, tmp2);
    br(NE, DIFFER);

  BIND(SAME);
    mov(result, true);
  BIND(DIFFER); // result already set

  BLOCK_COMMENT("} char_arrays_equals");
}
4358 
// encode char[] to byte[] in ISO_8859_1
// Narrow len UTF-16 chars at src to single bytes at dst, stopping at
// the first char that does not fit in one byte (value > 0xff).  On
// exit, result holds the number of chars successfully encoded.
// Clobbers: src, dst, len, rscratch1, Vtmp1-Vtmp4 and the flags.
void MacroAssembler::encode_iso_array(Register src, Register dst,
                      Register len, Register result,
                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                      FloatRegister Vtmp3, FloatRegister Vtmp4)
{
    Label DONE, NEXT_32, LOOP_8, LOOP_8, NEXT_8, LOOP_1, NEXT_1;
    Register tmp1 = rscratch1;

      mov(result, len); // Save initial len

#ifndef BUILTIN_SIM
      // Fewer than 32 chars?  Go straight to the 8-char loop.
      // Note: from here until LOOP_1, len is biased (negative counts
      // down to the remaining chars minus the chunk size).
      subs(len, len, 32);
      br(LT, LOOP_8);

// The following code uses the SIMD 'uqxtn' and 'uqxtn2' instructions
// to convert chars to bytes. These set the 'QC' bit in the FPSR if
// any char could not fit in a byte, so clear the FPSR so we can test it.
      clear_fpsr();

    BIND(NEXT_32);
      // Narrow 32 chars (64 bytes) per iteration.
      ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
      uqxtn(Vtmp1, T8B, Vtmp1, T8H);  // uqxtn  - write bottom half
      uqxtn(Vtmp1, T16B, Vtmp2, T8H); // uqxtn2 - write top half
      uqxtn(Vtmp2, T8B, Vtmp3, T8H);
      uqxtn(Vtmp2, T16B, Vtmp4, T8H); // uqxtn2
      // If any char saturated (QC set), re-examine this same chunk in
      // the finer-grained loops below; src and dst are still untouched.
      get_fpsr(tmp1);
      cbnzw(tmp1, LOOP_8);
      st1(Vtmp1, Vtmp2, T16B, post(dst, 32));
      subs(len, len, 32);
      add(src, src, 64);
      br(GE, NEXT_32);

    BIND(LOOP_8);
      // len is biased by -32 here; rebias to -8 for this loop.
      adds(len, len, 32-8);
      br(LT, LOOP_1);
      clear_fpsr(); // QC may be set from loop above, clear again
    BIND(NEXT_8);
      // Narrow 8 chars (16 bytes) per iteration.
      ld1(Vtmp1, T8H, src);
      uqxtn(Vtmp1, T8B, Vtmp1, T8H);
      // On saturation, fall back to the scalar loop for this chunk.
      get_fpsr(tmp1);
      cbnzw(tmp1, LOOP_1);
      st1(Vtmp1, T8B, post(dst, 8));
      subs(len, len, 8);
      add(src, src, 16);
      br(GE, NEXT_8);

    BIND(LOOP_1);
      // Remove the -8 bias; exit if no chars remain.
      adds(len, len, 8);
      br(LE, DONE);
#else
      cbz(len, DONE);
#endif
    BIND(NEXT_1);
      // Scalar loop: one char at a time; stop at the first char whose
      // high byte is non-zero (does not fit in ISO-8859-1).
      ldrh(tmp1, Address(post(src, 2)));
      tst(tmp1, 0xff00);
      br(NE, DONE);
      strb(tmp1, Address(post(dst, 1)));
      subs(len, len, 1);
      br(GT, NEXT_1);

    BIND(DONE);
      sub(result, result, len); // Return index where we stopped
}