/*
 * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "interpreter/interpreter.hpp"

#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/klass.inline.hpp"
#include "opto/compile.hpp"
#include "opto/node.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/sharedRuntime.hpp"

#if INCLUDE_ALL_GCS
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1SATBCardTableModRefBS.hpp"
#include "gc/g1/heapRegion.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
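// (For orientation: the immediate branch forms handled below encode a
// signed *word* offset, so the byte displacement is always offset << 2.
// B/BL carry a 26-bit word offset (+/-128 MB reach); conditional,
// compare-and-branch and load-literal forms carry 19 bits (+/-1 MB);
// test-and-branch carries 14 bits (+/-32 KB).)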
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  assert((uint64_t)target < (1ul << 48), "48-bit overflow in address constant");
  long offset = (target - branch) >> 2;
  unsigned insn = *(unsigned*)branch;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b111011) == 0b011000) {
    // Load register (literal)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    Instruction_aarch64::spatch(branch, 25, 0, offset);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    Instruction_aarch64::spatch(branch, 23, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    Instruction_aarch64::spatch(branch, 18, 5, offset);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = target - branch;
    int shift = Instruction_aarch64::extract(insn, 31, 31);
    if (shift) {
      uint64_t dest = (uint64_t)target;
      uint64_t pc_page = (uint64_t)branch >> 12;
      uint64_t adr_page = (uint64_t)target >> 12;
      unsigned offset_lo = dest & 0xfff;
      offset = adr_page - pc_page;

      // We handle 3 types of PC relative addressing
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      // In the first 2 cases we must check that Rx is the same in the adrp and the
      // subsequent ldr/str or add instruction. Otherwise we could accidentally end
      // up treating a type 3 relocation as a type 1 or 2 just because it happened
      // to be followed by a random unrelated ldr/str or add instruction.
      //
      // In the case of a type 3 relocation, we know that these are only
      // generated for the safepoint polling page or for the card table
      // byte map base, so we assert as much, and of course that the
      // offset is 0.
      //
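      // (Sketch of the underlying arithmetic: adrp Rx, <target> computes
      //  Rx = (pc & ~0xfff) + (signed 21-bit immediate << 12), i.e. the
      //  4K-aligned target page; the following ldr/str or add then
      //  supplies the low 12 bits of the address.)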
      unsigned insn2 = ((unsigned*)branch)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned size = Instruction_aarch64::extract(insn2, 31, 30);
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo >> size);
        guarantee(((dest >> size) << size) == dest, "misaligned target");
        instructions = 2;
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        Instruction_aarch64::patch(branch + sizeof (unsigned),
                                   21, 10, offset_lo);
        instructions = 2;
      } else {
        assert((jbyte *)target ==
                ((CardTableModRefBS*)(Universe::heap()->barrier_set()))->byte_map_base ||
               target == StubRoutines::crc_table_addr() ||
               (address)target == os::get_polling_page(),
               "adrp must be polling page or byte map base");
        assert(offset_lo == 0, "offset must be 0 for polling page or byte map base");
      }
    }
    int offset_lo = offset & 3;
    offset >>= 2;
    Instruction_aarch64::spatch(branch, 23, 5, offset);
    Instruction_aarch64::patch(branch, 30, 29, offset_lo);
  } else if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010100) {
    uint64_t dest = (uint64_t)target;
    // Move wide constant
    assert(nativeInstruction_at(branch+4)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(branch+8)->is_movk(), "wrong insns in patch");
    Instruction_aarch64::patch(branch, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(branch+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(branch+8, 20, 5, (dest >>= 16) & 0xffff);
    assert(target_addr_for_insn(branch) == target, "should be");
    instructions = 3;
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    // nothing to do
    assert(target == 0, "did not expect to relocate target for polling page load");
  } else {
    ShouldNotReachHere();
  }
  return instructions * NativeInstruction::instruction_size;
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  int instructions;
  unsigned insn = *(unsigned*)insn_addr;
  assert(nativeInstruction_at(insn_addr+4)->is_movk(), "wrong insns in patch");

  // OOPs are either narrow (32 bits) or wide (48 bits).  We encode
  // narrow OOPs by setting the upper 16 bits in the first
  // instruction.
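  // (Schematically, the two patched sequences are:
  //    narrow:  movz Rd, #(n >> 16), lsl #16 ; movk Rd, #(n & 0xffff)
  //    wide:    movz Rd, #lo16 ; movk Rd, #mid16, lsl #16 ; movk Rd, #hi16, lsl #32
  //  The wide form is the same movz/movk/movk pattern used by movptr();
  //  see target_addr_for_insn() for the matching decode.)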
  if (Instruction_aarch64::extract(insn, 31, 21) == 0b11010010101) {
    // Move narrow OOP
    narrowOop n = oopDesc::encode_heap_oop((oop)o);
    Instruction_aarch64::patch(insn_addr, 20, 5, n >> 16);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, n & 0xffff);
    instructions = 2;
  } else {
    // Move wide OOP
    assert(nativeInstruction_at(insn_addr+8)->is_movk(), "wrong insns in patch");
    uintptr_t dest = (uintptr_t)o;
    Instruction_aarch64::patch(insn_addr, 20, 5, dest & 0xffff);
    Instruction_aarch64::patch(insn_addr+4, 20, 5, (dest >>= 16) & 0xffff);
    Instruction_aarch64::patch(insn_addr+8, 20, 5, (dest >>= 16) & 0xffff);
    instructions = 3;
  }
  return instructions * NativeInstruction::instruction_size;
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  if ((Instruction_aarch64::extract(insn, 29, 24) & 0b011011) == 0b00011000) {
    // Load register (literal)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
    return address(((uint64_t)insn_addr + (offset << 2)));
  } else if (Instruction_aarch64::extract(insn, 30, 26) == 0b00101) {
    // Unconditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 25, 0);
  } else if (Instruction_aarch64::extract(insn, 31, 25) == 0b0101010) {
    // Conditional branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011010) {
    // Compare & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 23, 5);
  } else if (Instruction_aarch64::extract(insn, 30, 25) == 0b011011) {
    // Test & branch (immediate)
    offset = Instruction_aarch64::sextract(insn, 18, 5);
  } else if (Instruction_aarch64::extract(insn, 28, 24) == 0b10000) {
    // PC-rel. addressing
    offset = Instruction_aarch64::extract(insn, 30, 29);
    offset |= Instruction_aarch64::sextract(insn, 23, 5) << 2;
    int shift = Instruction_aarch64::extract(insn, 31, 31) ? 12 : 0;
    if (shift) {
      offset <<= shift;
      uint64_t target_page = ((uint64_t)insn_addr) + offset;
      target_page &= ((uint64_t)-1) << shift;
      // Return the target address for the following sequences
      //   1 - adrp    Rx, target_page
      //       ldr/str Ry, [Rx, #offset_in_page]
      //   2 - adrp    Rx, target_page
      //       add     Ry, Rx, #offset_in_page
      //   3 - adrp    Rx, target_page (page aligned reloc, offset == 0)
      //
      // In the first two cases we check that the register is the same and
      // return the target_page + the offset within the page.
      // Otherwise we assume it is a page aligned relocation and return
      // the target page only. The only cases in which this is generated
      // are for the safepoint polling page and the card table byte map
      // base, so we assert as much.
      //
      unsigned insn2 = ((unsigned*)insn_addr)[1];
      if (Instruction_aarch64::extract(insn2, 29, 24) == 0b111001 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 9, 5)) {
        // Load/store register (unsigned immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        unsigned int size = Instruction_aarch64::extract(insn2, 31, 30);
        return address(target_page + (byte_offset << size));
      } else if (Instruction_aarch64::extract(insn2, 31, 22) == 0b1001000100 &&
                Instruction_aarch64::extract(insn, 4, 0) ==
                        Instruction_aarch64::extract(insn2, 4, 0)) {
        // add (immediate)
        unsigned int byte_offset = Instruction_aarch64::extract(insn2, 21, 10);
        return address(target_page + byte_offset);
      } else {
        assert((jbyte *)target_page ==
                ((CardTableModRefBS*)(Universe::heap()->barrier_set()))->byte_map_base ||
               (address)target_page == os::get_polling_page(),
               "adrp must be polling page or byte map base");
        return (address)target_page;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (Instruction_aarch64::extract(insn, 31, 23) == 0b110100101) {
    uint32_t *insns = (uint32_t *)insn_addr;
    // Move wide constant: movz, movk, movk.  See movptr().
    assert(nativeInstruction_at(insns+1)->is_movk(), "wrong insns in patch");
    assert(nativeInstruction_at(insns+2)->is_movk(), "wrong insns in patch");
    return address(uint64_t(Instruction_aarch64::extract(insns[0], 20, 5))
                   + (uint64_t(Instruction_aarch64::extract(insns[1], 20, 5)) << 16)
                   + (uint64_t(Instruction_aarch64::extract(insns[2], 20, 5)) << 32));
  } else if (Instruction_aarch64::extract(insn, 31, 22) == 0b1011100101 &&
             Instruction_aarch64::extract(insn, 4, 0) == 0b11111) {
    return 0;
  } else {
    ShouldNotReachHere();
  }
  return address(((uint64_t)insn_addr + (offset << 2)));
}

void MacroAssembler::serialize_memory(Register thread, Register tmp) {
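  // A full data synchronization barrier; the thread and tmp arguments
  // are unused on AArch64.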
  dsb(Assembler::SY);
}


void MacroAssembler::reset_last_Java_frame(bool clear_fp,
                                           bool clear_pc) {
  // we must set sp to zero to clear frame
  str(zr, Address(rthread, JavaThread::last_Java_sp_offset()));
  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(zr, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  if (clear_pc) {
    str(zr, Address(rthread, JavaThread::last_Java_pc_offset()));
  }
}
// Calls to C land
//
// When entering C land, the rfp & sp of the last Java frame have to be
// recorded in the (thread-local) JavaThread object. When leaving C land,
// the last Java fp has to be reset to 0. This is required to allow proper
// stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  if (last_java_pc != NULL) {
    adr(scratch, last_java_pc);
  } else {
    // FIXME: This is almost never correct.  We should delete all
    // cases of set_last_Java_frame with last_java_pc=NULL and use the
    // correct return address instead.
    adr(scratch, pc());
  }

  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb (see assert above), which is
    // within the +/-4GB reach of an adrp/add pair.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    blr(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(ReservedCodeCacheSize < 4*G, "branch out of range");
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    unsigned long offset;
    // We can use ADRP here because we know that the total size of
    // the code cache cannot exceed 4Gb (see assert above), which is
    // within the +/-4GB reach of an adrp/add pair.
    adrp(tmp, entry, offset);
    add(tmp, tmp, offset);
    if (cbuf) cbuf->set_insts_mark();
    br(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();
  if (tmp_reg == noreg) {
    tmp_reg = rscratch2;
  }
  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
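  // (For reference, a sketch of the 64-bit mark word layout assumed here,
  //  per markOop.hpp; when biased it is
  //    [ JavaThread* (54) | epoch (2) | unused (1) | age (4) | biased_lock (1) | lock (2) ]
  //  so biased_lock_pattern means the low three bits are 0b101.)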
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::NE, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_incw(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_incw(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_incw(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, markOopDesc::biased_lock_pattern);
  br(Assembler::EQ, done);
}


// added to make this compile

REGISTER_DEFINITION(Register, noreg);

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = esp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch1);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true, true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch1, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch1, ok);
    lea(rscratch1, RuntimeAddress(StubRoutines::forward_exception_entry()));
    br(rscratch1);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.

address MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  unsigned int start_offset = offset();
  if (far_branches() && !Compile::current()->in_scratch_emit_size()) {
    address stub = emit_trampoline_stub(start_offset, entry.target());
    if (stub == NULL) {
      return NULL; // CodeCache is full
    }
  }

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());
  if (Assembler::reachable_from_branch_at(pc(), entry.target())) {
    bl(entry.target());
  } else {
    bl(pc());
  }
  // Just need to return a non-NULL address on success.  Returning
  // start_offset would be wrong: it is an offset (an int), not an
  // address, and a call at offset 0 would read as the NULL failure value.
  return pc();
}


// Emit a trampoline stub for a call to a target which is too far away.
//
// code sequences:
//
// call-site:
//   branch-and-link to <destination> or <trampoline stub>
//
// Related trampoline stub for this call site in the stub section:
//   load the call target from the constant pool
//   branch (LR still points to the call site above)
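//
// Concretely, the stub emitted below is just:
//
//   <stub>:   ldr  rscratch1, <stub+8>   // load the 64-bit destination
//             br   rscratch1             // from the literal that follows
//   <stub+8>: .quad <destination>        // NativeCallTrampolineStub::data_offset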

address MacroAssembler::emit_trampoline_stub(int insts_call_instruction_offset,
                                             address dest) {
  address stub = start_a_stub(Compile::MAX_stubs_size/2);
  if (stub == NULL) {
    return NULL;  // CodeBuffer::expand failed
  }

  // Create a trampoline stub relocation which relates this trampoline stub
  // with the call instruction at insts_call_instruction_offset in the
  // instructions code-section.
  align(wordSize);
  relocate(trampoline_stub_Relocation::spec(code()->insts()->start()
                                            + insts_call_instruction_offset));
  const int stub_start_offset = offset();

  // Now, create the trampoline stub's code:
  // - load the call
  // - call
  Label target;
  ldr(rscratch1, target);
  br(rscratch1);
  bind(target);
  assert(offset() - stub_start_offset == NativeCallTrampolineStub::data_offset,
         "should be");
  emit_int64((int64_t)dest);

  const address stub_start_addr = addr_at(stub_start_offset);

  assert(is_NativeCallTrampolineStub_at(stub_start_addr), "doesn't look like a trampoline");

  end_a_stub();
  return stub;
}

address MacroAssembler::ic_call(address entry) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc());
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  return trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  str(zr, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


void MacroAssembler::notify(int type) {
  if (type == bytecode_start) {
    // set_last_Java_frame(esp, rfp, (address)NULL);
    Assembler::notify(type);
    // reset_last_Java_frame(true, false);
  }
  else
    Assembler::notify(type);
}

// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface) {
  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size() * wordSize;
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldrw(scan_temp, Address(recv_klass, InstanceKlass::vtable_length_offset() * wordSize));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, Address::lsl(3)));
  add(scan_temp, scan_temp, vtable_base);
  if (HeapWordsPerLong > 1) {
    // Round up to align_object_offset boundary
    // see code for instanceKlass::start_of_itable!
    round_to(scan_temp, BytesPerLong);
  }

  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
  // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
  lea(recv_klass, Address(recv_klass, itable_index, Address::lsl(3)));
  if (itentry_off)
    add(recv_klass, recv_klass, itentry_off);

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
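  // The scan loop below is peeled once: the first copy of the test can
  // branch straight to found_method on a hit, while the loop proper
  // inverts the test so that a hit falls through to found_method.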
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      br(Assembler::EQ, found_method);
    } else {
      br(Assembler::NE, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  ldr(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
  ldr(method_result, Address(recv_klass, scan_temp));
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = InstanceKlass::vtable_start_offset() * wordSize;
  assert(vtableEntry::size() * wordSize == 8,
         "adjust the scaling in the code below");
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               Address::lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result, Address(recv_klass, vtable_offset_in_bytes));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  br(Assembler::EQ, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    ldrw(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    br(Assembler::EQ, *L_success);
    cmp(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_slow_path);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      br(Assembler::EQ, *L_success);
    } else {
      br(Assembler::NE, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  br(EQ, Lexit);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}

// scans count 4 byte words at [addr] for occurrence of value,
// generic
1107 void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
1108                                 Register scratch) {
1109   Label Lloop, Lexit;
1110   cbz(count, Lexit);
1111   bind(Lloop);
1112   ldrw(scratch, post(addr, wordSize));
1113   cmpw(value, scratch);
1114   br(EQ, Lexit);
1115   sub(count, count, 1);
1116   cbnz(count, Lloop);
1117   bind(Lexit);
1118 }
1119 
1120 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
1121                                                    Register super_klass,
1122                                                    Register temp_reg,
1123                                                    Register temp2_reg,
1124                                                    Label* L_success,
1125                                                    Label* L_failure,
1126                                                    bool set_cond_codes) {
1127   assert_different_registers(sub_klass, super_klass, temp_reg);
1128   if (temp2_reg != noreg)
1129     assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
1130 #define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
1131 
1132   Label L_fallthrough;
1133   int label_nulls = 0;
1134   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
1135   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
1136   assert(label_nulls <= 1, "at most one NULL in the batch");
1137 
1138   // a couple of useful fields in sub_klass:
1139   int ss_offset = in_bytes(Klass::secondary_supers_offset());
1140   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
1141   Address secondary_supers_addr(sub_klass, ss_offset);
1142   Address super_cache_addr(     sub_klass, sc_offset);
1143 
1144   BLOCK_COMMENT("check_klass_subtype_slow_path");
1145 
1146   // Do a linear scan of the secondary super-klass chain.
1147   // This code is rarely used, so simplicity is a virtue here.
1148   // The repne_scan instruction uses fixed registers, which we must spill.
1149   // Don't worry too much about pre-existing connections with the input regs.
1150 
1151   assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
1152   assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)
1153 
1154   // Get super_klass value into r0 (even if it was in r5 or r2).
1155   RegSet pushed_registers;
1156   if (!IS_A_TEMP(r2))    pushed_registers += r2;
1157   if (!IS_A_TEMP(r5))    pushed_registers += r5;
1158 
1159   if (super_klass != r0 || UseCompressedOops) {
1160     if (!IS_A_TEMP(r0))   pushed_registers += r0;
1161   }
1162 
1163   push(pushed_registers, sp);
1164 
1165 #ifndef PRODUCT
1166   mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
1167   Address pst_counter_addr(rscratch2);
1168   ldr(rscratch1, pst_counter_addr);
1169   add(rscratch1, rscratch1, 1);
1170   str(rscratch1, pst_counter_addr);
1171 #endif //PRODUCT
1172 
1173   // We will consult the secondary-super array.
1174   ldr(r5, secondary_supers_addr);
1175   // Load the array length.
1176   ldrw(r2, Address(r5, Array<Klass*>::length_offset_in_bytes()));
1177   // Skip to start of data.
1178   add(r5, r5, Array<Klass*>::base_offset_in_bytes());
1179 
1180   cmp(sp, zr); // Clear Z flag; SP is never zero
1181   // Scan R2 words at [R5] for an occurrence of R0.
1182   // Set NZ/Z based on last compare.
1183   repne_scan(r5, r0, r2, rscratch1);
1184 
1185   // Unspill the temp. registers:
1186   pop(pushed_registers, sp);
1187 
1188   br(Assembler::NE, *L_failure);
1189 
1190   // Success.  Cache the super we found and proceed in triumph.
1191   str(super_klass, super_cache_addr);
1192 
1193   if (L_success != &L_fallthrough) {
1194     b(*L_success);
1195   }
1196 
1197 #undef IS_A_TEMP
1198 
1199   bind(L_fallthrough);
1200 }
1201 
1202 
1203 void MacroAssembler::verify_oop(Register reg, const char* s) {
1204   if (!VerifyOops) return;
1205 
1206   // Pass register number to verify_oop_subroutine
1207   const char* b = NULL;
1208   {
1209     ResourceMark rm;
1210     stringStream ss;
1211     ss.print("verify_oop: %s: %s", reg->name(), s);
1212     b = code_string(ss.as_string());
1213   }
1214   BLOCK_COMMENT("verify_oop {");
1215 
1216   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1217   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1218 
1219   mov(r0, reg);
1220   mov(rscratch1, (address)b);
1221 
1222   // call indirectly to solve generation ordering problem
1223   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1224   ldr(rscratch2, Address(rscratch2));
1225   blr(rscratch2);
1226 
1227   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1228   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1229 
1230   BLOCK_COMMENT("} verify_oop");
1231 }
1232 
1233 void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
1234   if (!VerifyOops) return;
1235 
1236   const char* b = NULL;
1237   {
1238     ResourceMark rm;
1239     stringStream ss;
1240     ss.print("verify_oop_addr: %s", s);
1241     b = code_string(ss.as_string());
1242   }
1243   BLOCK_COMMENT("verify_oop_addr {");
1244 
1245   stp(r0, rscratch1, Address(pre(sp, -2 * wordSize)));
1246   stp(rscratch2, lr, Address(pre(sp, -2 * wordSize)));
1247 
1248   // addr may contain sp so we will have to adjust it based on the
1249   // pushes that we just did.
1250   if (addr.uses(sp)) {
1251     lea(r0, addr);
1252     ldr(r0, Address(r0, 4 * wordSize));
1253   } else {
1254     ldr(r0, addr);
1255   }
1256   mov(rscratch1, (address)b);
1257 
1258   // call indirectly to solve generation ordering problem
1259   lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
1260   ldr(rscratch2, Address(rscratch2));
1261   blr(rscratch2);
1262 
1263   ldp(rscratch2, lr, Address(post(sp, 2 * wordSize)));
1264   ldp(r0, rscratch1, Address(post(sp, 2 * wordSize)));
1265 
1266   BLOCK_COMMENT("} verify_oop_addr");
1267 }
1268 
1269 Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
1270                                          int extra_slot_offset) {
1271   // cf. TemplateTable::prepare_invoke(), if (load_receiver).
1272   int stackElementSize = Interpreter::stackElementSize;
1273   int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
1274 #ifdef ASSERT
1275   int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
1276   assert(offset1 - offset == stackElementSize, "correct arithmetic");
1277 #endif
1278   if (arg_slot.is_constant()) {
1279     return Address(esp, arg_slot.as_constant() * stackElementSize
1280                    + offset);
1281   } else {
1282     add(rscratch1, esp, arg_slot.as_register(),
1283         ext::uxtx, exact_log2(stackElementSize));
1284     return Address(rscratch1, offset);
1285   }
1286 }
1287 
1288 void MacroAssembler::call_VM_leaf_base(address entry_point,
1289                                        int number_of_arguments,
1290                                        Label *retaddr) {
1291   call_VM_leaf_base1(entry_point, number_of_arguments, 0, ret_type_integral, retaddr);
1292 }
1293 
1294 void MacroAssembler::call_VM_leaf_base1(address entry_point,
1295                                         int number_of_gp_arguments,
1296                                         int number_of_fp_arguments,
1297                                         ret_type type,
1298                                         Label *retaddr) {
1299   Label E, L;
1300 
1301   stp(rscratch1, rmethod, Address(pre(sp, -2 * wordSize)));
1302 
1303   // We add 1 to number_of_gp_arguments because the thread in arg0 is
1304   // not counted
1305   mov(rscratch1, entry_point);
1306   blrt(rscratch1, number_of_gp_arguments + 1, number_of_fp_arguments, type);
1307   if (retaddr)
1308     bind(*retaddr);
1309 
1310   ldp(rscratch1, rmethod, Address(post(sp, 2 * wordSize)));
1311   maybe_isb();
1312 }
1313 
1314 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1315   call_VM_leaf_base(entry_point, number_of_arguments);
1316 }
1317 
1318 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1319   pass_arg0(this, arg_0);
1320   call_VM_leaf_base(entry_point, 1);
1321 }
1322 
1323 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1324   pass_arg0(this, arg_0);
1325   pass_arg1(this, arg_1);
1326   call_VM_leaf_base(entry_point, 2);
1327 }
1328 
1329 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1330                                   Register arg_1, Register arg_2) {
1331   pass_arg0(this, arg_0);
1332   pass_arg1(this, arg_1);
1333   pass_arg2(this, arg_2);
1334   call_VM_leaf_base(entry_point, 3);
1335 }
1336 
1337 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1338   pass_arg0(this, arg_0);
1339   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1340 }
1341 
1342 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1343 
1344   assert(arg_0 != c_rarg1, "smashed arg");
1345   pass_arg1(this, arg_1);
1346   pass_arg0(this, arg_0);
1347   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1348 }
1349 
1350 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1351   assert(arg_0 != c_rarg2, "smashed arg");
1352   assert(arg_1 != c_rarg2, "smashed arg");
1353   pass_arg2(this, arg_2);
1354   assert(arg_0 != c_rarg1, "smashed arg");
1355   pass_arg1(this, arg_1);
1356   pass_arg0(this, arg_0);
1357   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1358 }
1359 
1360 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1361   assert(arg_0 != c_rarg3, "smashed arg");
1362   assert(arg_1 != c_rarg3, "smashed arg");
1363   assert(arg_2 != c_rarg3, "smashed arg");
1364   pass_arg3(this, arg_3);
1365   assert(arg_0 != c_rarg2, "smashed arg");
1366   assert(arg_1 != c_rarg2, "smashed arg");
1367   pass_arg2(this, arg_2);
1368   assert(arg_0 != c_rarg1, "smashed arg");
1369   pass_arg1(this, arg_1);
1370   pass_arg0(this, arg_0);
1371   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1372 }
1373 
1374 void MacroAssembler::null_check(Register reg, int offset) {
1375   if (needs_explicit_null_check(offset)) {
1376     // provoke OS NULL exception if reg = NULL by
1377     // accessing M[reg] w/o changing any registers
1378     // NOTE: this is plenty to provoke a segv
1379     ldr(zr, Address(reg));
1380   } else {
1381     // nothing to do, (later) access of M[reg + offset]
1382     // will provoke OS NULL exception if reg = NULL
1383   }
1384 }
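
     // Example (illustrative): for a field at offset 8 the later access
     //   ldr(r0, Address(reg, 8));
     // faults on its own when reg == NULL, so nothing is emitted here.  For
     // an offset beyond the protected page (e.g. 1 << 20) the explicit probe
     // above is needed, since reg + offset could land in mapped memory.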
1385 
1386 // MacroAssembler protected routines needed to implement
1387 // public methods
1388 
1389 void MacroAssembler::mov(Register r, Address dest) {
1390   code_section()->relocate(pc(), dest.rspec());
1391   u_int64_t imm64 = (u_int64_t)dest.target();
1392   movptr(r, imm64);
1393 }
1394 
1395 // Move a constant pointer into r.  In AArch64 mode the virtual
1396 // address space is 48 bits in size, so we only need three
1397 // instructions to create a patchable instruction sequence that can
1398 // reach anywhere.
1399 void MacroAssembler::movptr(Register r, uintptr_t imm64) {
1400 #ifndef PRODUCT
1401   {
1402     char buffer[64];
1403     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1404     block_comment(buffer);
1405   }
1406 #endif
1407   assert(imm64 < (1ul << 48), "48-bit overflow in address constant");
1408   movz(r, imm64 & 0xffff);
1409   imm64 >>= 16;
1410   movk(r, imm64 & 0xffff, 16);
1411   imm64 >>= 16;
1412   movk(r, imm64 & 0xffff, 32);
1413 }
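
     // Example (illustrative): movptr(r0, 0x123456789abcul) emits
     //   movz r0, #0x9abc
     //   movk r0, #0x5678, lsl #16
     //   movk r0, #0x1234, lsl #32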
1414 
1415 // Macro to move a replicated immediate into a vector register.
1416 //  Vd will get the following values for different arrangements in T
1417 //   imm32 == hex 000000gh  T8B:  Vd = ghghghghghghghgh
1418 //   imm32 == hex 000000gh  T16B: Vd = ghghghghghghghghghghghghghghghgh
1419 //   imm32 == hex 0000efgh  T4H:  Vd = efghefghefghefgh
1420 //   imm32 == hex 0000efgh  T8H:  Vd = efghefghefghefghefghefghefghefgh
1421 //   imm32 == hex abcdefgh  T2S:  Vd = abcdefghabcdefgh
1422 //   imm32 == hex abcdefgh  T4S:  Vd = abcdefghabcdefghabcdefghabcdefgh
1423 //   T1D/T2D: invalid
1424 void MacroAssembler::mov(FloatRegister Vd, SIMD_Arrangement T, u_int32_t imm32) {
1425   assert(T != T1D && T != T2D, "invalid arrangement");
1426   if (T == T8B || T == T16B) {
1427     assert((imm32 & ~0xff) == 0, "extraneous bits in unsigned imm32 (T8B/T16B)");
1428     movi(Vd, T, imm32 & 0xff, 0);
1429     return;
1430   }
1431   u_int32_t nimm32 = ~imm32;
1432   if (T == T4H || T == T8H) {
1433     assert((imm32  & ~0xffff) == 0, "extraneous bits in unsigned imm32 (T4H/T8H)");
1434     imm32 &= 0xffff;
1435     nimm32 &= 0xffff;
1436   }
1437   u_int32_t x = imm32;
1438   int movi_cnt = 0;
1439   int movn_cnt = 0;
1440   while (x) { if (x & 0xff) movi_cnt++; x >>= 8; }
1441   x = nimm32;
1442   while (x) { if (x & 0xff) movn_cnt++; x >>= 8; }
1443   if (movn_cnt < movi_cnt) imm32 = nimm32;
1444   unsigned lsl = 0;
1445   while (imm32 && (imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1446   if (movn_cnt < movi_cnt)
1447     mvni(Vd, T, imm32 & 0xff, lsl);
1448   else
1449     movi(Vd, T, imm32 & 0xff, lsl);
1450   imm32 >>= 8; lsl += 8;
1451   while (imm32) {
1452     while ((imm32 & 0xff) == 0) { lsl += 8; imm32 >>= 8; }
1453     if (movn_cnt < movi_cnt)
1454       bici(Vd, T, imm32 & 0xff, lsl);
1455     else
1456       orri(Vd, T, imm32 & 0xff, lsl);
1457     lsl += 8; imm32 >>= 8;
1458   }
1459 }
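
     // Examples (illustrative): mov(v0, T4S, 0x00ff0000) has one non-zero
     // byte, so a single
     //   movi v0.4s, #0xff, lsl #16
     // suffices, while mov(v0, T4S, 0xffffff00) is cheaper in inverted form:
     //   mvni v0.4s, #0xff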
1460 
1461 void MacroAssembler::mov_immediate64(Register dst, u_int64_t imm64)
1462 {
1463 #ifndef PRODUCT
1464   {
1465     char buffer[64];
1466     snprintf(buffer, sizeof(buffer), "0x%" PRIX64, imm64);
1467     block_comment(buffer);
1468   }
1469 #endif
1470   if (operand_valid_for_logical_immediate(false, imm64)) {
1471     orr(dst, zr, imm64);
1472   } else {
1473     // we can use a combination of MOVZ or MOVN with
1474     // MOVK to build up the constant
1475     u_int64_t imm_h[4];
1476     int zero_count = 0;
1477     int neg_count = 0;
1478     int i;
1479     for (i = 0; i < 4; i++) {
1480       imm_h[i] = ((imm64 >> (i * 16)) & 0xffffL);
1481       if (imm_h[i] == 0) {
1482         zero_count++;
1483       } else if (imm_h[i] == 0xffffL) {
1484         neg_count++;
1485       }
1486     }
1487     if (zero_count == 4) {
1488       // one MOVZ will do
1489       movz(dst, 0);
1490     } else if (neg_count == 4) {
1491       // one MOVN will do
1492       movn(dst, 0);
1493     } else if (zero_count == 3) {
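           // one MOVZ will do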
1494       for (i = 0; i < 4; i++) {
1495         if (imm_h[i] != 0L) {
1496           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1497           break;
1498         }
1499       }
1500     } else if (neg_count == 3) {
1501       // one MOVN will do
1502       for (int i = 0; i < 4; i++) {
1503         if (imm_h[i] != 0xffffL) {
1504           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1505           break;
1506         }
1507       }
1508     } else if (zero_count == 2) {
1509       // one MOVZ and one MOVK will do
1510       for (i = 0; i < 3; i++) {
1511         if (imm_h[i] != 0L) {
1512           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1513           i++;
1514           break;
1515         }
1516       }
1517       for (;i < 4; i++) {
1518         if (imm_h[i] != 0L) {
1519           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1520         }
1521       }
1522     } else if (neg_count == 2) {
1523       // one MOVN and one MOVK will do
1524       for (i = 0; i < 4; i++) {
1525         if (imm_h[i] != 0xffffL) {
1526           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1527           i++;
1528           break;
1529         }
1530       }
1531       for (;i < 4; i++) {
1532         if (imm_h[i] != 0xffffL) {
1533           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1534         }
1535       }
1536     } else if (zero_count == 1) {
1537       // one MOVZ and two MOVKs will do
1538       for (i = 0; i < 4; i++) {
1539         if (imm_h[i] != 0L) {
1540           movz(dst, (u_int32_t)imm_h[i], (i << 4));
1541           i++;
1542           break;
1543         }
1544       }
1545       for (;i < 4; i++) {
1546         if (imm_h[i] != 0x0L) {
1547           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1548         }
1549       }
1550     } else if (neg_count == 1) {
1551       // one MOVN and two MOVKs will do
1552       for (i = 0; i < 4; i++) {
1553         if (imm_h[i] != 0xffffL) {
1554           movn(dst, (u_int32_t)imm_h[i] ^ 0xffffL, (i << 4));
1555           i++;
1556           break;
1557         }
1558       }
1559       for (;i < 4; i++) {
1560         if (imm_h[i] != 0xffffL) {
1561           movk(dst, (u_int32_t)imm_h[i], (i << 4));
1562         }
1563       }
1564     } else {
1565       // use a MOVZ and 3 MOVKs (makes it easier to debug)
1566       movz(dst, (u_int32_t)imm_h[0], 0);
1567       for (i = 1; i < 4; i++) {
1568         movk(dst, (u_int32_t)imm_h[i], (i << 4));
1569       }
1570     }
1571   }
1572 }
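
     // Example (illustrative): imm64 == 0x0000123400005678 has two zero
     // halfwords, so the zero_count == 2 case emits
     //   movz dst, #0x5678
     //   movk dst, #0x1234, lsl #32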
1573 
1574 void MacroAssembler::mov_immediate32(Register dst, u_int32_t imm32)
1575 {
1576 #ifndef PRODUCT
1577     {
1578       char buffer[64];
1579       snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1580       block_comment(buffer);
1581     }
1582 #endif
1583   if (operand_valid_for_logical_immediate(true, imm32)) {
1584     orrw(dst, zr, imm32);
1585   } else {
1586     // we can use MOVZ, MOVN or two calls to MOVK to build up the
1587     // constant
1588     u_int32_t imm_h[2];
1589     imm_h[0] = imm32 & 0xffff;
1590     imm_h[1] = ((imm32 >> 16) & 0xffff);
1591     if (imm_h[0] == 0) {
1592       movzw(dst, imm_h[1], 16);
1593     } else if (imm_h[0] == 0xffff) {
1594       movnw(dst, imm_h[1] ^ 0xffff, 16);
1595     } else if (imm_h[1] == 0) {
1596       movzw(dst, imm_h[0], 0);
1597     } else if (imm_h[1] == 0xffff) {
1598       movnw(dst, imm_h[0] ^ 0xffff, 0);
1599     } else {
1600       // use a MOVZ and MOVK (makes it easier to debug)
1601       movzw(dst, imm_h[0], 0);
1602       movkw(dst, imm_h[1], 16);
1603     }
1604   }
1605 }
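
     // Example (illustrative): imm32 == 0xffff1234 has a 0xffff upper half,
     // so a single
     //   movnw dst, #0xedcb
     // covers it (~0x0000edcb == 0xffff1234).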
1606 
1607 // Form an address from base + offset in Rd.  Rd may or may
1608 // not actually be used: you must use the Address that is returned.
1609 // It is up to you to ensure that the shift provided matches the size
1610 // of your data.
1611 Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
1612   if (Address::offset_ok_for_immed(byte_offset, shift))
1613     // It fits; no need for any heroics
1614     return Address(base, byte_offset);
1615 
1616   // Don't do anything clever with negative or misaligned offsets
1617   unsigned mask = (1 << shift) - 1;
1618   if (byte_offset < 0 || byte_offset & mask) {
1619     mov(Rd, byte_offset);
1620     add(Rd, base, Rd);
1621     return Address(Rd);
1622   }
1623 
1624   // See if we can do this with two 12-bit offsets
1625   {
1626     unsigned long word_offset = byte_offset >> shift;
1627     unsigned long masked_offset = word_offset & 0xfff000;
1628     if (Address::offset_ok_for_immed(word_offset - masked_offset)
1629         && Assembler::operand_valid_for_add_sub_immediate(masked_offset << shift)) {
1630       add(Rd, base, masked_offset << shift);
1631       word_offset -= masked_offset;
1632       return Address(Rd, word_offset << shift);
1633     }
1634   }
1635 
1636   // Do it the hard way
1637   mov(Rd, byte_offset);
1638   add(Rd, base, Rd);
1639   return Address(Rd);
1640 }
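
     // Example (illustrative): byte_offset == 0x123458 with shift == 3
     // (8-byte data) is split into two 12-bit pieces:
     //   add Rd, base, #0x120000
     // followed by the returned Address(Rd, 0x3458).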
1641 
1642 void MacroAssembler::atomic_incw(Register counter_addr, Register tmp) {
1643   Label retry_load;
1644   bind(retry_load);
1645   // flush and load exclusive from the memory location
1646   ldxrw(tmp, counter_addr);
1647   addw(tmp, tmp, 1);
1648   // if we store+flush with no intervening write tmp will be zero
1649   stxrw(tmp, tmp, counter_addr);
1650   cbnzw(tmp, retry_load);
1651 }
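
     // In C11 terms this is roughly (a sketch only; the exclusive pair used
     // here has relaxed, not acquire/release, ordering):
     //   __atomic_fetch_add(counter_addr, 1, __ATOMIC_RELAXED);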
1652 
1653 
1654 int MacroAssembler::corrected_idivl(Register result, Register ra, Register rb,
1655                                     bool want_remainder, Register scratch)
1656 {
1657   // Full implementation of Java idiv and irem.  The function
1658   // returns the (pc) offset of the div instruction - may be needed
1659   // for implicit exceptions.
1660   //
1661   // constraint : ra/rb =/= scratch
1662   //         normal case
1663   //
1664   // input : ra: dividend
1665   //         rb: divisor
1666   //
1667   // result: either
1668   //         quotient  (= ra idiv rb)
1669   //         remainder (= ra irem rb)
1670 
1671   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1672 
1673   int idivl_offset = offset();
1674   if (! want_remainder) {
1675     sdivw(result, ra, rb);
1676   } else {
1677     sdivw(scratch, ra, rb);
1678     Assembler::msubw(result, scratch, rb, ra);
1679   }
1680 
1681   return idivl_offset;
1682 }
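
     // Example (illustrative): ra == 7, rb == -2 gives sdivw -> -3 (rounding
     // toward zero, as Java requires) and msubw -> 7 - (-3 * -2) == 1 for the
     // remainder.  No fix-up is needed for min_jint / -1: unlike x86's idiv,
     // AArch64's sdiv does not trap and yields min_jint, which is exactly the
     // Java result.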
1683 
1684 int MacroAssembler::corrected_idivq(Register result, Register ra, Register rb,
1685                                     bool want_remainder, Register scratch)
1686 {
1687   // Full implementation of Java ldiv and lrem.  The function
1688   // returns the (pc) offset of the div instruction - may be needed
1689   // for implicit exceptions.
1690   //
1691   // constraint : ra/rb =/= scratch
1692   //         normal case
1693   //
1694   // input : ra: dividend
1695   //         rb: divisor
1696   //
1697   // result: either
1698   //         quotient  (= ra idiv rb)
1699   //         remainder (= ra irem rb)
1700 
1701   assert(ra != scratch && rb != scratch, "reg cannot be scratch");
1702 
1703   int idivq_offset = offset();
1704   if (! want_remainder) {
1705     sdiv(result, ra, rb);
1706   } else {
1707     sdiv(scratch, ra, rb);
1708     Assembler::msub(result, scratch, rb, ra);
1709   }
1710 
1711   return idivq_offset;
1712 }
1713 
1714 // MacroAssembler routines found actually to be needed
1715 
1716 void MacroAssembler::push(Register src)
1717 {
1718   str(src, Address(pre(esp, -1 * wordSize)));
1719 }
1720 
1721 void MacroAssembler::pop(Register dst)
1722 {
1723   ldr(dst, Address(post(esp, 1 * wordSize)));
1724 }
1725 
1726 // Note: load_unsigned_short used to be called load_unsigned_word.
1727 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1728   int off = offset();
1729   ldrh(dst, src);
1730   return off;
1731 }
1732 
1733 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1734   int off = offset();
1735   ldrb(dst, src);
1736   return off;
1737 }
1738 
1739 int MacroAssembler::load_signed_short(Register dst, Address src) {
1740   int off = offset();
1741   ldrsh(dst, src);
1742   return off;
1743 }
1744 
1745 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1746   int off = offset();
1747   ldrsb(dst, src);
1748   return off;
1749 }
1750 
1751 int MacroAssembler::load_signed_short32(Register dst, Address src) {
1752   int off = offset();
1753   ldrshw(dst, src);
1754   return off;
1755 }
1756 
1757 int MacroAssembler::load_signed_byte32(Register dst, Address src) {
1758   int off = offset();
1759   ldrsbw(dst, src);
1760   return off;
1761 }
1762 
1763 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1764   switch (size_in_bytes) {
1765   case  8:  ldr(dst, src); break;
1766   case  4:  ldrw(dst, src); break;
1767   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1768   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1769   default:  ShouldNotReachHere();
1770   }
1771 }
1772 
1773 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1774   switch (size_in_bytes) {
1775   case  8:  str(src, dst); break;
1776   case  4:  strw(src, dst); break;
1777   case  2:  strh(src, dst); break;
1778   case  1:  strb(src, dst); break;
1779   default:  ShouldNotReachHere();
1780   }
1781 }
1782 
1783 void MacroAssembler::decrementw(Register reg, int value)
1784 {
1785   if (value < 0)  { incrementw(reg, -value);      return; }
1786   if (value == 0) {                               return; }
1787   if (value < (1 << 12)) { subw(reg, reg, value); return; }
1788   /* else */ {
1789     guarantee(reg != rscratch2, "invalid dst for register decrement");
1790     movw(rscratch2, (unsigned)value);
1791     subw(reg, reg, rscratch2);
1792   }
1793 }
1794 
1795 void MacroAssembler::decrement(Register reg, int value)
1796 {
1797   if (value < 0)  { increment(reg, -value);      return; }
1798   if (value == 0) {                              return; }
1799   if (value < (1 << 12)) { sub(reg, reg, value); return; }
1800   /* else */ {
1801     assert(reg != rscratch2, "invalid dst for register decrement");
1802     mov(rscratch2, (unsigned long)value);
1803     sub(reg, reg, rscratch2);
1804   }
1805 }
1806 
1807 void MacroAssembler::decrementw(Address dst, int value)
1808 {
1809   assert(!dst.uses(rscratch1), "invalid dst for address decrement");
1810   ldrw(rscratch1, dst);
1811   decrementw(rscratch1, value);
1812   strw(rscratch1, dst);
1813 }
1814 
1815 void MacroAssembler::decrement(Address dst, int value)
1816 {
1817   assert(!dst.uses(rscratch1), "invalid address for decrement");
1818   ldr(rscratch1, dst);
1819   decrement(rscratch1, value);
1820   str(rscratch1, dst);
1821 }
1822 
1823 void MacroAssembler::incrementw(Register reg, int value)
1824 {
1825   if (value < 0)  { decrementw(reg, -value);      return; }
1826   if (value == 0) {                               return; }
1827   if (value < (1 << 12)) { addw(reg, reg, value); return; }
1828   /* else */ {
1829     assert(reg != rscratch2, "invalid dst for register increment");
1830     movw(rscratch2, (unsigned)value);
1831     addw(reg, reg, rscratch2);
1832   }
1833 }
1834 
1835 void MacroAssembler::increment(Register reg, int value)
1836 {
1837   if (value < 0)  { decrement(reg, -value);      return; }
1838   if (value == 0) {                              return; }
1839   if (value < (1 << 12)) { add(reg, reg, value); return; }
1840   /* else */ {
1841     assert(reg != rscratch2, "invalid dst for register increment");
1842     movw(rscratch2, (unsigned)value);
1843     add(reg, reg, rscratch2);
1844   }
1845 }
1846 
1847 void MacroAssembler::incrementw(Address dst, int value)
1848 {
1849   assert(!dst.uses(rscratch1), "invalid dst for address increment");
1850   ldrw(rscratch1, dst);
1851   incrementw(rscratch1, value);
1852   strw(rscratch1, dst);
1853 }
1854 
1855 void MacroAssembler::increment(Address dst, int value)
1856 {
1857   assert(!dst.uses(rscratch1), "invalid dst for address increment");
1858   ldr(rscratch1, dst);
1859   increment(rscratch1, value);
1860   str(rscratch1, dst);
1861 }
1862 
1863 
1864 void MacroAssembler::pusha() {
1865   push(0x7fffffff, sp);
1866 }
1867 
1868 void MacroAssembler::popa() {
1869   pop(0x7fffffff, sp);
1870 }
1871 
1872 // Push lots of registers in the bit set supplied.  Don't push sp.
1873 // Return the number of words pushed
1874 int MacroAssembler::push(unsigned int bitset, Register stack) {
1875   int words_pushed = 0;
1876 
1877   // Scan bitset to accumulate register pairs
1878   unsigned char regs[32];
1879   int count = 0;
1880   for (int reg = 0; reg <= 30; reg++) {
1881     if (1 & bitset)
1882       regs[count++] = reg;
1883     bitset >>= 1;
1884   }
1885   regs[count++] = zr->encoding_nocheck();
1886   count &= ~1;  // Only push an even number of regs
1887 
1888   if (count) {
1889     stp(as_Register(regs[0]), as_Register(regs[1]),
1890        Address(pre(stack, -count * wordSize)));
1891     words_pushed += 2;
1892   }
1893   for (int i = 2; i < count; i += 2) {
1894     stp(as_Register(regs[i]), as_Register(regs[i+1]),
1895        Address(stack, i * wordSize));
1896     words_pushed += 2;
1897   }
1898 
1899   assert(words_pushed == count, "oops, pushed != count");
1900 
1901   return count;
1902 }
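
     // Example (illustrative): push(0b1110, sp) collects r1, r2, r3, pads
     // with zr to an even count and emits
     //   stp r1, r2, [sp, #-32]!
     //   stp r3, xzr, [sp, #16]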
1903 
1904 int MacroAssembler::pop(unsigned int bitset, Register stack) {
1905   int words_pushed = 0;
1906 
1907   // Scan bitset to accumulate register pairs
1908   unsigned char regs[32];
1909   int count = 0;
1910   for (int reg = 0; reg <= 30; reg++) {
1911     if (1 & bitset)
1912       regs[count++] = reg;
1913     bitset >>= 1;
1914   }
1915   regs[count++] = zr->encoding_nocheck();
1916   count &= ~1;
1917 
1918   for (int i = 2; i < count; i += 2) {
1919     ldp(as_Register(regs[i]), as_Register(regs[i+1]),
1920        Address(stack, i * wordSize));
1921     words_pushed += 2;
1922   }
1923   if (count) {
1924     ldp(as_Register(regs[0]), as_Register(regs[1]),
1925        Address(post(stack, count * wordSize)));
1926     words_pushed += 2;
1927   }
1928 
1929   assert(words_pushed == count, "oops, pushed != count");
1930 
1931   return count;
1932 }
1933 #ifdef ASSERT
1934 void MacroAssembler::verify_heapbase(const char* msg) {
1935 #if 0
1936   assert (UseCompressedOops || UseCompressedClassPointers, "should be compressed");
1937   assert (Universe::heap() != NULL, "java heap should be initialized");
1938   if (CheckCompressedOops) {
1939     Label ok;
1940     push(1 << rscratch1->encoding(), sp); // cmpptr trashes rscratch1
1941     cmpptr(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
1942     br(Assembler::EQ, ok);
1943     stop(msg);
1944     bind(ok);
1945     pop(1 << rscratch1->encoding(), sp);
1946   }
1947 #endif
1948 }
1949 #endif
1950 
1951 void MacroAssembler::stop(const char* msg) {
1952   address ip = pc();
1953   pusha();
1954   mov(c_rarg0, (address)msg);
1955   mov(c_rarg1, (address)ip);
1956   mov(c_rarg2, sp);
1957   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
1958   // call(c_rarg3);
1959   blrt(c_rarg3, 3, 0, 1);
1960   hlt(0);
1961 }
1962 
1963 // If a constant does not fit in an immediate field, generate some
1964 // number of MOV instructions and then perform the operation.
1965 void MacroAssembler::wrap_add_sub_imm_insn(Register Rd, Register Rn, unsigned imm,
1966                                            add_sub_imm_insn insn1,
1967                                            add_sub_reg_insn insn2) {
1968   assert(Rd != zr, "Rd = zr and not setting flags?");
1969   if (operand_valid_for_add_sub_immediate((int)imm)) {
1970     (this->*insn1)(Rd, Rn, imm);
1971   } else {
1972     if (uabs(imm) < (1 << 24)) {
1973        (this->*insn1)(Rd, Rn, imm & -(1 << 12));
1974        (this->*insn1)(Rd, Rd, imm & ((1 << 12)-1));
1975     } else {
1976        assert_different_registers(Rd, Rn);
1977        mov(Rd, (uint64_t)imm);
1978        (this->*insn2)(Rd, Rn, Rd, LSL, 0);
1979     }
1980   }
1981 }
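
     // Example (illustrative): an immediate of 0x123456 fits in 24 bits but
     // not in one immediate field, so an add is split into
     //   add Rd, Rn, #0x123000
     //   add Rd, Rd, #0x456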
1982 
1983 // Separate version which sets the flags. Optimisations are more restricted
1984 // because we must set the flags correctly.
1985 void MacroAssembler::wrap_adds_subs_imm_insn(Register Rd, Register Rn, unsigned imm,
1986                                            add_sub_imm_insn insn1,
1987                                            add_sub_reg_insn insn2) {
1988   if (operand_valid_for_add_sub_immediate((int)imm)) {
1989     (this->*insn1)(Rd, Rn, imm);
1990   } else {
1991     assert_different_registers(Rd, Rn);
1992     assert(Rd != zr, "overflow in immediate operand");
1993     mov(Rd, (uint64_t)imm);
1994     (this->*insn2)(Rd, Rn, Rd, LSL, 0);
1995   }
1996 }
1997 
1998 
1999 void MacroAssembler::add(Register Rd, Register Rn, RegisterOrConstant increment) {
2000   if (increment.is_register()) {
2001     add(Rd, Rn, increment.as_register());
2002   } else {
2003     add(Rd, Rn, increment.as_constant());
2004   }
2005 }
2006 
2007 void MacroAssembler::addw(Register Rd, Register Rn, RegisterOrConstant increment) {
2008   if (increment.is_register()) {
2009     addw(Rd, Rn, increment.as_register());
2010   } else {
2011     addw(Rd, Rn, increment.as_constant());
2012   }
2013 }
2014 
2015 void MacroAssembler::sub(Register Rd, Register Rn, RegisterOrConstant decrement) {
2016   if (decrement.is_register()) {
2017     sub(Rd, Rn, decrement.as_register());
2018   } else {
2019     sub(Rd, Rn, decrement.as_constant());
2020   }
2021 }
2022 
2023 void MacroAssembler::reinit_heapbase()
2024 {
2025   if (UseCompressedOops) {
2026     if (Universe::is_fully_initialized()) {
2027       mov(rheapbase, Universe::narrow_ptrs_base());
2028     } else {
2029       lea(rheapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
2030       ldr(rheapbase, Address(rheapbase));
2031     }
2032   }
2033 }
2034 
2035 // this simulates the behaviour of the x86 cmpxchg instruction using a
2036 // load linked/store conditional pair. we use the acquire/release
2037 // versions of these instructions so that we flush pending writes as
2038 // per Java semantics.
2039 
2040 // n.b. the x86 version assumes the old value to be compared against is
2041 // in rax and updates rax with the value located in memory if the
2042 // cmpxchg fails. we supply a register for the old value explicitly
2043 
2044 // the aarch64 load linked/store conditional instructions do not
2045 // accept an offset. so, unlike x86, we must provide a plain register
2046 // to identify the memory word to be compared/exchanged rather than a
2047 // register+offset Address.
2048 
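     // in outline the sequence below behaves like this sketch (pseudocode
     // only; store_rel_exclusive stands in for stlxr and returns 0 on
     // success):
     //   retry:
     //     tmp = load_acq_exclusive(addr);              // ldaxr
     //     if (tmp != oldv) { oldv = tmp; goto fail; }  // compare failed
     //     if (store_rel_exclusive(addr, newv) != 0)
     //       goto retry;                                // lost exclusivity
     //     goto succeed;
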
2049 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
2050                                 Label &succeed, Label *fail) {
2051   // oldv holds comparison value
2052   // newv holds value to write in exchange
2053   // addr identifies memory word to compare against/update
2054   // tmp returns 0/1 for success/failure
2055   Label retry_load, nope;
2056 
2057   bind(retry_load);
2058   // flush and load exclusive from the memory location
2059   // and fail if it is not what we expect
2060   ldaxr(tmp, addr);
2061   cmp(tmp, oldv);
2062   br(Assembler::NE, nope);
2063   // if we store+flush with no intervening write tmp will be zero
2064   stlxr(tmp, newv, addr);
2065   cbzw(tmp, succeed);
2066   // retry so we only ever return after a load fails to compare
2067   // ensures we don't return a stale value after a failed write.
2068   b(retry_load);
2069   // if the memory word differs we return it in oldv and signal a fail
2070   bind(nope);
2071   membar(AnyAny);
2072   mov(oldv, tmp);
2073   if (fail)
2074     b(*fail);
2075 }
2076 
2077 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
2078                                 Label &succeed, Label *fail) {
2079   // oldv holds comparison value
2080   // newv holds value to write in exchange
2081   // addr identifies memory word to compare against/update
2082   // tmp returns 0/1 for success/failure
2083   Label retry_load, nope;
2084 
2085   bind(retry_load);
2086   // flush and load exclusive from the memory location
2087   // and fail if it is not what we expect
2088   ldaxrw(tmp, addr);
2089   cmp(tmp, oldv);
2090   br(Assembler::NE, nope);
2091   // if we store+flush with no intervening write tmp will be zero
2092   stlxrw(tmp, newv, addr);
2093   cbzw(tmp, succeed);
2094   // retry so we only ever return after a load fails to compare
2095   // ensures we don't return a stale value after a failed write.
2096   b(retry_load);
2097   // if the memory word differs we return it in oldv and signal a fail
2098   bind(nope);
2099   membar(AnyAny);
2100   mov(oldv, tmp);
2101   if (fail)
2102     b(*fail);
2103 }
2104 
2105 static bool different(Register a, RegisterOrConstant b, Register c) {
2106   if (b.is_constant())
2107     return a != c;
2108   else
2109     return a != b.as_register() && a != c && b.as_register() != c;
2110 }
2111 
2112 #define ATOMIC_OP(LDXR, OP, STXR)                                       \
2113 void MacroAssembler::atomic_##OP(Register prev, RegisterOrConstant incr, Register addr) { \
2114   Register result = rscratch2;                                          \
2115   if (prev->is_valid())                                                 \
2116     result = different(prev, incr, addr) ? prev : rscratch2;            \
2117                                                                         \
2118   Label retry_load;                                                     \
2119   bind(retry_load);                                                     \
2120   LDXR(result, addr);                                                   \
2121   OP(rscratch1, result, incr);                                          \
2122   STXR(rscratch1, rscratch1, addr);                                     \
2123   cbnzw(rscratch1, retry_load);                                         \
2124   if (prev->is_valid() && prev != result)                               \
2125     mov(prev, result);                                                  \
2126 }
2127 
2128 ATOMIC_OP(ldxr, add, stxr)
2129 ATOMIC_OP(ldxrw, addw, stxrw)
2130 
2131 #undef ATOMIC_OP
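
     // Illustrative use of the expansions above:
     //   atomic_add(r0, 1, r2);      // atomically *r2 += 1, old value -> r0
     //   atomic_addw(noreg, 1, r2);  // 32-bit variant, old value discarded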
2132 
2133 #define ATOMIC_XCHG(OP, LDXR, STXR)                                     \
2134 void MacroAssembler::atomic_##OP(Register prev, Register newv, Register addr) { \
2135   Register result = rscratch2;                                          \
2136   if (prev->is_valid())                                                 \
2137     result = different(prev, newv, addr) ? prev : rscratch2;            \
2138                                                                         \
2139   Label retry_load;                                                     \
2140   bind(retry_load);                                                     \
2141   LDXR(result, addr);                                                   \
2142   STXR(rscratch1, newv, addr);                                          \
2143   cbnzw(rscratch1, retry_load);                                         \
2144   if (prev->is_valid() && prev != result)                               \
2145     mov(prev, result);                                                  \
2146 }
2147 
2148 ATOMIC_XCHG(xchg, ldxr, stxr)
2149 ATOMIC_XCHG(xchgw, ldxrw, stxrw)
2150 
2151 #undef ATOMIC_XCHG
2152 
2153 void MacroAssembler::incr_allocated_bytes(Register thread,
2154                                           Register var_size_in_bytes,
2155                                           int con_size_in_bytes,
2156                                           Register t1) {
2157   if (!thread->is_valid()) {
2158     thread = rthread;
2159   }
2160   assert(t1->is_valid(), "need temp reg");
2161 
2162   ldr(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
2163   if (var_size_in_bytes->is_valid()) {
2164     add(t1, t1, var_size_in_bytes);
2165   } else {
2166     add(t1, t1, con_size_in_bytes);
2167   }
2168   str(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
2169 }
2170 
2171 #ifndef PRODUCT
2172 extern "C" void findpc(intptr_t x);
2173 #endif
2174 
2175 void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[])
2176 {
2177   // In order to get locks to work, we need to fake an in_VM state
2178   if (ShowMessageBoxOnError) {
2179     JavaThread* thread = JavaThread::current();
2180     JavaThreadState saved_state = thread->thread_state();
2181     thread->set_thread_state(_thread_in_vm);
2182 #ifndef PRODUCT
2183     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
2184       ttyLocker ttyl;
2185       BytecodeCounter::print();
2186     }
2187 #endif
2188     if (os::message_box(msg, "Execution stopped, print registers?")) {
2189       ttyLocker ttyl;
2190       tty->print_cr(" pc = 0x%016lx", pc);
2191 #ifndef PRODUCT
2192       tty->cr();
2193       findpc(pc);
2194       tty->cr();
2195 #endif
2196       tty->print_cr(" r0 = 0x%016lx", regs[0]);
2197       tty->print_cr(" r1 = 0x%016lx", regs[1]);
2198       tty->print_cr(" r2 = 0x%016lx", regs[2]);
2199       tty->print_cr(" r3 = 0x%016lx", regs[3]);
2200       tty->print_cr(" r4 = 0x%016lx", regs[4]);
2201       tty->print_cr(" r5 = 0x%016lx", regs[5]);
2202       tty->print_cr(" r6 = 0x%016lx", regs[6]);
2203       tty->print_cr(" r7 = 0x%016lx", regs[7]);
2204       tty->print_cr(" r8 = 0x%016lx", regs[8]);
2205       tty->print_cr(" r9 = 0x%016lx", regs[9]);
2206       tty->print_cr("r10 = 0x%016lx", regs[10]);
2207       tty->print_cr("r11 = 0x%016lx", regs[11]);
2208       tty->print_cr("r12 = 0x%016lx", regs[12]);
2209       tty->print_cr("r13 = 0x%016lx", regs[13]);
2210       tty->print_cr("r14 = 0x%016lx", regs[14]);
2211       tty->print_cr("r15 = 0x%016lx", regs[15]);
2212       tty->print_cr("r16 = 0x%016lx", regs[16]);
2213       tty->print_cr("r17 = 0x%016lx", regs[17]);
2214       tty->print_cr("r18 = 0x%016lx", regs[18]);
2215       tty->print_cr("r19 = 0x%016lx", regs[19]);
2216       tty->print_cr("r20 = 0x%016lx", regs[20]);
2217       tty->print_cr("r21 = 0x%016lx", regs[21]);
2218       tty->print_cr("r22 = 0x%016lx", regs[22]);
2219       tty->print_cr("r23 = 0x%016lx", regs[23]);
2220       tty->print_cr("r24 = 0x%016lx", regs[24]);
2221       tty->print_cr("r25 = 0x%016lx", regs[25]);
2222       tty->print_cr("r26 = 0x%016lx", regs[26]);
2223       tty->print_cr("r27 = 0x%016lx", regs[27]);
2224       tty->print_cr("r28 = 0x%016lx", regs[28]);
2225       tty->print_cr("r30 = 0x%016lx", regs[30]);
2226       tty->print_cr("r31 = 0x%016lx", regs[31]);
2227       BREAKPOINT;
2228     }
2229     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
2230   } else {
2231     ttyLocker ttyl;
2232     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
2233                     msg);
2234     assert(false, err_msg("DEBUG MESSAGE: %s", msg));
2235   }
2236 }
2237 
2238 #ifdef BUILTIN_SIM
2239 // routine to generate an x86 prolog for a stub function which
2240 // bootstraps into the generated ARM code which directly follows the
2241 // stub
2242 //
2243 // the argument encodes the number of general and fp registers
2244 // passed by the caller and the callng convention (currently just
2245 // the number of general registers and assumes C argument passing)
2246 
2247 extern "C" {
2248 int aarch64_stub_prolog_size();
2249 void aarch64_stub_prolog();
2250 void aarch64_prolog();
2251 }
2252 
2253 void MacroAssembler::c_stub_prolog(int gp_arg_count, int fp_arg_count, int ret_type,
2254                                    address *prolog_ptr)
2255 {
2256   int calltype = (((ret_type & 0x3) << 8) |
2257                   ((fp_arg_count & 0xf) << 4) |
2258                   (gp_arg_count & 0xf));
2259 
2260   // the addresses for the x86 to ARM entry code we need to use
2261   address start = pc();
2262   // printf("start = %lx\n", start);
2263   int byteCount =  aarch64_stub_prolog_size();
2264   // printf("byteCount = %x\n", byteCount);
2265   int instructionCount = (byteCount + 3)/ 4;
2266   // printf("instructionCount = %x\n", instructionCount);
2267   for (int i = 0; i < instructionCount; i++) {
2268     nop();
2269   }
2270 
2271   memcpy(start, (void*)aarch64_stub_prolog, byteCount);
2272 
2273   // write the address of the setup routine and the call format at the
2274   // end of the copied code
2275   u_int64_t *patch_end = (u_int64_t *)(start + byteCount);
2276   if (prolog_ptr)
2277     patch_end[-2] = (u_int64_t)prolog_ptr;
2278   patch_end[-1] = calltype;
2279 }
2280 #endif
2281 
2282 void MacroAssembler::push_CPU_state() {
2283     push(0x3fffffff, sp);         // integer registers except lr & sp
2284 
2285     for (int i = 30; i >= 0; i -= 2)
2286       stpd(as_FloatRegister(i), as_FloatRegister(i+1),
2287            Address(pre(sp, -2 * wordSize)));
2288 }
2289 
2290 void MacroAssembler::pop_CPU_state() {
2291   for (int i = 0; i < 32; i += 2)
2292     ldpd(as_FloatRegister(i), as_FloatRegister(i+1),
2293          Address(post(sp, 2 * wordSize)));
2294 
2295   pop(0x3fffffff, sp);         // integer registers except lr & sp
2296 }
2297 
2298 /**
2299  * Helpers for multiply_to_len().
2300  */
2301 void MacroAssembler::add2_with_carry(Register final_dest_hi, Register dest_hi, Register dest_lo,
2302                                      Register src1, Register src2) {
2303   adds(dest_lo, dest_lo, src1);
2304   adc(dest_hi, dest_hi, zr);
2305   adds(dest_lo, dest_lo, src2);
2306   adc(final_dest_hi, dest_hi, zr);
2307 }
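
     // i.e. the 128-bit sum
     //   final_dest_hi:dest_lo = dest_hi:dest_lo + src1 + src2
     // with src1 and src2 zero-extended to 128 bits.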
2308 
2309 // Generate an address from (r + r1 extend offset).  "size" is the
2310 // size of the operand.  The result may be in rscratch2.
2311 Address MacroAssembler::offsetted_address(Register r, Register r1,
2312                                           Address::extend ext, int offset, int size) {
2313   if (offset || (ext.shift() % size != 0)) {
2314     lea(rscratch2, Address(r, r1, ext));
2315     return Address(rscratch2, offset);
2316   } else {
2317     return Address(r, r1, ext);
2318   }
2319 }
2320 
2321 Address MacroAssembler::spill_address(int size, int offset, Register tmp)
2322 {
2323   assert(offset >= 0, "spill to negative address?");
2324   // Offset reachable ?
2325   //   Not aligned - 9 bits signed offset
2326   //   Aligned - 12 bits unsigned offset shifted
2327   Register base = sp;
2328   if ((offset & (size-1)) && offset >= (1<<8)) {
2329     add(tmp, base, offset & ((1<<12)-1));
2330     base = tmp;
2331     offset &= -1<<12;
2332   }
2333 
2334   if (offset >= (1<<12) * size) {
2335     add(tmp, base, offset & (((1<<12)-1)<<12));
2336     base = tmp;
2337     offset &= ~(((1<<12)-1)<<12);
2338   }
2339 
2340   return Address(base, offset);
2341 }
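
     // Example (illustrative): size == 8, offset == 0x1234 is misaligned
     // (0x1234 & 7 == 4), so the first step emits add(tmp, sp, 0x234) and
     // Address(tmp, 0x1000) is returned: tmp + 0x1000 == sp + 0x1234.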
2342 
2343 /**
2344  * Multiply 64 bit by 64 bit first loop.
2345  */
2346 void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
2347                                            Register y, Register y_idx, Register z,
2348                                            Register carry, Register product,
2349                                            Register idx, Register kdx) {
2350   //
2351   //  jlong carry, x[], y[], z[];
2352   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2353   //    huge_128 product = y[idx] * x[xstart] + carry;
2354   //    z[kdx] = (jlong)product;
2355   //    carry  = (jlong)(product >>> 64);
2356   //  }
2357   //  z[xstart] = carry;
2358   //
2359 
2360   Label L_first_loop, L_first_loop_exit;
2361   Label L_one_x, L_one_y, L_multiply;
2362 
2363   subsw(xstart, xstart, 1);
2364   br(Assembler::MI, L_one_x);
2365 
2366   lea(rscratch1, Address(x, xstart, Address::lsl(LogBytesPerInt)));
2367   ldr(x_xstart, Address(rscratch1));
2368   ror(x_xstart, x_xstart, 32); // convert big-endian to little-endian
2369 
2370   bind(L_first_loop);
2371   subsw(idx, idx, 1);
2372   br(Assembler::MI, L_first_loop_exit);
2373   subsw(idx, idx, 1);
2374   br(Assembler::MI, L_one_y);
2375   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2376   ldr(y_idx, Address(rscratch1));
2377   ror(y_idx, y_idx, 32); // convert big-endian to little-endian
2378   bind(L_multiply);
2379 
2380   // AArch64 has a multiply-accumulate instruction that we can't use
2381   // here because it has no way to process carries, so we have to use
2382   // separate add and adc instructions.  Bah.
2383   umulh(rscratch1, x_xstart, y_idx); // x_xstart * y_idx -> rscratch1:product
2384   mul(product, x_xstart, y_idx);
2385   adds(product, product, carry);
2386   adc(carry, rscratch1, zr);   // x_xstart * y_idx + carry -> carry:product
2387 
2388   subw(kdx, kdx, 2);
2389   ror(product, product, 32); // back to big-endian
2390   str(product, offsetted_address(z, kdx, Address::uxtw(LogBytesPerInt), 0, BytesPerLong));
2391 
2392   b(L_first_loop);
2393 
2394   bind(L_one_y);
2395   ldrw(y_idx, Address(y,  0));
2396   b(L_multiply);
2397 
2398   bind(L_one_x);
2399   ldrw(x_xstart, Address(x,  0));
2400   b(L_first_loop);
2401 
2402   bind(L_first_loop_exit);
2403 }
2404 
2405 /**
2406  * Multiply 128 bit by 128 bit. Unrolled inner loop.
2407  *
2408  */
2409 void MacroAssembler::multiply_128_x_128_loop(Register y, Register z,
2410                                              Register carry, Register carry2,
2411                                              Register idx, Register jdx,
2412                                              Register yz_idx1, Register yz_idx2,
2413                                              Register tmp, Register tmp3, Register tmp4,
2414                                              Register tmp6, Register product_hi) {
2415 
2416   //   jlong carry, x[], y[], z[];
2417   //   int kdx = ystart+1;
2418   //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
2419   //     huge_128 tmp3 = (y[idx+1] * product_hi) + z[kdx+idx+1] + carry;
2420   //     jlong carry2  = (jlong)(tmp3 >>> 64);
2421   //     huge_128 tmp4 = (y[idx]   * product_hi) + z[kdx+idx] + carry2;
2422   //     carry  = (jlong)(tmp4 >>> 64);
2423   //     z[kdx+idx+1] = (jlong)tmp3;
2424   //     z[kdx+idx] = (jlong)tmp4;
2425   //   }
2426   //   idx += 2;
2427   //   if (idx > 0) {
2428   //     yz_idx1 = (y[idx] * product_hi) + z[kdx+idx] + carry;
2429   //     z[kdx+idx] = (jlong)yz_idx1;
2430   //     carry  = (jlong)(yz_idx1 >>> 64);
2431   //   }
2432   //
2433 
2434   Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;
2435 
2436   lsrw(jdx, idx, 2);
2437 
2438   bind(L_third_loop);
2439 
2440   subsw(jdx, jdx, 1);
2441   br(Assembler::MI, L_third_loop_exit);
2442   subw(idx, idx, 4);
2443 
2444   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2445 
2446   ldp(yz_idx2, yz_idx1, Address(rscratch1, 0));
2447 
2448   lea(tmp6, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2449 
2450   ror(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
2451   ror(yz_idx2, yz_idx2, 32);
2452 
2453   ldp(rscratch2, rscratch1, Address(tmp6, 0));
2454 
2455   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2456   umulh(tmp4, product_hi, yz_idx1);
2457 
2458   ror(rscratch1, rscratch1, 32); // convert big-endian to little-endian
2459   ror(rscratch2, rscratch2, 32);
2460 
2461   mul(tmp, product_hi, yz_idx2);   //  yz_idx2 * product_hi -> carry2:tmp
2462   umulh(carry2, product_hi, yz_idx2);
2463 
2464   // propagate sum of both multiplications into carry:tmp4:tmp3
2465   adds(tmp3, tmp3, carry);
2466   adc(tmp4, tmp4, zr);
2467   adds(tmp3, tmp3, rscratch1);
2468   adcs(tmp4, tmp4, tmp);
2469   adc(carry, carry2, zr);
2470   adds(tmp4, tmp4, rscratch2);
2471   adc(carry, carry, zr);
2472 
2473   ror(tmp3, tmp3, 32); // convert little-endian to big-endian
2474   ror(tmp4, tmp4, 32);
2475   stp(tmp4, tmp3, Address(tmp6, 0));
2476 
2477   b(L_third_loop);
2478   bind (L_third_loop_exit);
2479 
2480   andw (idx, idx, 0x3);
2481   cbz(idx, L_post_third_loop_done);
2482 
2483   Label L_check_1;
2484   subsw(idx, idx, 2);
2485   br(Assembler::MI, L_check_1);
2486 
2487   lea(rscratch1, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2488   ldr(yz_idx1, Address(rscratch1, 0));
2489   ror(yz_idx1, yz_idx1, 32);
2490   mul(tmp3, product_hi, yz_idx1);  //  yz_idx1 * product_hi -> tmp4:tmp3
2491   umulh(tmp4, product_hi, yz_idx1);
2492   lea(rscratch1, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2493   ldr(yz_idx2, Address(rscratch1, 0));
2494   ror(yz_idx2, yz_idx2, 32);
2495 
2496   add2_with_carry(carry, tmp4, tmp3, carry, yz_idx2);
2497 
2498   ror(tmp3, tmp3, 32);
2499   str(tmp3, Address(rscratch1, 0));
2500 
2501   bind (L_check_1);
2502 
2503   andw (idx, idx, 0x1);
2504   subsw(idx, idx, 1);
2505   br(Assembler::MI, L_post_third_loop_done);
2506   ldrw(tmp4, Address(y, idx, Address::uxtw(LogBytesPerInt)));
2507   mul(tmp3, tmp4, product_hi);  //  tmp4 * product_hi -> carry2:tmp3
2508   umulh(carry2, tmp4, product_hi);
2509   ldrw(tmp4, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2510 
2511   add2_with_carry(carry2, carry2, tmp3, tmp4, carry);
2512 
2513   strw(tmp3, Address(z, idx, Address::uxtw(LogBytesPerInt)));
2514   extr(carry, carry2, tmp3, 32);
2515 
2516   bind(L_post_third_loop_done);
2517 }
2518 
2519 /**
2520  * Code for BigInteger::multiplyToLen() intrinsic.
2521  *
2522  * r0: x
2523  * r1: xlen
2524  * r2: y
2525  * r3: ylen
2526  * r4:  z
2527  * r5: zlen
2528  * r10: tmp1
2529  * r11: tmp2
2530  * r12: tmp3
2531  * r13: tmp4
2532  * r14: tmp5
2533  * r15: tmp6
2534  * r16: tmp7
2535  *
2536  */
2537 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
2538                                      Register z, Register zlen,
2539                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
2540                                      Register tmp5, Register tmp6, Register product_hi) {
2541 
2542   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2543 
2544   const Register idx = tmp1;
2545   const Register kdx = tmp2;
2546   const Register xstart = tmp3;
2547 
2548   const Register y_idx = tmp4;
2549   const Register carry = tmp5;
2550   const Register product  = xlen;
2551   const Register x_xstart = zlen;  // reuse register
2552 
2553   // First Loop.
2554   //
2555   //  final static long LONG_MASK = 0xffffffffL;
2556   //  int xstart = xlen - 1;
2557   //  int ystart = ylen - 1;
2558   //  long carry = 0;
2559   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2560   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
2561   //    z[kdx] = (int)product;
2562   //    carry = product >>> 32;
2563   //  }
2564   //  z[xstart] = (int)carry;
2565   //
2566 
2567   movw(idx, ylen);      // idx = ylen;
2568   movw(kdx, zlen);      // kdx = xlen+ylen;
2569   mov(carry, zr);       // carry = 0;
2570 
2571   Label L_done;
2572 
2573   movw(xstart, xlen);
2574   subsw(xstart, xstart, 1);
2575   br(Assembler::MI, L_done);
2576 
2577   multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);
2578 
2579   Label L_second_loop;
2580   cbzw(kdx, L_second_loop);
2581 
2582   Label L_carry;
2583   subw(kdx, kdx, 1);
2584   cbzw(kdx, L_carry);
2585 
2586   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
2587   lsr(carry, carry, 32);
2588   subw(kdx, kdx, 1);
2589 
2590   bind(L_carry);
2591   strw(carry, Address(z, kdx, Address::uxtw(LogBytesPerInt)));
2592 
2593   // Second and third (nested) loops.
2594   //
2595   // for (int i = xstart-1; i >= 0; i--) { // Second loop
2596   //   carry = 0;
2597   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
2598   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
2599   //                    (z[k] & LONG_MASK) + carry;
2600   //     z[k] = (int)product;
2601   //     carry = product >>> 32;
2602   //   }
2603   //   z[i] = (int)carry;
2604   // }
2605   //
2606   // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = product_hi
2607 
2608   const Register jdx = tmp1;
2609 
2610   bind(L_second_loop);
2611   mov(carry, zr);                // carry = 0;
2612   movw(jdx, ylen);               // j = ystart+1
2613 
2614   subsw(xstart, xstart, 1);      // i = xstart-1;
2615   br(Assembler::MI, L_done);
2616 
2617   str(z, Address(pre(sp, -4 * wordSize)));
2618 
2619   Label L_last_x;
2620   lea(z, offsetted_address(z, xstart, Address::uxtw(LogBytesPerInt), 4, BytesPerInt)); // z = z + k - j
2621   subsw(xstart, xstart, 1);       // i = xstart-1;
2622   br(Assembler::MI, L_last_x);
2623 
2624   lea(rscratch1, Address(x, xstart, Address::uxtw(LogBytesPerInt)));
2625   ldr(product_hi, Address(rscratch1));
2626   ror(product_hi, product_hi, 32);  // convert big-endian to little-endian
2627 
2628   Label L_third_loop_prologue;
2629   bind(L_third_loop_prologue);
2630 
2631   str(ylen, Address(sp, wordSize));
2632   stp(x, xstart, Address(sp, 2 * wordSize));
2633   multiply_128_x_128_loop(y, z, carry, x, jdx, ylen, product,
2634                           tmp2, x_xstart, tmp3, tmp4, tmp6, product_hi);
2635   ldp(z, ylen, Address(post(sp, 2 * wordSize)));
2636   ldp(x, xlen, Address(post(sp, 2 * wordSize)));   // copy old xstart -> xlen
2637 
2638   addw(tmp3, xlen, 1);
2639   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
2640   subsw(tmp3, tmp3, 1);
2641   br(Assembler::MI, L_done);
2642 
2643   lsr(carry, carry, 32);
2644   strw(carry, Address(z, tmp3, Address::uxtw(LogBytesPerInt)));
2645   b(L_second_loop);
2646 
2647   // Next infrequent code is moved outside loops.
2648   bind(L_last_x);
2649   ldrw(product_hi, Address(x,  0));
2650   b(L_third_loop_prologue);
2651 
2652   bind(L_done);
2653 }
2654 
2655 /**
2656  * Emits code to update CRC-32 with a byte value according to constants in table
2657  *
2658  * @param [in,out]crc   Register containing the crc.
2659  * @param [in]val       Register containing the byte to fold into the CRC.
2660  * @param [in]table     Register containing the table of crc constants.
2661  *
2662  * uint32_t crc;
2663  * val = crc_table[(val ^ crc) & 0xFF];
2664  * crc = val ^ (crc >> 8);
2665  *
2666  */
2667 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
2668   eor(val, val, crc);
2669   andr(val, val, 0xff);
2670   ldrw(val, Address(table, val, Address::lsl(2)));
2671   eor(crc, val, crc, Assembler::LSR, 8);
2672 }
2673 
2674 /**
2675  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
2676  *
2677  * @param [in,out]crc   Register containing the crc.
2678  * @param [in]v         Register containing the 32-bit value to fold into the CRC.
2679  * @param [in]table0    Register containing table 0 of crc constants.
2680  * @param [in]table1    Register containing table 1 of crc constants.
2681  * @param [in]table2    Register containing table 2 of crc constants.
2682  * @param [in]table3    Register containing table 3 of crc constants.
2683  *
2684  * uint32_t crc;
2685  *   v = crc ^ v
2686  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
2687  *
2688  */
2689 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
2690         Register table0, Register table1, Register table2, Register table3,
2691         bool upper) {
2692   eor(v, crc, v, upper ? LSR:LSL, upper ? 32:0);
2693   uxtb(tmp, v);
2694   ldrw(crc, Address(table3, tmp, Address::lsl(2)));
2695   ubfx(tmp, v, 8, 8);
2696   ldrw(tmp, Address(table2, tmp, Address::lsl(2)));
2697   eor(crc, crc, tmp);
2698   ubfx(tmp, v, 16, 8);
2699   ldrw(tmp, Address(table1, tmp, Address::lsl(2)));
2700   eor(crc, crc, tmp);
2701   ubfx(tmp, v, 24, 8);
2702   ldrw(tmp, Address(table0, tmp, Address::lsl(2)));
2703   eor(crc, crc, tmp);
2704 }
2705 
2706 /**
2707  * @param crc   register containing existing CRC (32-bit)
2708  * @param buf   register pointing to input byte buffer (byte*)
2709  * @param len   register containing number of bytes
2710  * @param table0..table3  registers that will contain the addresses of the CRC tables
2711  * @param tmp, tmp2, tmp3  scratch registers
2712  */
2713 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
2714         Register table0, Register table1, Register table2, Register table3,
2715         Register tmp, Register tmp2, Register tmp3) {
2716   Label L_by16, L_by16_loop, L_by4, L_by4_loop, L_by1, L_by1_loop, L_exit;
2717   unsigned long offset;
2718 
2719     ornw(crc, zr, crc);
2720 
2721   if (UseCRC32) {
2722     Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop;
2723 
2724       subs(len, len, 64);
2725       br(Assembler::GE, CRC_by64_loop);
2726       adds(len, len, 64-4);
2727       br(Assembler::GE, CRC_by4_loop);
2728       adds(len, len, 4);
2729       br(Assembler::GT, CRC_by1_loop);
2730       b(L_exit);
2731 
2732     BIND(CRC_by4_loop);
2733       ldrw(tmp, Address(post(buf, 4)));
2734       subs(len, len, 4);
2735       crc32w(crc, crc, tmp);
2736       br(Assembler::GE, CRC_by4_loop);
2737       adds(len, len, 4);
2738       br(Assembler::LE, L_exit);
2739     BIND(CRC_by1_loop);
2740       ldrb(tmp, Address(post(buf, 1)));
2741       subs(len, len, 1);
2742       crc32b(crc, crc, tmp);
2743       br(Assembler::GT, CRC_by1_loop);
2744       b(L_exit);
2745 
2746       align(CodeEntryAlignment);
2747     BIND(CRC_by64_loop);
2748       subs(len, len, 64);
2749       ldp(tmp, tmp3, Address(post(buf, 16)));
2750       crc32x(crc, crc, tmp);
2751       crc32x(crc, crc, tmp3);
2752       ldp(tmp, tmp3, Address(post(buf, 16)));
2753       crc32x(crc, crc, tmp);
2754       crc32x(crc, crc, tmp3);
2755       ldp(tmp, tmp3, Address(post(buf, 16)));
2756       crc32x(crc, crc, tmp);
2757       crc32x(crc, crc, tmp3);
2758       ldp(tmp, tmp3, Address(post(buf, 16)));
2759       crc32x(crc, crc, tmp);
2760       crc32x(crc, crc, tmp3);
2761       br(Assembler::GE, CRC_by64_loop);
2762       adds(len, len, 64-4);
2763       br(Assembler::GE, CRC_by4_loop);
2764       adds(len, len, 4);
2765       br(Assembler::GT, CRC_by1_loop);
2766     BIND(L_exit);
2767       ornw(crc, zr, crc);
2768       return;
2769   }
2770 
2771     adrp(table0, ExternalAddress(StubRoutines::crc_table_addr()), offset);
2772     if (offset) add(table0, table0, offset);
2773     add(table1, table0, 1*256*sizeof(juint));
2774     add(table2, table0, 2*256*sizeof(juint));
2775     add(table3, table0, 3*256*sizeof(juint));
2776 
2777   if (UseNeon) {
2778       cmp(len, 64);
2779       br(Assembler::LT, L_by16);
2780       eor(v16, T16B, v16, v16);
2781 
2782     Label L_fold;
2783 
2784       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
2785 
2786       ld1(v0, v1, T2D, post(buf, 32));
2787       ld1r(v4, T2D, post(tmp, 8));
2788       ld1r(v5, T2D, post(tmp, 8));
2789       ld1r(v6, T2D, post(tmp, 8));
2790       ld1r(v7, T2D, post(tmp, 8));
2791       mov(v16, T4S, 0, crc);
2792 
2793       eor(v0, T16B, v0, v16);
2794       sub(len, len, 64);
2795 
2796     BIND(L_fold);
2797       pmull(v22, T8H, v0, v5, T8B);
2798       pmull(v20, T8H, v0, v7, T8B);
2799       pmull(v23, T8H, v0, v4, T8B);
2800       pmull(v21, T8H, v0, v6, T8B);
2801 
2802       pmull2(v18, T8H, v0, v5, T16B);
2803       pmull2(v16, T8H, v0, v7, T16B);
2804       pmull2(v19, T8H, v0, v4, T16B);
2805       pmull2(v17, T8H, v0, v6, T16B);
2806 
2807       uzp1(v24, v20, v22, T8H);
2808       uzp2(v25, v20, v22, T8H);
2809       eor(v20, T16B, v24, v25);
2810 
2811       uzp1(v26, v16, v18, T8H);
2812       uzp2(v27, v16, v18, T8H);
2813       eor(v16, T16B, v26, v27);
2814 
2815       ushll2(v22, T4S, v20, T8H, 8);
2816       ushll(v20, T4S, v20, T4H, 8);
2817 
2818       ushll2(v18, T4S, v16, T8H, 8);
2819       ushll(v16, T4S, v16, T4H, 8);
2820 
2821       eor(v22, T16B, v23, v22);
2822       eor(v18, T16B, v19, v18);
2823       eor(v20, T16B, v21, v20);
2824       eor(v16, T16B, v17, v16);
2825 
2826       uzp1(v17, v16, v20, T2D);
2827       uzp2(v21, v16, v20, T2D);
2828       eor(v17, T16B, v17, v21);
2829 
2830       ushll2(v20, T2D, v17, T4S, 16);
2831       ushll(v16, T2D, v17, T2S, 16);
2832 
2833       eor(v20, T16B, v20, v22);
2834       eor(v16, T16B, v16, v18);
2835 
2836       uzp1(v17, v20, v16, T2D);
2837       uzp2(v21, v20, v16, T2D);
2838       eor(v28, T16B, v17, v21);
2839 
2840       pmull(v22, T8H, v1, v5, T8B);
2841       pmull(v20, T8H, v1, v7, T8B);
2842       pmull(v23, T8H, v1, v4, T8B);
2843       pmull(v21, T8H, v1, v6, T8B);
2844 
2845       pmull2(v18, T8H, v1, v5, T16B);
2846       pmull2(v16, T8H, v1, v7, T16B);
2847       pmull2(v19, T8H, v1, v4, T16B);
2848       pmull2(v17, T8H, v1, v6, T16B);
2849 
2850       ld1(v0, v1, T2D, post(buf, 32));
2851 
2852       uzp1(v24, v20, v22, T8H);
2853       uzp2(v25, v20, v22, T8H);
2854       eor(v20, T16B, v24, v25);
2855 
2856       uzp1(v26, v16, v18, T8H);
2857       uzp2(v27, v16, v18, T8H);
2858       eor(v16, T16B, v26, v27);
2859 
2860       ushll2(v22, T4S, v20, T8H, 8);
2861       ushll(v20, T4S, v20, T4H, 8);
2862 
2863       ushll2(v18, T4S, v16, T8H, 8);
2864       ushll(v16, T4S, v16, T4H, 8);
2865 
2866       eor(v22, T16B, v23, v22);
2867       eor(v18, T16B, v19, v18);
2868       eor(v20, T16B, v21, v20);
2869       eor(v16, T16B, v17, v16);
2870 
2871       uzp1(v17, v16, v20, T2D);
2872       uzp2(v21, v16, v20, T2D);
2873       eor(v16, T16B, v17, v21);
2874 
2875       ushll2(v20, T2D, v16, T4S, 16);
2876       ushll(v16, T2D, v16, T2S, 16);
2877 
2878       eor(v20, T16B, v22, v20);
2879       eor(v16, T16B, v16, v18);
2880 
2881       uzp1(v17, v20, v16, T2D);
2882       uzp2(v21, v20, v16, T2D);
2883       eor(v20, T16B, v17, v21);
2884 
2885       shl(v16, T2D, v28, 1);
2886       shl(v17, T2D, v20, 1);
2887 
2888       eor(v0, T16B, v0, v16);
2889       eor(v1, T16B, v1, v17);
2890 
2891       subs(len, len, 32);
2892       br(Assembler::GE, L_fold);
2893 
2894       mov(crc, 0);
2895       mov(tmp, v0, T1D, 0);
2896       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
2897       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
2898       mov(tmp, v0, T1D, 1);
2899       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
2900       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
2901       mov(tmp, v1, T1D, 0);
2902       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
2903       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
2904       mov(tmp, v1, T1D, 1);
2905       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
2906       update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
2907 
2908       add(len, len, 32);
2909   }
2910 
2911   BIND(L_by16);
2912     subs(len, len, 16);
2913     br(Assembler::GE, L_by16_loop);
2914     adds(len, len, 16-4);
2915     br(Assembler::GE, L_by4_loop);
2916     adds(len, len, 4);
2917     br(Assembler::GT, L_by1_loop);
2918     b(L_exit);
2919 
2920   BIND(L_by4_loop);
2921     ldrw(tmp, Address(post(buf, 4)));
2922     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3);
2923     subs(len, len, 4);
2924     br(Assembler::GE, L_by4_loop);
2925     adds(len, len, 4);
2926     br(Assembler::LE, L_exit);
2927   BIND(L_by1_loop);
2928     subs(len, len, 1);
2929     ldrb(tmp, Address(post(buf, 1)));
2930     update_byte_crc32(crc, tmp, table0);
2931     br(Assembler::GT, L_by1_loop);
2932     b(L_exit);
2933 
2934     align(CodeEntryAlignment);
2935   BIND(L_by16_loop);
2936     subs(len, len, 16);
2937     ldp(tmp, tmp3, Address(post(buf, 16)));
2938     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, false);
2939     update_word_crc32(crc, tmp, tmp2, table0, table1, table2, table3, true);
2940     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, false);
2941     update_word_crc32(crc, tmp3, tmp2, table0, table1, table2, table3, true);
2942     br(Assembler::GE, L_by16_loop);
2943     adds(len, len, 16-4);
2944     br(Assembler::GE, L_by4_loop);
2945     adds(len, len, 4);
2946     br(Assembler::GT, L_by1_loop);
2947   BIND(L_exit);
2948     ornw(crc, zr, crc);
2949 }
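
// For reference, a minimal C sketch (not part of the build) of the one-byte
// table-driven step that update_byte_crc32 performs in the loops above.
// Note the stub keeps crc bit-inverted while it runs; the final ornw above
// undoes that.
//
//   uint32_t update_byte(uint32_t crc, uint8_t b, const uint32_t* table0) {
//     return table0[(crc ^ b) & 0xff] ^ (crc >> 8);
//   }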
2950 
2951 /**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table0..table3 registers reserved for the CRC lookup tables; this
 *                       implementation relies on the hardware crc32c
 *                       instructions and does not use them
 * @param tmp, tmp2, tmp3 scratch registers
2957  */
2958 void MacroAssembler::kernel_crc32c(Register crc, Register buf, Register len,
2959         Register table0, Register table1, Register table2, Register table3,
2960         Register tmp, Register tmp2, Register tmp3) {
2961   Label L_exit;
2962   Label CRC_by64_loop, CRC_by4_loop, CRC_by1_loop;
2963 
2964     subs(len, len, 64);
2965     br(Assembler::GE, CRC_by64_loop);
2966     adds(len, len, 64-4);
2967     br(Assembler::GE, CRC_by4_loop);
2968     adds(len, len, 4);
2969     br(Assembler::GT, CRC_by1_loop);
2970     b(L_exit);
2971 
2972   BIND(CRC_by4_loop);
2973     ldrw(tmp, Address(post(buf, 4)));
2974     subs(len, len, 4);
2975     crc32cw(crc, crc, tmp);
2976     br(Assembler::GE, CRC_by4_loop);
2977     adds(len, len, 4);
2978     br(Assembler::LE, L_exit);
2979   BIND(CRC_by1_loop);
2980     ldrb(tmp, Address(post(buf, 1)));
2981     subs(len, len, 1);
2982     crc32cb(crc, crc, tmp);
2983     br(Assembler::GT, CRC_by1_loop);
2984     b(L_exit);
2985 
2986     align(CodeEntryAlignment);
2987   BIND(CRC_by64_loop);
2988     subs(len, len, 64);
2989     ldp(tmp, tmp3, Address(post(buf, 16)));
2990     crc32cx(crc, crc, tmp);
2991     crc32cx(crc, crc, tmp3);
2992     ldp(tmp, tmp3, Address(post(buf, 16)));
2993     crc32cx(crc, crc, tmp);
2994     crc32cx(crc, crc, tmp3);
2995     ldp(tmp, tmp3, Address(post(buf, 16)));
2996     crc32cx(crc, crc, tmp);
2997     crc32cx(crc, crc, tmp3);
2998     ldp(tmp, tmp3, Address(post(buf, 16)));
2999     crc32cx(crc, crc, tmp);
3000     crc32cx(crc, crc, tmp3);
3001     br(Assembler::GE, CRC_by64_loop);
3002     adds(len, len, 64-4);
3003     br(Assembler::GE, CRC_by4_loop);
3004     adds(len, len, 4);
3005     br(Assembler::GT, CRC_by1_loop);
3006   BIND(L_exit);
3007     return;
3008 }
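
// Reference semantics, as an illustrative C sketch: CRC-32C uses the
// reflected Castagnoli polynomial 0x82F63B78, which is what the
// crc32c{b,w,x} instructions implement.  Unlike kernel_crc32 above, this
// routine leaves any pre/post bit-inversion to its caller.
//
//   uint32_t crc32c_byte(uint32_t crc, uint8_t b) {
//     crc ^= b;
//     for (int i = 0; i < 8; i++)
//       crc = (crc >> 1) ^ (0x82F63B78u & (0u - (crc & 1)));
//     return crc;
//   }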
3009 
3010 SkipIfEqual::SkipIfEqual(
3011     MacroAssembler* masm, const bool* flag_addr, bool value) {
3012   _masm = masm;
3013   unsigned long offset;
3014   _masm->adrp(rscratch1, ExternalAddress((address)flag_addr), offset);
3015   _masm->ldrb(rscratch1, Address(rscratch1, offset));
3016   _masm->cbzw(rscratch1, _label);
3017 }
3018 
3019 SkipIfEqual::~SkipIfEqual() {
3020   _masm->bind(_label);
3021 }
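
// Usage sketch (hypothetical call site; SomeBoolFlag is illustrative only).
// Note that 'value' is not consulted above: the cbzw skips the scoped code
// whenever the flag byte is zero, i.e. the behaviour matches value == false,
// which is how the DTrace-style probe guards are expected to use it.
//
//   {
//     SkipIfEqual skip(masm, &SomeBoolFlag, false);
//     // ... code emitted here runs only when SomeBoolFlag is true ...
//   } // the destructor binds the skip label here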
3022 
3023 void MacroAssembler::cmpptr(Register src1, Address src2) {
3024   unsigned long offset;
3025   adrp(rscratch1, src2, offset);
3026   ldr(rscratch1, Address(rscratch1, offset));
3027   cmp(src1, rscratch1);
3028 }
3029 
3030 void MacroAssembler::store_check(Register obj, Address dst) {
3031   store_check(obj);
3032 }
3033 
3034 void MacroAssembler::store_check(Register obj) {
3035   // Does a store check for the oop in register obj. The content of
3036   // register obj is destroyed afterwards.
3037 
3038   BarrierSet* bs = Universe::heap()->barrier_set();
3039   assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
3040 
3041   CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
3042   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
3043 
3044   lsr(obj, obj, CardTableModRefBS::card_shift);
3045 
3046   assert(CardTableModRefBS::dirty_card_val() == 0, "must be");
3047 
3048   {
3049     ExternalAddress cardtable((address) ct->byte_map_base);
3050     unsigned long offset;
3051     adrp(rscratch1, cardtable, offset);
3052     assert(offset == 0, "byte_map_base is misaligned");
3053   }
3054 
3055   if (UseCondCardMark) {
3056     Label L_already_dirty;
3057     ldrb(rscratch2,  Address(obj, rscratch1));
3058     cbz(rscratch2, L_already_dirty);
3059     strb(zr, Address(obj, rscratch1));
3060     bind(L_already_dirty);
3061   } else {
3062     strb(zr, Address(obj, rscratch1));
3063   }
3064 }
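
// In effect (a pseudo-C sketch, assuming the usual card-table layout):
//
//   byte_map_base[(uintptr_t)obj >> card_shift] = dirty_card_val; // == 0
//
// The adrp above materialises byte_map_base (asserted page-aligned), and
// obj is shifted in place to form the card index, which is why the
// register is destroyed.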
3065 
3066 void MacroAssembler::load_klass(Register dst, Register src) {
3067   if (UseCompressedClassPointers) {
3068     ldrw(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3069     decode_klass_not_null(dst);
3070   } else {
3071     ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
3072   }
3073 }
3074 
3075 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
3076   if (UseCompressedClassPointers) {
3077     ldrw(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3078     if (Universe::narrow_klass_base() == NULL) {
3079       cmp(trial_klass, tmp, LSL, Universe::narrow_klass_shift());
3080       return;
3081     } else if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3082                && Universe::narrow_klass_shift() == 0) {
3083       // Only the bottom 32 bits matter
3084       cmpw(trial_klass, tmp);
3085       return;
3086     }
3087     decode_klass_not_null(tmp);
3088   } else {
3089     ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
3090   }
3091   cmp(trial_klass, tmp);
3092 }
3093 
3094 void MacroAssembler::load_prototype_header(Register dst, Register src) {
3095   load_klass(dst, src);
3096   ldr(dst, Address(dst, Klass::prototype_header_offset()));
3097 }
3098 
3099 void MacroAssembler::store_klass(Register dst, Register src) {
  // FIXME: Should this be a store release?  Concurrent GCs assume the
  // klass length is valid if the klass field is not null.
3102   if (UseCompressedClassPointers) {
3103     encode_klass_not_null(src);
3104     strw(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3105   } else {
3106     str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
3107   }
3108 }
3109 
3110 void MacroAssembler::store_klass_gap(Register dst, Register src) {
3111   if (UseCompressedClassPointers) {
3112     // Store to klass gap in destination
3113     strw(src, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
3114   }
3115 }
3116 
3117 // Algorithm must match oop.inline.hpp encode_heap_oop.
3118 void MacroAssembler::encode_heap_oop(Register d, Register s) {
3119 #ifdef ASSERT
3120   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3121 #endif
3122   verify_oop(s, "broken oop in encode_heap_oop");
3123   if (Universe::narrow_oop_base() == NULL) {
3124     if (Universe::narrow_oop_shift() != 0) {
3125       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3126       lsr(d, s, LogMinObjAlignmentInBytes);
3127     } else {
3128       mov(d, s);
3129     }
3130   } else {
3131     subs(d, s, rheapbase);
3132     csel(d, d, zr, Assembler::HS);
3133     lsr(d, d, LogMinObjAlignmentInBytes);
3134 
3135     /*  Old algorithm: is this any worse?
3136     Label nonnull;
3137     cbnz(r, nonnull);
3138     sub(r, r, rheapbase);
3139     bind(nonnull);
3140     lsr(r, r, LogMinObjAlignmentInBytes);
3141     */
3142   }
3143 }
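
// The arithmetic above, as a C sketch (assuming the standard compressed-oop
// scheme with heap base 'base' and shift 'shift'):
//
//   uint32_t encode(uintptr_t oop) {
//     if (base == NULL) return (uint32_t)(oop >> shift);
//     return oop >= base ? (uint32_t)((oop - base) >> shift)
//                        : 0;            // null encodes to 0
//   }
//
// The subs/csel pair implements the null check without a branch.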
3144 
3145 void MacroAssembler::encode_heap_oop_not_null(Register r) {
3146 #ifdef ASSERT
3147   verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
3148   if (CheckCompressedOops) {
3149     Label ok;
3150     cbnz(r, ok);
3151     stop("null oop passed to encode_heap_oop_not_null");
3152     bind(ok);
3153   }
3154 #endif
3155   verify_oop(r, "broken oop in encode_heap_oop_not_null");
3156   if (Universe::narrow_oop_base() != NULL) {
3157     sub(r, r, rheapbase);
3158   }
3159   if (Universe::narrow_oop_shift() != 0) {
3160     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3161     lsr(r, r, LogMinObjAlignmentInBytes);
3162   }
3163 }
3164 
3165 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3166 #ifdef ASSERT
3167   verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
3168   if (CheckCompressedOops) {
3169     Label ok;
3170     cbnz(src, ok);
3171     stop("null oop passed to encode_heap_oop_not_null2");
3172     bind(ok);
3173   }
3174 #endif
3175   verify_oop(src, "broken oop in encode_heap_oop_not_null2");
3176 
3177   Register data = src;
3178   if (Universe::narrow_oop_base() != NULL) {
3179     sub(dst, src, rheapbase);
3180     data = dst;
3181   }
3182   if (Universe::narrow_oop_shift() != 0) {
3183     assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3184     lsr(dst, data, LogMinObjAlignmentInBytes);
3185     data = dst;
3186   }
3187   if (data == src)
3188     mov(dst, src);
3189 }
3190 
3191 void  MacroAssembler::decode_heap_oop(Register d, Register s) {
3192 #ifdef ASSERT
3193   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3194 #endif
3195   if (Universe::narrow_oop_base() == NULL) {
3196     if (Universe::narrow_oop_shift() != 0 || d != s) {
3197       lsl(d, s, Universe::narrow_oop_shift());
3198     }
3199   } else {
3200     Label done;
3201     if (d != s)
3202       mov(d, s);
3203     cbz(s, done);
3204     add(d, rheapbase, s, Assembler::LSL, LogMinObjAlignmentInBytes);
3205     bind(done);
3206   }
3207   verify_oop(d, "broken oop in decode_heap_oop");
3208 }
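
// And the matching decode, sketched in C under the same assumptions:
//
//   uintptr_t decode(uint32_t narrow) {
//     if (base == NULL) return (uintptr_t)narrow << shift;
//     return narrow == 0 ? 0 : base + ((uintptr_t)narrow << shift);
//   }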
3209 
3210 void  MacroAssembler::decode_heap_oop_not_null(Register r) {
3211   assert (UseCompressedOops, "should only be used for compressed headers");
3212   assert (Universe::heap() != NULL, "java heap should be initialized");
3213   // Cannot assert, unverified entry point counts instructions (see .ad file)
3214   // vtableStubs also counts instructions in pd_code_size_limit.
3215   // Also do not verify_oop as this is called by verify_oop.
3216   if (Universe::narrow_oop_shift() != 0) {
3217     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3218     if (Universe::narrow_oop_base() != NULL) {
3219       add(r, rheapbase, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3220     } else {
3221       add(r, zr, r, Assembler::LSL, LogMinObjAlignmentInBytes);
3222     }
3223   } else {
3224     assert (Universe::narrow_oop_base() == NULL, "sanity");
3225   }
3226 }
3227 
3228 void  MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3229   assert (UseCompressedOops, "should only be used for compressed headers");
3230   assert (Universe::heap() != NULL, "java heap should be initialized");
3231   // Cannot assert, unverified entry point counts instructions (see .ad file)
3232   // vtableStubs also counts instructions in pd_code_size_limit.
3233   // Also do not verify_oop as this is called by verify_oop.
3234   if (Universe::narrow_oop_shift() != 0) {
3235     assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3236     if (Universe::narrow_oop_base() != NULL) {
3237       add(dst, rheapbase, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3238     } else {
3239       add(dst, zr, src, Assembler::LSL, LogMinObjAlignmentInBytes);
3240     }
3241   } else {
3242     assert (Universe::narrow_oop_base() == NULL, "sanity");
3243     if (dst != src) {
3244       mov(dst, src);
3245     }
3246   }
3247 }
3248 
3249 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3250   if (Universe::narrow_klass_base() == NULL) {
3251     if (Universe::narrow_klass_shift() != 0) {
3252       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3253       lsr(dst, src, LogKlassAlignmentInBytes);
3254     } else {
3255       if (dst != src) mov(dst, src);
3256     }
3257     return;
3258   }
3259 
3260   if (use_XOR_for_compressed_class_base) {
3261     if (Universe::narrow_klass_shift() != 0) {
3262       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3263       lsr(dst, dst, LogKlassAlignmentInBytes);
3264     } else {
3265       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3266     }
3267     return;
3268   }
3269 
3270   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3271       && Universe::narrow_klass_shift() == 0) {
3272     movw(dst, src);
3273     return;
3274   }
3275 
3276 #ifdef ASSERT
3277   verify_heapbase("MacroAssembler::encode_klass_not_null2: heap base corrupted?");
3278 #endif
3279 
3280   Register rbase = dst;
3281   if (dst == src) rbase = rheapbase;
3282   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3283   sub(dst, src, rbase);
3284   if (Universe::narrow_klass_shift() != 0) {
3285     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3286     lsr(dst, dst, LogKlassAlignmentInBytes);
3287   }
3288   if (dst == src) reinit_heapbase();
3289 }
3290 
3291 void MacroAssembler::encode_klass_not_null(Register r) {
3292   encode_klass_not_null(r, r);
3293 }
3294 
3295 void  MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3296   Register rbase = dst;
3297   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3298 
3299   if (Universe::narrow_klass_base() == NULL) {
3300     if (Universe::narrow_klass_shift() != 0) {
3301       assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3302       lsl(dst, src, LogKlassAlignmentInBytes);
3303     } else {
3304       if (dst != src) mov(dst, src);
3305     }
3306     return;
3307   }
3308 
3309   if (use_XOR_for_compressed_class_base) {
3310     if (Universe::narrow_klass_shift() != 0) {
3311       lsl(dst, src, LogKlassAlignmentInBytes);
3312       eor(dst, dst, (uint64_t)Universe::narrow_klass_base());
3313     } else {
3314       eor(dst, src, (uint64_t)Universe::narrow_klass_base());
3315     }
3316     return;
3317   }
3318 
3319   if (((uint64_t)Universe::narrow_klass_base() & 0xffffffff) == 0
3320       && Universe::narrow_klass_shift() == 0) {
3321     if (dst != src)
3322       movw(dst, src);
3323     movk(dst, (uint64_t)Universe::narrow_klass_base() >> 32, 32);
3324     return;
3325   }
3326 
3327   // Cannot assert, unverified entry point counts instructions (see .ad file)
3328   // vtableStubs also counts instructions in pd_code_size_limit.
3329   // Also do not verify_oop as this is called by verify_oop.
3330   if (dst == src) rbase = rheapbase;
3331   mov(rbase, (uint64_t)Universe::narrow_klass_base());
3332   if (Universe::narrow_klass_shift() != 0) {
3333     assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3334     add(dst, rbase, src, Assembler::LSL, LogKlassAlignmentInBytes);
3335   } else {
3336     add(dst, rbase, src);
3337   }
3338   if (dst == src) reinit_heapbase();
3339 }
3340 
3341 void  MacroAssembler::decode_klass_not_null(Register r) {
3342   decode_klass_not_null(r, r);
3343 }
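
// Compressed klass pointers follow the same shape as compressed oops, but
// use narrow_klass_base/narrow_klass_shift; e.g. decode is, as a C sketch:
//
//   Klass* decode_klass(uint32_t nk) {
//     return (Klass*)(klass_base + ((uintptr_t)nk << klass_shift));
//   }
//
// The XOR path above is an optimisation for bases that are valid logical
// immediates and lie above the class space, so no base bit can overlap an
// encoded offset and x ^ base behaves like x + base for valid inputs.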
3344 
3345 void  MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3346   assert (UseCompressedOops, "should only be used for compressed oops");
3347   assert (Universe::heap() != NULL, "java heap should be initialized");
3348   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3349 
3350   int oop_index = oop_recorder()->find_index(obj);
3351   assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3352 
3353   InstructionMark im(this);
3354   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3355   code_section()->relocate(inst_mark(), rspec);
3356   movz(dst, 0xDEAD, 16);
3357   movk(dst, 0xBEEF);
3358 }
3359 
3360 void  MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3361   assert (UseCompressedClassPointers, "should only be used for compressed headers");
3362   assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
3363   int index = oop_recorder()->find_index(k);
3364   assert(! Universe::heap()->is_in_reserved(k), "should not be an oop");
3365 
3366   InstructionMark im(this);
3367   RelocationHolder rspec = metadata_Relocation::spec(index);
3368   code_section()->relocate(inst_mark(), rspec);
3369   narrowKlass nk = Klass::encode_klass(k);
3370   movz(dst, (nk >> 16), 16);
3371   movk(dst, nk & 0xffff);
3372 }
3373 
3374 void MacroAssembler::load_heap_oop(Register dst, Address src)
3375 {
3376   if (UseCompressedOops) {
3377     ldrw(dst, src);
3378     decode_heap_oop(dst);
3379   } else {
3380     ldr(dst, src);
3381   }
3382 }
3383 
3384 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src)
3385 {
3386   if (UseCompressedOops) {
3387     ldrw(dst, src);
3388     decode_heap_oop_not_null(dst);
3389   } else {
3390     ldr(dst, src);
3391   }
3392 }
3393 
3394 void MacroAssembler::store_heap_oop(Address dst, Register src) {
3395   if (UseCompressedOops) {
3396     assert(!dst.uses(src), "not enough registers");
3397     encode_heap_oop(src);
3398     strw(src, dst);
3399   } else
3400     str(src, dst);
3401 }
3402 
3403 // Used for storing NULLs.
3404 void MacroAssembler::store_heap_oop_null(Address dst) {
3405   if (UseCompressedOops) {
3406     strw(zr, dst);
3407   } else
3408     str(zr, dst);
3409 }
3410 
3411 #if INCLUDE_ALL_GCS
3412 void MacroAssembler::g1_write_barrier_pre(Register obj,
3413                                           Register pre_val,
3414                                           Register thread,
3415                                           Register tmp,
3416                                           bool tosca_live,
3417                                           bool expand_call) {
3418   // If expand_call is true then we expand the call_VM_leaf macro
3419   // directly to skip generating the check by
3420   // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
3421 
3422   assert(thread == rthread, "must be");
3423 
3424   Label done;
3425   Label runtime;
3426 
3427   assert(pre_val != noreg, "check this code");
3428 
3429   if (obj != noreg)
3430     assert_different_registers(obj, pre_val, tmp);
3431 
3432   Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
3433                                        PtrQueue::byte_offset_of_active()));
3434   Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
3435                                        PtrQueue::byte_offset_of_index()));
3436   Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
3437                                        PtrQueue::byte_offset_of_buf()));
3438 
3439 
3440   // Is marking active?
3441   if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
3442     ldrw(tmp, in_progress);
3443   } else {
3444     assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
3445     ldrb(tmp, in_progress);
3446   }
3447   cbzw(tmp, done);
3448 
3449   // Do we need to load the previous value?
3450   if (obj != noreg) {
3451     load_heap_oop(pre_val, Address(obj, 0));
3452   }
3453 
3454   // Is the previous value null?
3455   cbz(pre_val, done);
3456 
3457   // Can we store original value in the thread's buffer?
3458   // Is index == 0?
3459   // (The index field is typed as size_t.)
3460 
3461   ldr(tmp, index);                      // tmp := *index_adr
3462   cbz(tmp, runtime);                    // tmp == 0?
3463                                         // If yes, goto runtime
3464 
3465   sub(tmp, tmp, wordSize);              // tmp := tmp - wordSize
3466   str(tmp, index);                      // *index_adr := tmp
3467   ldr(rscratch1, buffer);
3468   add(tmp, tmp, rscratch1);             // tmp := tmp + *buffer_adr
3469 
3470   // Record the previous value
3471   str(pre_val, Address(tmp, 0));
3472   b(done);
3473 
3474   bind(runtime);
3475   // save the live input values
3476   push(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
3477 
  // Calling the runtime using the regular call_VM_leaf mechanism generates
  // code (generated by InterpreterMacroAssembler::call_VM_leaf_base)
  // that checks that the *(rfp+frame::interpreter_frame_last_sp) == NULL.
  //
  // If we are generating the pre-barrier without a frame (e.g. in the
  // intrinsified Reference.get() routine) then rfp might be pointing to
  // the caller frame and so this check will most likely fail at runtime.
  //
  // Expanding the call directly bypasses the generation of the check.
  // So when we do not have a full interpreter frame on the stack
  // expand_call should be passed true.
3489 
3490   if (expand_call) {
3491     assert(pre_val != c_rarg1, "smashed arg");
3492     pass_arg1(this, thread);
3493     pass_arg0(this, pre_val);
3494     MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
3495   } else {
3496     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
3497   }
3498 
3499   pop(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
3500 
3501   bind(done);
3502 }
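
// The pre-barrier above, condensed to pseudo-C (SATB enqueue; field names
// are descriptive, not the real accessors):
//
//   if (thread->satb_queue.active) {
//     oop pre = (obj != NULL) ? *obj : pre_val;
//     if (pre != NULL) {
//       if (thread->satb_queue.index == 0) {
//         g1_wb_pre(pre, thread);        // runtime slow path
//       } else {
//         thread->satb_queue.index -= wordSize;   // index is a byte offset
//         *(oop*)(thread->satb_queue.buf + thread->satb_queue.index) = pre;
//       }
//     }
//   }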
3503 
3504 void MacroAssembler::g1_write_barrier_post(Register store_addr,
3505                                            Register new_val,
3506                                            Register thread,
3507                                            Register tmp,
3508                                            Register tmp2) {
3509   assert(thread == rthread, "must be");
3510 
3511   Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
3512                                        PtrQueue::byte_offset_of_index()));
3513   Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
3514                                        PtrQueue::byte_offset_of_buf()));
3515 
3516   BarrierSet* bs = Universe::heap()->barrier_set();
3517   CardTableModRefBS* ct = (CardTableModRefBS*)bs;
3518   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
3519 
3520   Label done;
3521   Label runtime;
3522 
3523   // Does store cross heap regions?
3524 
3525   eor(tmp, store_addr, new_val);
3526   lsr(tmp, tmp, HeapRegion::LogOfHRGrainBytes);
3527   cbz(tmp, done);
3528 
3529   // crosses regions, storing NULL?
3530 
3531   cbz(new_val, done);
3532 
3533   // storing region crossing non-NULL, is card already dirty?
3534 
3535   ExternalAddress cardtable((address) ct->byte_map_base);
3536   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
3537   const Register card_addr = tmp;
3538 
3539   lsr(card_addr, store_addr, CardTableModRefBS::card_shift);
3540 
3541   unsigned long offset;
3542   adrp(tmp2, cardtable, offset);
3543 
3544   // get the address of the card
3545   add(card_addr, card_addr, tmp2);
3546   ldrb(tmp2, Address(card_addr, offset));
3547   cmpw(tmp2, (int)G1SATBCardTableModRefBS::g1_young_card_val());
3548   br(Assembler::EQ, done);
3549 
3550   assert((int)CardTableModRefBS::dirty_card_val() == 0, "must be 0");
3551 
3552   membar(Assembler::StoreLoad);
3553 
3554   ldrb(tmp2, Address(card_addr, offset));
3555   cbzw(tmp2, done);
3556 
  // Storing a region-crossing, non-NULL oop and the card is clean:
  // dirty the card and log it.
3559 
3560   strb(zr, Address(card_addr, offset));
3561 
3562   ldr(rscratch1, queue_index);
3563   cbz(rscratch1, runtime);
3564   sub(rscratch1, rscratch1, wordSize);
3565   str(rscratch1, queue_index);
3566 
3567   ldr(tmp2, buffer);
3568   str(card_addr, Address(tmp2, rscratch1));
3569   b(done);
3570 
3571   bind(runtime);
3572   // save the live input values
3573   push(store_addr->bit(true) | new_val->bit(true), sp);
3574   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
3575   pop(store_addr->bit(true) | new_val->bit(true), sp);
3576 
3577   bind(done);
3578 }
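
// Condensed pseudo-C for the post-barrier above (field names are
// descriptive, not the real accessors):
//
//   if (((store_addr ^ new_val) >> LogOfHRGrainBytes) != 0 && new_val != 0) {
//     volatile jbyte* card = byte_map_base + (store_addr >> card_shift);
//     if (*card != g1_young_card_val) {
//       storeload_fence();
//       if (*card != dirty_card_val) {   // dirty_card_val == 0
//         *card = dirty_card_val;
//         enqueue(card);                 // or g1_wb_post(card, thread)
//       }                                //   when the local queue is full
//     }
//   }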
3579 
3580 #endif // INCLUDE_ALL_GCS
3581 
3582 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
3583   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
3584   int index = oop_recorder()->allocate_metadata_index(obj);
3585   RelocationHolder rspec = metadata_Relocation::spec(index);
3586   return Address((address)obj, rspec);
3587 }
3588 
3589 // Move an oop into a register.  immediate is true if we want
// immediate instructions, i.e. we are not going to patch this
3591 // instruction while the code is being executed by another thread.  In
3592 // that case we can use move immediates rather than the constant pool.
3593 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
3594   int oop_index;
3595   if (obj == NULL) {
3596     oop_index = oop_recorder()->allocate_oop_index(obj);
3597   } else {
3598     oop_index = oop_recorder()->find_index(obj);
3599     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
3600   }
3601   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3602   if (! immediate) {
3603     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
3604     ldr_constant(dst, Address(dummy, rspec));
3605   } else
3606     mov(dst, Address((address)obj, rspec));
3607 }
3608 
3609 // Move a metadata address into a register.
3610 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
3611   int oop_index;
3612   if (obj == NULL) {
3613     oop_index = oop_recorder()->allocate_metadata_index(obj);
3614   } else {
3615     oop_index = oop_recorder()->find_index(obj);
3616   }
3617   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
3618   mov(dst, Address((address)obj, rspec));
3619 }
3620 
3621 Address MacroAssembler::constant_oop_address(jobject obj) {
3622   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
3623   assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
3624   int oop_index = oop_recorder()->find_index(obj);
3625   return Address((address)obj, oop_Relocation::spec(oop_index));
3626 }
3627 
3628 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
3629 void MacroAssembler::tlab_allocate(Register obj,
3630                                    Register var_size_in_bytes,
3631                                    int con_size_in_bytes,
3632                                    Register t1,
3633                                    Register t2,
3634                                    Label& slow_case) {
3635   assert_different_registers(obj, t2);
3636   assert_different_registers(obj, var_size_in_bytes);
3637   Register end = t2;
3638 
3639   // verify_tlab();
3640 
3641   ldr(obj, Address(rthread, JavaThread::tlab_top_offset()));
3642   if (var_size_in_bytes == noreg) {
3643     lea(end, Address(obj, con_size_in_bytes));
3644   } else {
3645     lea(end, Address(obj, var_size_in_bytes));
3646   }
3647   ldr(rscratch1, Address(rthread, JavaThread::tlab_end_offset()));
3648   cmp(end, rscratch1);
3649   br(Assembler::HI, slow_case);
3650 
3651   // update the tlab top pointer
3652   str(end, Address(rthread, JavaThread::tlab_top_offset()));
3653 
3654   // recover var_size_in_bytes if necessary
3655   if (var_size_in_bytes == end) {
3656     sub(var_size_in_bytes, var_size_in_bytes, obj);
3657   }
3658   // verify_tlab();
3659 }
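
// TLAB bump-pointer allocation, as a pseudo-C sketch of the code above
// (no atomics needed: a TLAB is private to its thread):
//
//   obj = thread->tlab_top;
//   end = obj + size;                    // con_ or var_size_in_bytes
//   if (end > thread->tlab_end) goto slow_case;
//   thread->tlab_top = end;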
3660 
3661 // Preserves r19, and r3.
3662 Register MacroAssembler::tlab_refill(Label& retry,
3663                                      Label& try_eden,
3664                                      Label& slow_case) {
3665   Register top = r0;
3666   Register t1  = r2;
3667   Register t2  = r4;
3668   assert_different_registers(top, rthread, t1, t2, /* preserve: */ r19, r3);
3669   Label do_refill, discard_tlab;
3670 
3671   if (!Universe::heap()->supports_inline_contig_alloc()) {
3672     // No allocation in the shared eden.
3673     b(slow_case);
3674   }
3675 
3676   ldr(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
3677   ldr(t1,  Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
3678 
3679   // calculate amount of free space
3680   sub(t1, t1, top);
3681   lsr(t1, t1, LogHeapWordSize);
3682 
3683   // Retain tlab and allocate object in shared space if
3684   // the amount free in the tlab is too large to discard.
3685 
3686   ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
3687   cmp(t1, rscratch1);
3688   br(Assembler::LE, discard_tlab);
3689 
3690   // Retain
3691   // ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
3692   mov(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
3693   add(rscratch1, rscratch1, t2);
3694   str(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
3695 
3696   if (TLABStats) {
3697     // increment number of slow_allocations
3698     addmw(Address(rthread, in_bytes(JavaThread::tlab_slow_allocations_offset())),
3699          1, rscratch1);
3700   }
3701   b(try_eden);
3702 
3703   bind(discard_tlab);
3704   if (TLABStats) {
3705     // increment number of refills
3706     addmw(Address(rthread, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1,
3707          rscratch1);
3708     // accumulate wastage -- t1 is amount free in tlab
3709     addmw(Address(rthread, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1,
3710          rscratch1);
3711   }
3712 
3713   // if tlab is currently allocated (top or end != null) then
3714   // fill [top, end + alignment_reserve) with array object
3715   cbz(top, do_refill);
3716 
3717   // set up the mark word
3718   mov(rscratch1, (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
3719   str(rscratch1, Address(top, oopDesc::mark_offset_in_bytes()));
3720   // set the length to the remaining space
3721   sub(t1, t1, typeArrayOopDesc::header_size(T_INT));
3722   add(t1, t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
3723   lsl(t1, t1, log2_intptr(HeapWordSize/sizeof(jint)));
3724   strw(t1, Address(top, arrayOopDesc::length_offset_in_bytes()));
3725   // set klass to intArrayKlass
3726   {
3727     unsigned long offset;
    // Dubious reloc: why not an oop reloc?
3729     adrp(rscratch1, ExternalAddress((address)Universe::intArrayKlassObj_addr()),
3730          offset);
3731     ldr(t1, Address(rscratch1, offset));
3732   }
  // Store the klass last: concurrent GCs assume the klass length is valid
  // if the klass field is not null.
3735   store_klass(top, t1);
3736 
3737   mov(t1, top);
3738   ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
3739   sub(t1, t1, rscratch1);
3740   incr_allocated_bytes(rthread, t1, 0, rscratch1);
3741 
3742   // refill the tlab with an eden allocation
3743   bind(do_refill);
3744   ldr(t1, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
3745   lsl(t1, t1, LogHeapWordSize);
3746   // allocate new tlab, address returned in top
3747   eden_allocate(top, t1, 0, t2, slow_case);
3748 
3749   // Check that t1 was preserved in eden_allocate.
3750 #ifdef ASSERT
3751   if (UseTLAB) {
3752     Label ok;
3753     Register tsize = r4;
3754     assert_different_registers(tsize, rthread, t1);
3755     str(tsize, Address(pre(sp, -16)));
3756     ldr(tsize, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
3757     lsl(tsize, tsize, LogHeapWordSize);
3758     cmp(t1, tsize);
3759     br(Assembler::EQ, ok);
3760     STOP("assert(t1 != tlab size)");
3761     should_not_reach_here();
3762 
3763     bind(ok);
3764     ldr(tsize, Address(post(sp, 16)));
3765   }
3766 #endif
3767   str(top, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
3768   str(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
3769   add(top, top, t1);
3770   sub(top, top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
3771   str(top, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
3772   verify_tlab();
3773   b(retry);
3774 
3775   return rthread; // for use by caller
3776 }
3777 
3778 // Defines obj, preserves var_size_in_bytes
3779 void MacroAssembler::eden_allocate(Register obj,
3780                                    Register var_size_in_bytes,
3781                                    int con_size_in_bytes,
3782                                    Register t1,
3783                                    Label& slow_case) {
3784   assert_different_registers(obj, var_size_in_bytes, t1);
3785   if (!Universe::heap()->supports_inline_contig_alloc()) {
3786     b(slow_case);
3787   } else {
3788     Register end = t1;
3789     Register heap_end = rscratch2;
3790     Label retry;
3791     bind(retry);
3792     {
3793       unsigned long offset;
3794       adrp(rscratch1, ExternalAddress((address) Universe::heap()->end_addr()), offset);
3795       ldr(heap_end, Address(rscratch1, offset));
3796     }
3797 
3798     ExternalAddress heap_top((address) Universe::heap()->top_addr());
3799 
3800     // Get the current top of the heap
3801     {
3802       unsigned long offset;
3803       adrp(rscratch1, heap_top, offset);
      // Use add() here after ADRP, rather than lea().
3805       // lea() does not generate anything if its offset is zero.
3806       // However, relocs expect to find either an ADD or a load/store
3807       // insn after an ADRP.  add() always generates an ADD insn, even
3808       // for add(Rn, Rn, 0).
3809       add(rscratch1, rscratch1, offset);
3810       ldaxr(obj, rscratch1);
3811     }
3812 
    // Adjust it by the size of our new object
3814     if (var_size_in_bytes == noreg) {
3815       lea(end, Address(obj, con_size_in_bytes));
3816     } else {
3817       lea(end, Address(obj, var_size_in_bytes));
3818     }
3819 
3820     // if end < obj then we wrapped around high memory
3821     cmp(end, obj);
3822     br(Assembler::LO, slow_case);
3823 
3824     cmp(end, heap_end);
3825     br(Assembler::HI, slow_case);
3826 
3827     // If heap_top hasn't been changed by some other thread, update it.
3828     stlxr(rscratch2, end, rscratch1);
3829     cbnzw(rscratch2, retry);
3830   }
3831 }
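
// Shared-eden allocation, sketched in pseudo-C.  The ldaxr/stlxr pair is
// load-linked/store-conditional, so the loop amounts to a CAS on heap_top:
//
//   do {
//     obj = load_exclusive(heap_top);
//     end = obj + size;                  // con_ or var_size_in_bytes
//     if (end < obj) goto slow_case;     // wrapped around the address space
//     if (end > heap_end) goto slow_case;
//   } while (!store_exclusive(heap_top, end));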
3832 
3833 void MacroAssembler::verify_tlab() {
3834 #ifdef ASSERT
3835   if (UseTLAB && VerifyOops) {
3836     Label next, ok;
3837 
3838     stp(rscratch2, rscratch1, Address(pre(sp, -16)));
3839 
3840     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
3841     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
3842     cmp(rscratch2, rscratch1);
3843     br(Assembler::HS, next);
3844     STOP("assert(top >= start)");
3845     should_not_reach_here();
3846 
3847     bind(next);
3848     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
3849     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
3850     cmp(rscratch2, rscratch1);
3851     br(Assembler::HS, ok);
3852     STOP("assert(top <= end)");
3853     should_not_reach_here();
3854 
3855     bind(ok);
3856     ldp(rscratch2, rscratch1, Address(post(sp, 16)));
3857   }
3858 #endif
3859 }
3860 
// Writes to successive stack pages until the requested size is reached, to
// check for stack overflow + shadow pages.  This clobbers tmp.
3863 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
3864   assert_different_registers(tmp, size, rscratch1);
3865   mov(tmp, sp);
3866   // Bang stack for total size given plus shadow page size.
3867   // Bang one page at a time because large size can bang beyond yellow and
3868   // red zones.
3869   Label loop;
3870   mov(rscratch1, os::vm_page_size());
3871   bind(loop);
3872   lea(tmp, Address(tmp, -os::vm_page_size()));
3873   subsw(size, size, rscratch1);
3874   str(size, Address(tmp));
3875   br(Assembler::GT, loop);
3876 
3877   // Bang down shadow pages too.
  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched at (tmp-pagesize) relative to the
  // old value of tmp, which has since been decremented.)  Skip this
  // address by starting at i=1, and touch a few more pages below.
  // N.B.  It is important to touch all the way down to and including
  // i=StackShadowPages.
3883   for (int i = 0; i< StackShadowPages-1; i++) {
    // This could be any sized move, but since it can serve as a debugging
    // crumb, the bigger the better.
3886     lea(tmp, Address(tmp, -os::vm_page_size()));
3887     str(size, Address(tmp));
3888   }
3889 }
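
// Equivalent pseudo-C for the banging above (one probe per page, working
// down from sp; tmp and rscratch1 hold addr and the page size):
//
//   char* addr = sp;
//   do {
//     addr -= page_size;
//     size -= page_size;
//     *(intptr_t*)addr = size;           // touch the page
//   } while (size > 0);
//   for (int i = 0; i < StackShadowPages - 1; i++) {
//     addr -= page_size;
//     *(intptr_t*)addr = size;           // and the shadow pages below
//   }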
3890 
3891 
3892 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
3893   unsigned long off;
3894   adrp(r, Address(page, rtype), off);
3895   InstructionMark im(this);
3896   code_section()->relocate(inst_mark(), rtype);
3897   ldrw(zr, Address(r, off));
3898   return inst_mark();
3899 }
3900 
3901 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
3902   InstructionMark im(this);
3903   code_section()->relocate(inst_mark(), rtype);
3904   ldrw(zr, Address(r, 0));
3905   return inst_mark();
3906 }
3907 
3908 void MacroAssembler::adrp(Register reg1, const Address &dest, unsigned long &byte_offset) {
3909   relocInfo::relocType rtype = dest.rspec().reloc()->type();
3910   if (uabs(pc() - dest.target()) >= (1LL << 32)) {
3911     guarantee(rtype == relocInfo::none
3912               || rtype == relocInfo::external_word_type
3913               || rtype == relocInfo::poll_type
3914               || rtype == relocInfo::poll_return_type,
3915               "can only use a fixed address with an ADRP");
    // Out of range.  This doesn't happen very often, but we have to
    // handle it.
3918     mov(reg1, dest);
3919     byte_offset = 0;
3920   } else {
3921     InstructionMark im(this);
3922     code_section()->relocate(inst_mark(), dest.rspec());
3923     byte_offset = (uint64_t)dest.target() & 0xfff;
3924     _adrp(reg1, dest.target());
3925   }
3926 }
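
// What callers see, as a sketch: for an in-range target, ADRP materialises
// the enclosing 4KB page and the low 12 bits come back in byte_offset,
//
//   reg1        = target & ~(uint64_t)0xfff;
//   byte_offset = target & 0xfff;
//
// so the caller completes the address with an ADD or a load/store immediate.
// For out-of-range targets the whole address is moved into reg1 and
// byte_offset is 0.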
3927 
3928 void MacroAssembler::build_frame(int framesize) {
3929   assert(framesize > 0, "framesize must be > 0");
3930   if (framesize < ((1 << 9) + 2 * wordSize)) {
3931     sub(sp, sp, framesize);
3932     stp(rfp, lr, Address(sp, framesize - 2 * wordSize));
3933     if (PreserveFramePointer) add(rfp, sp, framesize - 2 * wordSize);
3934   } else {
3935     stp(rfp, lr, Address(pre(sp, -2 * wordSize)));
3936     if (PreserveFramePointer) mov(rfp, sp);
3937     if (framesize < ((1 << 12) + 2 * wordSize))
3938       sub(sp, sp, framesize - 2 * wordSize);
3939     else {
3940       mov(rscratch1, framesize - 2 * wordSize);
3941       sub(sp, sp, rscratch1);
3942     }
3943   }
3944 }
3945 
3946 void MacroAssembler::remove_frame(int framesize) {
3947   assert(framesize > 0, "framesize must be > 0");
3948   if (framesize < ((1 << 9) + 2 * wordSize)) {
3949     ldp(rfp, lr, Address(sp, framesize - 2 * wordSize));
3950     add(sp, sp, framesize);
3951   } else {
3952     if (framesize < ((1 << 12) + 2 * wordSize))
3953       add(sp, sp, framesize - 2 * wordSize);
3954     else {
3955       mov(rscratch1, framesize - 2 * wordSize);
3956       add(sp, sp, rscratch1);
3957     }
3958     ldp(rfp, lr, Address(post(sp, 2 * wordSize)));
3959   }
3960 }
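
// Frame shape produced by build_frame and torn down by remove_frame
// (a sketch; higher addresses at the top):
//
//   sp + framesize -  8 : saved lr
//   sp + framesize - 16 : saved rfp
//   sp ..               : spill slots and locals
//
// Frames too large for a single scaled store offset save rfp/lr first and
// then lower sp (via rscratch1 if needed), ending with the same layout.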
3961 
3962 
3963 // Search for str1 in str2 and return index or -1
3964 void MacroAssembler::string_indexof(Register str2, Register str1,
3965                                     Register cnt2, Register cnt1,
3966                                     Register tmp1, Register tmp2,
3967                                     Register tmp3, Register tmp4,
3968                                     int icnt1, Register result) {
3969   Label BM, LINEARSEARCH, DONE, NOMATCH, MATCH;
3970 
3971   Register ch1 = rscratch1;
3972   Register ch2 = rscratch2;
3973   Register cnt1tmp = tmp1;
3974   Register cnt2tmp = tmp2;
3975   Register cnt1_neg = cnt1;
3976   Register cnt2_neg = cnt2;
3977   Register result_tmp = tmp4;
3978 
3979   // Note, inline_string_indexOf() generates checks:
3980   // if (substr.count > string.count) return -1;
3981   // if (substr.count == 0) return 0;
3982 
// We have two strings, a source string in str2, cnt2 and a pattern string
// in str1, cnt1. Find the first occurrence of the pattern in the source
// or return -1.
3985 
3986 // For larger pattern and source we use a simplified Boyer Moore algorithm.
3987 // With a small pattern and source we use linear scan.
3988 
3989   if (icnt1 == -1) {
3990     cmp(cnt1, 256);             // Use Linear Scan if cnt1 < 8 || cnt1 >= 256
3991     ccmp(cnt1, 8, 0b0000, LO);  // Can't handle skip >= 256 because we use
3992     br(LO, LINEARSEARCH);       // a byte array.
3993     cmp(cnt1, cnt2, LSR, 2);    // Source must be 4 * pattern for BM
3994     br(HS, LINEARSEARCH);
3995   }
3996 
// The Boyer-Moore algorithm is based on the description here:
3998 //
3999 // http://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string_search_algorithm
4000 //
// This describes an algorithm with two shift rules: the 'Bad Character'
// rule and the 'Good Suffix' rule.
4003 //
4004 // These rules are essentially heuristics for how far we can shift the
4005 // pattern along the search string.
4006 //
4007 // The implementation here uses the 'Bad Character' rule only because of the
4008 // complexity of initialisation for the 'Good Suffix' rule.
4009 //
// This is also known as the Boyer-Moore-Horspool algorithm:
4011 //
4012 // http://en.wikipedia.org/wiki/Boyer-Moore-Horspool_algorithm
4013 //
4014 // #define ASIZE 128
4015 //
4016 //    int bm(unsigned char *x, int m, unsigned char *y, int n) {
4017 //       int i, j;
4018 //       unsigned c;
4019 //       unsigned char bc[ASIZE];
4020 //
4021 //       /* Preprocessing */
4022 //       for (i = 0; i < ASIZE; ++i)
4023 //          bc[i] = 0;
4024 //       for (i = 0; i < m - 1; ) {
4025 //          c = x[i];
4026 //          ++i;
4027 //          if (c < ASIZE) bc[c] = i;
4028 //       }
4029 //
4030 //       /* Searching */
4031 //       j = 0;
4032 //       while (j <= n - m) {
//          c = y[j + m - 1];
//          i = m - 1;
//          if (x[m-1] == c)
//            for (i = m - 2; i >= 0 && x[i] == y[i + j]; --i);
//          if (i < 0) return j;
//          if (c < ASIZE)
//            j = j - bc[c] + m;
//          else
//            j += 1; // Advance by 1 only if char >= ASIZE
//       }
//       return -1;
//    }
4043 
4044   if (icnt1 == -1) {
4045     BIND(BM);
4046 
4047     Label ZLOOP, BCLOOP, BCSKIP, BMLOOPSTR2, BMLOOPSTR1, BMSKIP;
4048     Label BMADV, BMMATCH, BMCHECKEND;
4049 
4050     Register cnt1end = tmp2;
4051     Register str2end = cnt2;
4052     Register skipch = tmp2;
4053 
    // Restrict ASIZE to 128 to reduce stack space/initialisation.
    // The presence of chars >= ASIZE in the pattern string does not affect
    // performance, but we must be careful not to initialise entries for
    // them in the stack array.
    // The presence of chars >= ASIZE in the source string may adversely
    // affect performance since we can only advance by one when we
    // encounter one.
4060 
4061       stp(zr, zr, pre(sp, -128));
4062       for (int i = 1; i < 8; i++)
4063           stp(zr, zr, Address(sp, i*16));
4064 
4065       mov(cnt1tmp, 0);
4066       sub(cnt1end, cnt1, 1);
4067     BIND(BCLOOP);
4068       ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
4069       cmp(ch1, 128);
4070       add(cnt1tmp, cnt1tmp, 1);
4071       br(HS, BCSKIP);
4072       strb(cnt1tmp, Address(sp, ch1));
4073     BIND(BCSKIP);
4074       cmp(cnt1tmp, cnt1end);
4075       br(LT, BCLOOP);
4076 
4077       mov(result_tmp, str2);
4078 
4079       sub(cnt2, cnt2, cnt1);
4080       add(str2end, str2, cnt2, LSL, 1);
4081     BIND(BMLOOPSTR2);
4082       sub(cnt1tmp, cnt1, 1);
4083       ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
4084       ldrh(skipch, Address(str2, cnt1tmp, Address::lsl(1)));
4085       cmp(ch1, skipch);
4086       br(NE, BMSKIP);
4087       subs(cnt1tmp, cnt1tmp, 1);
4088       br(LT, BMMATCH);
4089     BIND(BMLOOPSTR1);
4090       ldrh(ch1, Address(str1, cnt1tmp, Address::lsl(1)));
4091       ldrh(ch2, Address(str2, cnt1tmp, Address::lsl(1)));
4092       cmp(ch1, ch2);
4093       br(NE, BMSKIP);
4094       subs(cnt1tmp, cnt1tmp, 1);
4095       br(GE, BMLOOPSTR1);
4096     BIND(BMMATCH);
4097       sub(result_tmp, str2, result_tmp);
4098       lsr(result, result_tmp, 1);
4099       add(sp, sp, 128);
4100       b(DONE);
4101     BIND(BMADV);
4102       add(str2, str2, 2);
4103       b(BMCHECKEND);
4104     BIND(BMSKIP);
4105       cmp(skipch, 128);
4106       br(HS, BMADV);
4107       ldrb(ch2, Address(sp, skipch));
4108       add(str2, str2, cnt1, LSL, 1);
4109       sub(str2, str2, ch2, LSL, 1);
4110     BIND(BMCHECKEND);
4111       cmp(str2, str2end);
4112       br(LE, BMLOOPSTR2);
4113       add(sp, sp, 128);
4114       b(NOMATCH);
4115   }
4116 
4117   BIND(LINEARSEARCH);
4118   {
4119     Label DO1, DO2, DO3;
4120 
4121     Register str2tmp = tmp2;
4122     Register first = tmp3;
4123 
4124     if (icnt1 == -1)
4125     {
4126         Label DOSHORT, FIRST_LOOP, STR2_NEXT, STR1_LOOP, STR1_NEXT, LAST_WORD;
4127 
4128         cmp(cnt1, 4);
4129         br(LT, DOSHORT);
4130 
4131         sub(cnt2, cnt2, cnt1);
4132         sub(cnt1, cnt1, 4);
4133         mov(result_tmp, cnt2);
4134 
4135         lea(str1, Address(str1, cnt1, Address::uxtw(1)));
4136         lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4137         sub(cnt1_neg, zr, cnt1, LSL, 1);
4138         sub(cnt2_neg, zr, cnt2, LSL, 1);
4139         ldr(first, Address(str1, cnt1_neg));
4140 
4141       BIND(FIRST_LOOP);
4142         ldr(ch2, Address(str2, cnt2_neg));
4143         cmp(first, ch2);
4144         br(EQ, STR1_LOOP);
4145       BIND(STR2_NEXT);
4146         adds(cnt2_neg, cnt2_neg, 2);
4147         br(LE, FIRST_LOOP);
4148         b(NOMATCH);
4149 
4150       BIND(STR1_LOOP);
4151         adds(cnt1tmp, cnt1_neg, 8);
4152         add(cnt2tmp, cnt2_neg, 8);
4153         br(GE, LAST_WORD);
4154 
4155       BIND(STR1_NEXT);
4156         ldr(ch1, Address(str1, cnt1tmp));
4157         ldr(ch2, Address(str2, cnt2tmp));
4158         cmp(ch1, ch2);
4159         br(NE, STR2_NEXT);
4160         adds(cnt1tmp, cnt1tmp, 8);
4161         add(cnt2tmp, cnt2tmp, 8);
4162         br(LT, STR1_NEXT);
4163 
4164       BIND(LAST_WORD);
4165         ldr(ch1, Address(str1));
4166         sub(str2tmp, str2, cnt1_neg);         // adjust to corresponding
4167         ldr(ch2, Address(str2tmp, cnt2_neg)); // word in str2
4168         cmp(ch1, ch2);
4169         br(NE, STR2_NEXT);
4170         b(MATCH);
4171 
4172       BIND(DOSHORT);
4173         cmp(cnt1, 2);
4174         br(LT, DO1);
4175         br(GT, DO3);
4176     }
4177 
4178     if (icnt1 == 4) {
4179       Label CH1_LOOP;
4180 
4181         ldr(ch1, str1);
4182         sub(cnt2, cnt2, 4);
4183         mov(result_tmp, cnt2);
4184         lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4185         sub(cnt2_neg, zr, cnt2, LSL, 1);
4186 
4187       BIND(CH1_LOOP);
4188         ldr(ch2, Address(str2, cnt2_neg));
4189         cmp(ch1, ch2);
4190         br(EQ, MATCH);
4191         adds(cnt2_neg, cnt2_neg, 2);
4192         br(LE, CH1_LOOP);
4193         b(NOMATCH);
4194     }
4195 
4196     if (icnt1 == -1 || icnt1 == 2) {
4197       Label CH1_LOOP;
4198 
4199       BIND(DO2);
4200         ldrw(ch1, str1);
4201         sub(cnt2, cnt2, 2);
4202         mov(result_tmp, cnt2);
4203         lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4204         sub(cnt2_neg, zr, cnt2, LSL, 1);
4205 
4206       BIND(CH1_LOOP);
4207         ldrw(ch2, Address(str2, cnt2_neg));
4208         cmp(ch1, ch2);
4209         br(EQ, MATCH);
4210         adds(cnt2_neg, cnt2_neg, 2);
4211         br(LE, CH1_LOOP);
4212         b(NOMATCH);
4213     }
4214 
4215     if (icnt1 == -1 || icnt1 == 3) {
4216       Label FIRST_LOOP, STR2_NEXT, STR1_LOOP;
4217 
4218       BIND(DO3);
4219         ldrw(first, str1);
4220         ldrh(ch1, Address(str1, 4));
4221 
4222         sub(cnt2, cnt2, 3);
4223         mov(result_tmp, cnt2);
4224         lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4225         sub(cnt2_neg, zr, cnt2, LSL, 1);
4226 
4227       BIND(FIRST_LOOP);
4228         ldrw(ch2, Address(str2, cnt2_neg));
4229         cmpw(first, ch2);
4230         br(EQ, STR1_LOOP);
4231       BIND(STR2_NEXT);
4232         adds(cnt2_neg, cnt2_neg, 2);
4233         br(LE, FIRST_LOOP);
4234         b(NOMATCH);
4235 
4236       BIND(STR1_LOOP);
4237         add(cnt2tmp, cnt2_neg, 4);
4238         ldrh(ch2, Address(str2, cnt2tmp));
4239         cmp(ch1, ch2);
4240         br(NE, STR2_NEXT);
4241         b(MATCH);
4242     }
4243 
4244     if (icnt1 == -1 || icnt1 == 1) {
4245       Label CH1_LOOP, HAS_ZERO;
4246       Label DO1_SHORT, DO1_LOOP;
4247 
4248       BIND(DO1);
4249         ldrh(ch1, str1);
4250         cmp(cnt2, 4);
4251         br(LT, DO1_SHORT);
4252 
4253         orr(ch1, ch1, ch1, LSL, 16);
4254         orr(ch1, ch1, ch1, LSL, 32);
4255 
4256         sub(cnt2, cnt2, 4);
4257         mov(result_tmp, cnt2);
4258         lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4259         sub(cnt2_neg, zr, cnt2, LSL, 1);
4260 
4261         mov(tmp3, 0x0001000100010001);
4262       BIND(CH1_LOOP);
4263         ldr(ch2, Address(str2, cnt2_neg));
4264         eor(ch2, ch1, ch2);
4265         sub(tmp1, ch2, tmp3);
4266         orr(tmp2, ch2, 0x7fff7fff7fff7fff);
4267         bics(tmp1, tmp1, tmp2);
4268         br(NE, HAS_ZERO);
4269         adds(cnt2_neg, cnt2_neg, 8);
4270         br(LT, CH1_LOOP);
4271 
4272         cmp(cnt2_neg, 8);
4273         mov(cnt2_neg, 0);
4274         br(LT, CH1_LOOP);
4275         b(NOMATCH);
4276 
4277       BIND(HAS_ZERO);
4278         rev(tmp1, tmp1);
4279         clz(tmp1, tmp1);
4280         add(cnt2_neg, cnt2_neg, tmp1, LSR, 3);
4281         b(MATCH);
4282 
4283       BIND(DO1_SHORT);
4284         mov(result_tmp, cnt2);
4285         lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4286         sub(cnt2_neg, zr, cnt2, LSL, 1);
4287       BIND(DO1_LOOP);
4288         ldrh(ch2, Address(str2, cnt2_neg));
4289         cmpw(ch1, ch2);
4290         br(EQ, MATCH);
4291         adds(cnt2_neg, cnt2_neg, 2);
4292         br(LT, DO1_LOOP);
4293     }
4294   }
4295   BIND(NOMATCH);
4296     mov(result, -1);
4297     b(DONE);
4298   BIND(MATCH);
4299     add(result, result_tmp, cnt2_neg, ASR, 1);
4300   BIND(DONE);
4301 }
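
// The linear-search fallback above is, in effect (a C sketch over jchar
// data; the generated code compares several characters per iteration):
//
//   for (int j = 0; j <= cnt2 - cnt1; j++) {
//     int i;
//     for (i = 0; i < cnt1 && str1[i] == str2[j + i]; i++);
//     if (i == cnt1) return j;
//   }
//   return -1;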
4302 
4303 // Compare strings.
4304 void MacroAssembler::string_compare(Register str1, Register str2,
4305                                     Register cnt1, Register cnt2, Register result,
4306                                     Register tmp1) {
4307   Label LENGTH_DIFF, DONE, SHORT_LOOP, SHORT_STRING,
4308     NEXT_WORD, DIFFERENCE;
4309 
4310   BLOCK_COMMENT("string_compare {");
4311 
4312   // Compute the minimum of the string lengths and save the difference.
4313   subsw(tmp1, cnt1, cnt2);
4314   cselw(cnt2, cnt1, cnt2, Assembler::LE); // min
4315 
4316   // A very short string
4317   cmpw(cnt2, 4);
4318   br(Assembler::LT, SHORT_STRING);
4319 
4320   // Check if the strings start at the same location.
4321   cmp(str1, str2);
4322   br(Assembler::EQ, LENGTH_DIFF);
4323 
4324   // Compare longwords
4325   {
4326     subw(cnt2, cnt2, 4); // The last longword is a special case
4327 
4328     // Move both string pointers to the last longword of their
4329     // strings, negate the remaining count, and convert it to bytes.
4330     lea(str1, Address(str1, cnt2, Address::uxtw(1)));
4331     lea(str2, Address(str2, cnt2, Address::uxtw(1)));
4332     sub(cnt2, zr, cnt2, LSL, 1);
4333 
    // Loop, loading longwords and comparing them; the XOR of each pair
    // is left in rscratch2, so a non-zero value flags a difference.
4335     bind(NEXT_WORD);
4336     ldr(result, Address(str1, cnt2));
4337     ldr(cnt1, Address(str2, cnt2));
4338     adds(cnt2, cnt2, wordSize);
4339     eor(rscratch2, result, cnt1);
4340     cbnz(rscratch2, DIFFERENCE);
4341     br(Assembler::LT, NEXT_WORD);
4342 
4343     // Last longword.  In the case where length == 4 we compare the
4344     // same longword twice, but that's still faster than another
4345     // conditional branch.
4346 
4347     ldr(result, Address(str1));
4348     ldr(cnt1, Address(str2));
4349     eor(rscratch2, result, cnt1);
4350     cbz(rscratch2, LENGTH_DIFF);
4351 
4352     // Find the first different characters in the longwords and
4353     // compute their difference.
4354     bind(DIFFERENCE);
4355     rev(rscratch2, rscratch2);
4356     clz(rscratch2, rscratch2);
4357     andr(rscratch2, rscratch2, -16);
4358     lsrv(result, result, rscratch2);
4359     uxthw(result, result);
4360     lsrv(cnt1, cnt1, rscratch2);
4361     uxthw(cnt1, cnt1);
4362     subw(result, result, cnt1);
4363     b(DONE);
4364   }
4365 
4366   bind(SHORT_STRING);
4367   // Is the minimum length zero?
4368   cbz(cnt2, LENGTH_DIFF);
4369 
4370   bind(SHORT_LOOP);
4371   load_unsigned_short(result, Address(post(str1, 2)));
4372   load_unsigned_short(cnt1, Address(post(str2, 2)));
4373   subw(result, result, cnt1);
4374   cbnz(result, DONE);
4375   sub(cnt2, cnt2, 1);
4376   cbnz(cnt2, SHORT_LOOP);
4377 
4378   // Strings are equal up to min length.  Return the length difference.
4379   bind(LENGTH_DIFF);
4380   mov(result, tmp1);
4381 
4382   // That's it
4383   bind(DONE);
4384 
4385   BLOCK_COMMENT("} string_compare");
4386 }
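
// Result semantics, sketched in C over jchar (UTF-16) data, mirroring what
// String.compareTo expects:
//
//   int compare(const jchar* a, int la, const jchar* b, int lb) {
//     int min = la < lb ? la : lb;
//     for (int i = 0; i < min; i++)
//       if (a[i] != b[i]) return (int)a[i] - (int)b[i];
//     return la - lb;                    // the saved length difference
//   }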
4387 
4388 
4389 void MacroAssembler::string_equals(Register str1, Register str2,
4390                                    Register cnt, Register result,
4391                                    Register tmp1) {
4392   Label SAME_CHARS, DONE, SHORT_LOOP, SHORT_STRING,
4393     NEXT_WORD;
4394 
4395   const Register tmp2 = rscratch1;
4396   assert_different_registers(str1, str2, cnt, result, tmp1, tmp2, rscratch2);
4397 
4398   BLOCK_COMMENT("string_equals {");
4399 
4400   // Start by assuming that the strings are not equal.
4401   mov(result, zr);
4402 
4403   // A very short string
4404   cmpw(cnt, 4);
4405   br(Assembler::LT, SHORT_STRING);
4406 
4407   // Check if the strings start at the same location.
4408   cmp(str1, str2);
4409   br(Assembler::EQ, SAME_CHARS);
4410 
4411   // Compare longwords
4412   {
4413     subw(cnt, cnt, 4); // The last longword is a special case
4414 
4415     // Move both string pointers to the last longword of their
4416     // strings, negate the remaining count, and convert it to bytes.
4417     lea(str1, Address(str1, cnt, Address::uxtw(1)));
4418     lea(str2, Address(str2, cnt, Address::uxtw(1)));
4419     sub(cnt, zr, cnt, LSL, 1);
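    // Same negative-offset loop technique as in string_compare above.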
4420 
    // Loop, loading a longword from each string and XORing them into
    // rscratch2; a nonzero result means the longwords differ.
4422     bind(NEXT_WORD);
4423     ldr(tmp1, Address(str1, cnt));
4424     ldr(tmp2, Address(str2, cnt));
4425     adds(cnt, cnt, wordSize);
4426     eor(rscratch2, tmp1, tmp2);
4427     cbnz(rscratch2, DONE);
4428     br(Assembler::LT, NEXT_WORD);
4429 
4430     // Last longword.  In the case where length == 4 we compare the
4431     // same longword twice, but that's still faster than another
4432     // conditional branch.
4433 
4434     ldr(tmp1, Address(str1));
4435     ldr(tmp2, Address(str2));
4436     eor(rscratch2, tmp1, tmp2);
4437     cbz(rscratch2, SAME_CHARS);
4438     b(DONE);
4439   }
4440 
4441   bind(SHORT_STRING);
4442   // Is the length zero?
4443   cbz(cnt, SAME_CHARS);
4444 
4445   bind(SHORT_LOOP);
4446   load_unsigned_short(tmp1, Address(post(str1, 2)));
4447   load_unsigned_short(tmp2, Address(post(str2, 2)));
4448   subw(tmp1, tmp1, tmp2);
4449   cbnz(tmp1, DONE);
4450   sub(cnt, cnt, 1);
4451   cbnz(cnt, SHORT_LOOP);
4452 
4453   // Strings are equal.
4454   bind(SAME_CHARS);
4455   mov(result, true);
4456 
4457   // That's it
4458   bind(DONE);
4459 
4460   BLOCK_COMMENT("} string_equals");
4461 }
4462 
// Compare char[] arrays whose elements start on a 4-byte boundary,
// setting result to 1 (true) if the arrays are equal and to 0 (false)
// otherwise.
4464 void MacroAssembler::char_arrays_equals(Register ary1, Register ary2,
4465                                         Register result, Register tmp1)
4466 {
4467   Register cnt1 = rscratch1;
4468   Register cnt2 = rscratch2;
4469   Register tmp2 = rscratch2;
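  // Note that cnt2 and tmp2 both alias rscratch2.  This is safe because
  // cnt2 is dead after the length comparison below, before tmp2 is first
  // used.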
4470 
4471   Label SAME, DIFFER, NEXT, TAIL03, TAIL01;
4472 
4473   int length_offset  = arrayOopDesc::length_offset_in_bytes();
4474   int base_offset    = arrayOopDesc::base_offset_in_bytes(T_CHAR);
4475 
4476   BLOCK_COMMENT("char_arrays_equals  {");
4477 
4478     // different until proven equal
4479     mov(result, false);
4480 
4481     // same array?
4482     cmp(ary1, ary2);
4483     br(Assembler::EQ, SAME);
4484 
    // not equal if either array is null
4486     cbz(ary1, DIFFER);
4487     cbz(ary2, DIFFER);
4488 
4489     // lengths ne?
4490     ldrw(cnt1, Address(ary1, length_offset));
4491     ldrw(cnt2, Address(ary2, length_offset));
4492     cmp(cnt1, cnt2);
4493     br(Assembler::NE, DIFFER);
4494 
4495     lea(ary1, Address(ary1, base_offset));
4496     lea(ary2, Address(ary2, base_offset));
4497 
4498     subs(cnt1, cnt1, 4);
4499     br(LT, TAIL03);
4500 
4501   BIND(NEXT);
4502     ldr(tmp1, Address(post(ary1, 8)));
4503     ldr(tmp2, Address(post(ary2, 8)));
4504     subs(cnt1, cnt1, 4);
4505     eor(tmp1, tmp1, tmp2);
4506     cbnz(tmp1, DIFFER);
4507     br(GE, NEXT);
4508 
4509   BIND(TAIL03);  // 0-3 chars left, cnt1 = #chars left - 4
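    // Only the low two bits of cnt1 matter here: subtracting 4 leaves
    // them equal to (#chars left) mod 4, so bit 1 selects a 2-char tail
    // and bit 0 a final single char.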
4510     tst(cnt1, 0b10);
4511     br(EQ, TAIL01);
4512     ldrw(tmp1, Address(post(ary1, 4)));
4513     ldrw(tmp2, Address(post(ary2, 4)));
4514     cmp(tmp1, tmp2);
4515     br(NE, DIFFER);
4516   BIND(TAIL01);  // 0-1 chars left
4517     tst(cnt1, 0b01);
4518     br(EQ, SAME);
4519     ldrh(tmp1, ary1);
4520     ldrh(tmp2, ary2);
4521     cmp(tmp1, tmp2);
4522     br(NE, DIFFER);
4523 
4524   BIND(SAME);
4525     mov(result, true);
4526   BIND(DIFFER); // result already set
4527 
4528   BLOCK_COMMENT("} char_arrays_equals");
4529 }
4530 
// Encode char[] to byte[] in ISO_8859_1.  Encoding stops at the first
// char that does not fit in a byte; result is set to the number of
// chars successfully encoded.
void MacroAssembler::encode_iso_array(Register src, Register dst,
                                      Register len, Register result,
                                      FloatRegister Vtmp1, FloatRegister Vtmp2,
                                      FloatRegister Vtmp3, FloatRegister Vtmp4)
4536 {
4537     Label DONE, NEXT_32, LOOP_8, NEXT_8, LOOP_1, NEXT_1;
4538     Register tmp1 = rscratch1;
4539 
4540       mov(result, len); // Save initial len
4541 
4542 #ifndef BUILTIN_SIM
4543       subs(len, len, 32);
4544       br(LT, LOOP_8);
4545 
      // The following code uses the SIMD 'uqxtn' and 'uqxtn2' instructions
      // to narrow chars to bytes with unsigned saturation.  These set the
      // cumulative saturation ('QC') bit in the FPSR if any char does not
      // fit in a byte, so we clear the FPSR first and test it after each
      // block of conversions.
4549       clear_fpsr();
4550 
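      // Example: narrowing the char 0x0151 with uqxtn saturates it to 0xff
      // and sets QC; the whole 32-char block is then abandoned (nothing is
      // stored and src is not advanced) and its chars are retried by the
      // narrower loops below.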
4551     BIND(NEXT_32);
4552       ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src);
4553       uqxtn(Vtmp1, T8B, Vtmp1, T8H);  // uqxtn  - write bottom half
4554       uqxtn(Vtmp1, T16B, Vtmp2, T8H); // uqxtn2 - write top half
4555       uqxtn(Vtmp2, T8B, Vtmp3, T8H);
4556       uqxtn(Vtmp2, T16B, Vtmp4, T8H); // uqxtn2
4557       get_fpsr(tmp1);
4558       cbnzw(tmp1, LOOP_8);
4559       st1(Vtmp1, Vtmp2, T16B, post(dst, 32));
4560       subs(len, len, 32);
4561       add(src, src, 64);
4562       br(GE, NEXT_32);
4563 
4564     BIND(LOOP_8);
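      // len is currently (chars remaining - 32); adding 24 re-biases it
      // to (chars remaining - 8) for this 8-chars-at-a-time loop.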
4565       adds(len, len, 32-8);
4566       br(LT, LOOP_1);
4567       clear_fpsr(); // QC may be set from loop above, clear again
4568     BIND(NEXT_8);
4569       ld1(Vtmp1, T8H, src);
4570       uqxtn(Vtmp1, T8B, Vtmp1, T8H);
4571       get_fpsr(tmp1);
4572       cbnzw(tmp1, LOOP_1);
4573       st1(Vtmp1, T8B, post(dst, 8));
4574       subs(len, len, 8);
4575       add(src, src, 16);
4576       br(GE, NEXT_8);
4577 
4578     BIND(LOOP_1);
4579       adds(len, len, 8);
4580       br(LE, DONE);
4581 #else
4582       cbz(len, DONE);
4583 #endif
4584     BIND(NEXT_1);
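      // Scalar tail: copy one char at a time, stopping at the first char
      // with any of bits 8-15 set, since such a char has no single-byte
      // ISO-8859-1 encoding.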
4585       ldrh(tmp1, Address(post(src, 2)));
4586       tst(tmp1, 0xff00);
4587       br(NE, DONE);
4588       strb(tmp1, Address(post(dst, 1)));
4589       subs(len, len, 1);
4590       br(GT, NEXT_1);
4591 
4592     BIND(DONE);
4593       sub(result, result, len); // Return index where we stopped
4594 }