/*
 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * Copyright (c) 2015, Linaro Ltd. All rights reserved.
 * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_aarch32.hpp"
#include "oops/accessDecorators.hpp"
// This ifdef was introduced so that a core (no-compiler) build can be made.
#ifdef COMPILER2
#include "opto/compile.hpp"
#include "opto/node.hpp"
#endif

#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
// FIXME: this is not a nice fix; this constant was in a compiler2 header
#define MAX_stubs_size_div2 (128 / 2)
// FIXME END

// Patch any kind of PC-relative instruction; there may be several
// instructions in a sequence.  All literal modes that use the PC need
// the offset adjusted for the ARM pipeline: in ARM state the PC reads
// as the address of the current instruction plus 8 (two instructions
// ahead).  Return the total length (in bytes) of the patched
// instructions.

int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  int instructions = 1;
  long offset = target - (branch + 8); // PC reads as this instruction + 8
  bool add = offset >= 0;
  unsigned insn = *(unsigned*)branch;
  int opc = Instruction_aarch32::extract(insn, 27, 24);

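  // For example (illustrative): a B at 0x8000 targeting 0x9000 stores
  // (0x9000 - 0x8008) / 4 = 0x3fe in its signed 24-bit offset field.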
  if (0b1010 == opc || 0b1011 == opc) {
    // Branch or branch with link
    assert(0 == (offset & 3), "not aligned correctly");
    Instruction_aarch32::spatch(branch, 23, 0, offset / 4);
  } else if (0b0011 == opc) {
    // Movw, movt or mov, orr, orr, orr
    // patch up address load to registers (absolute address).
    instructions = patch_oop(branch, target) / NativeInstruction::arm_insn_sz;
  } else if (0b010 == (opc >> 1)) {
    // LDR, LDRB, STR, STRB
    Instruction_aarch32::patch(branch, 11, 0, uabs(offset));
    Instruction_aarch32::patch(branch, 23, 23, add);
  } else if (0b000 == (opc >> 1)) {
    // LDRH, LDRSH, LDRSB, LDRD, STRH, STRD
    offset = uabs(offset);
    Instruction_aarch32::patch(branch, 3, 0, offset & 0xf);
    Instruction_aarch32::patch(branch, 11, 8, offset >> 4);
    Instruction_aarch32::patch(branch, 23, 23, add);
  } else if (0b1101 == opc) {
    // VLDR, VSTR - NOTE VSTR(lit) is deprecated
    offset = uabs(offset);
    assert(0 == (offset & 3), "vldr, vstr can't do unaligned access");
    Instruction_aarch32::patch(branch, 7, 0, offset >> 2);
    Instruction_aarch32::patch(branch, 23, 23, add);
  } else if (0b0010 == opc) {
    // ADR
    Instruction_aarch32::patch(branch, 11, 0, encode_imm12(uabs(offset)));
    Instruction_aarch32::patch(branch, 23, 22, add ? 0b10 : 0b01);
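    // Encoding note: this ADR form is a data-processing instruction whose
    // opcode bits 23:22 select ADD (0b10, pc + imm) or SUB (0b01, pc - imm).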
  } else {
    ShouldNotReachHere();
  }
  // aarch64 had something for polling page load?
  return instructions * NativeInstruction::arm_insn_sz;
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  unsigned insn = *(unsigned*)insn_addr;
  int opc = Instruction_aarch32::extract(insn, 27, 21);
  if (0b0011000 == opc) {
    // 32-bit pointers, formed of a movw and a movt
    assert(nativeInstruction_at(insn_addr+4)->is_movt(), "wrong insns in patch");

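    // For example (illustrative), patching in 0xCAFEBABE yields
    //   movw rd, #0xBABE ; movt rd, #0xCAFE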
    uint32_t btm = (uint32_t)o & 0xffff;
    Instruction_aarch32::patch(insn_addr, 19, 16, btm >> 12);
    Instruction_aarch32::patch(insn_addr, 11, 0, btm & 0xfff);
    uint32_t top = (uint32_t)o >> 16;
    Instruction_aarch32::patch(insn_addr + 4, 19, 16, top >> 12);
    Instruction_aarch32::patch(insn_addr + 4, 11, 0, top & 0xfff);
    return 2 * NativeInstruction::arm_insn_sz;
  } else if (0b0011101 == opc) {
    // Alternative 32-bit load sequence: mov, orr, orr, orr
    assert(nativeInstruction_at(insn_addr+4 )->is_orr(), "wrong insns in patch");
    assert(nativeInstruction_at(insn_addr+8 )->is_orr(), "wrong insns in patch");
    assert(nativeInstruction_at(insn_addr+12)->is_orr(), "wrong insns in patch");
    // FIXME this could carry us outside valid memory

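    // Each instruction carries one byte of the address as an ARM imm12:
    // a 4-bit rotation field plus an 8-bit immediate, where the value is
    // imm8 rotated right by twice the rotation field.  Rotations 0b1100,
    // 0b1000 and 0b0100 place the byte at bits 15:8, 23:16 and 31:24.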
    uint32_t addr = (uint32_t)o;
    Instruction_aarch32::patch(insn_addr + 0,  11, 0, (0b0000 << 8) | ((addr >>  0) & 0xff));
    Instruction_aarch32::patch(insn_addr + 4,  11, 0, (0b1100 << 8) | ((addr >>  8) & 0xff));
    Instruction_aarch32::patch(insn_addr + 8,  11, 0, (0b1000 << 8) | ((addr >> 16) & 0xff));
    Instruction_aarch32::patch(insn_addr + 12, 11, 0, (0b0100 << 8) | ((addr >> 24) & 0xff));
    return 4 * NativeInstruction::arm_insn_sz;
  } else {
    ShouldNotReachHere();
  }
  return 0; // won't reach here
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  int opc = Instruction_aarch32::extract(insn, 27, 24);

  if (0b1010 == opc || 0b1011 == opc) {
    // Branch or branch with link
    offset = Instruction_aarch32::sextract(insn, 23, 0) * 4;
  } else if (0b0011 == opc) {
    unsigned *insn_buf = (unsigned*)insn_addr;
    int opc2 = Instruction_aarch32::extract(insn, 23, 21);
    if (0b000 == opc2) {
      // movw, movt (only on newer ARMs)
      assert(nativeInstruction_at(&insn_buf[1])->is_movt(), "wrong insns in patch");
      uint32_t addr;
      addr  = Instruction_aarch32::extract(insn_buf[1], 19, 16) << 28;
      addr |= Instruction_aarch32::extract(insn_buf[1], 11, 0) << 16;
      addr |= Instruction_aarch32::extract(insn_buf[0], 19, 16) << 12;
      addr |= Instruction_aarch32::extract(insn_buf[0], 11, 0);
      return address(addr);
    } else if (0b101 == opc2) {
      // mov, orr, orr, orr
      assert(nativeInstruction_at(&insn_buf[1])->is_orr(), "wrong insns in patch");
      assert(nativeInstruction_at(&insn_buf[2])->is_orr(), "wrong insns in patch");
      assert(nativeInstruction_at(&insn_buf[3])->is_orr(), "wrong insns in patch");
      uint32_t addr;
      // TODO Check that the rotations are in the expected order.
      addr  = Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[0], 11, 0));
      addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[1], 11, 0));
      addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[2], 11, 0));
      addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[3], 11, 0));
      return address(addr);
    } else {
      ShouldNotReachHere();
    }
  } else if (0b010 == (opc >> 1)) {
    // LDR, LDRB, STR, STRB
    offset = Instruction_aarch32::extract(insn, 11, 0);
    bool add = Instruction_aarch32::extract(insn, 23, 23);
    offset = add ? offset : -offset;
  } else if (0b000 == (opc >> 1)) {
    // LDRH, LDRSH, LDRSB, LDRD, STRH, STRD
    offset = Instruction_aarch32::extract(insn, 3, 0);
    offset |= Instruction_aarch32::extract(insn, 11, 8) << 4;
    bool add = Instruction_aarch32::extract(insn, 23, 23);
    offset = add ? offset : -offset;
  } else if (0b1101 == opc) {
    // VLDR, VSTR - NOTE VSTR(lit) is deprecated
    offset = Instruction_aarch32::extract(insn, 7, 0) << 2;
    bool add = Instruction_aarch32::extract(insn, 23, 23);
    offset = add ? offset : -offset;
  } else if (0b0010 == opc) {
    // ADR
    offset = decode_imm12(Instruction_aarch32::extract(insn, 11, 0));
    int code = Instruction_aarch32::extract(insn, 23, 22);
    switch (code) {
      case 0b01: offset = -offset; break;
      case 0b10:                   break;
      default: ShouldNotReachHere();
    }
  } else {
    ShouldNotReachHere();
  }
  // Correct the offset for the PC
  offset += 8;
  return address(((uint32_t)insn_addr + offset));
}


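// Note: the thread and tmp arguments appear to be unused on this port;
// a hardware memory barrier serves as the memory-serialization point.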
void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  dmb(Assembler::ISH);
}

void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    mov(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()));
    ldr(rscratch1, Address(rscratch1));
    cmp(rscratch1, SafepointSynchronize::_not_synchronized);
    b(slow_path, Assembler::NE);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldr(rscratch1, rscratch1);
    dmb(Assembler::ISH);
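    // ldr followed by dmb yields the required load-acquire ordering:
    // ARMv7 has no dedicated acquiring load instruction.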
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  mov(rscratch1, 0);
  // we must set sp to zero to clear frame
  str(rscratch1, Address(rthread, JavaThread::last_Java_sp_offset()));
  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(rscratch1, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(rscratch1, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp & sp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = sp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  if (last_java_pc != NULL) {
    adr(scratch, last_java_pc);
  } else {
    // FIXME: This is almost never correct.  We should delete all
    // cases of set_last_Java_frame with last_java_pc=NULL and use the
    // correct return address instead.
    adr(scratch, pc());
  }

  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf) {
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    lea(lr, entry);
    if (cbuf) cbuf->set_insts_mark();
    bl(lr);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far jump not found in code cache");
  if (far_branches()) {
    lea(tmp, entry);
    if (cbuf) cbuf->set_insts_mark();
    b(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  b(no_reserved_zone_enabling, Assembler::LO);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  bl(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  b(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         Register tmp_reg2,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert(tmp_reg != noreg, "must be real register");
  assert_different_registers(obj_reg, swap_reg, tmp_reg, tmp_reg2);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, markOopDesc::biased_lock_pattern);
  b(cas_label, Assembler::NE);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
//  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  bic(tmp_reg, tmp_reg, markOopDesc::age_mask_in_place);
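  // bic computes tmp_reg & ~age_mask_in_place, the and-not form of the
  // commented-out andr above, so the age bits are ignored in the compare.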
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_inc(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, tmp_reg2);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(tmp_reg2, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(tmp_reg2, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(tmp_reg2, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(tmp_reg2, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(tmp_reg2, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, tmp_reg2);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, tmp_reg2, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_inc(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, tmp_reg2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, tmp_reg2, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_inc(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, tmp_reg2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, tmp_reg2, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_inc(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  tmp_reg2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, markOopDesc::biased_lock_pattern);
  b(done, Assembler::EQ);
}


static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = sp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch2);


  // FIXME: can we save lr in a more elegant way?
  //str(lr, pre(sp, -wordSize));

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  //ldr(lr, post(sp, wordSize));

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch2, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch2, ok);

    lea(rscratch2, RuntimeAddress(StubRoutines::forward_exception_entry()));
    // forward_exception uses LR to choose the exception handler, but LR was
    // trashed by the code above.  Since we get here from interpreted code,
    // BL is an acceptable way to acquire the correct LR
    // (see StubGenerator::generate_forward_exception).
    bl(rscratch2);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.

void MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  if (cbuf) {
    cbuf->set_insts_mark();
  }

  if (far_branches()) {
    // Build the trampoline so that the destination address is a raw
    // 4-byte value, allowing it to be patched atomically.
    relocate(entry.rspec());
    address start = pc();
    add(lr, r15_pc, NativeCall::instruction_size - 2 * NativeInstruction::arm_insn_sz);
    ldr(r15_pc, Address(r15_pc, 4));
    emit_int32((uintptr_t) entry.target());
    // possibly pad the call to the NativeCall size to make patching happy
    while (pc() - start < NativeCall::instruction_size) {
      nop();
    }
    assert(pc() - start == NativeCall::instruction_size, "fix NativeTrampolineCall::instruction_size!");
  } else {
    bl(entry);
  }
}

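// Set r0 to 1 if its low byte is non-zero, 0 otherwise.  Note that this
// operates on r0 directly; the x argument appears to be unused on this port.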
void MacroAssembler::c2bool(Register x) {
  ands(r0, r0, 0xff);
  mov(r0, 1, Assembler::NE);
}

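// An inline-cache call: rscratch2 carries the IC cache value, initialized
// here to non_oop_word() so the first invocation takes the resolution path.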
void MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  assert(oop_result != rscratch2, "can't be");
  mov(rscratch2, 0);
  str(rscratch2, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  assert(metadata_result != rscratch2 &&
         java_thread != rscratch2, "can't be");
  mov(rscratch2, 0);
  str(rscratch2, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(InstanceKlass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldr(scan_temp, Address(recv_klass, in_bytes(InstanceKlass::vtable_length_offset())));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, lsl(2)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, itable_index.is_register() ?
            Address(recv_klass, itable_index, lsl(2)) :
            Address(recv_klass, itable_index.as_constant() << 2));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      b(found_method, Assembler::EQ);
    } else {
      b(search, Assembler::NE);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  if (return_method) {
    // Got a hit.
    ldr(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(InstanceKlass::vtable_start_offset());
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();
  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    if (is_valid_for_offset_imm(vtable_offset_in_bytes, 12)) {
      ldr(method_result, Address(recv_klass, vtable_offset_in_bytes));
    } else {
      mov(method_result, vtable_offset_in_bytes);
      ldr(method_result, Address(recv_klass, method_result));
    }
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                           Register super_klass,
                           Register temp_reg,
                           Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  b(*L_success, Assembler::EQ);

  // Check the supertype display:
  if (must_load_sco) {
    ldr(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr); // load displayed supertype
  cmp(super_klass, rscratch1);

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    b(*L_success, Assembler::EQ);
    cmp(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      b(*L_slow_path, Assembler::EQ);
    } else {
      b(*L_failure, Assembler::NE);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      b(*L_success, Assembler::EQ);
    } else {
      b(*L_slow_path, Assembler::NE);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      b(*L_success, Assembler::EQ);
    } else {
      b(*L_failure, Assembler::NE);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label loop, fail, found;
  cmp(count, 0);
  b(fail, EQ);

  bind(loop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  b(found, EQ);
  subs(count, count, 1);
  b(loop, NE);

  bind(fail);
  cmp(sp, 0); // sp is never zero: forces NE so callers see "not found"
  bind(found);
}

// Form an address from base + offset in Rd.  Rd may or may
// not actually be used: you must use the Address that is returned.
// It is up to you to ensure that the shift provided matches the size
// of your data.
Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
  // The result of form_address should only be used with ldr/str instructions;
  // otherwise provide the exact type instead of IDT_INT, or apply safe_for().
  if (Address::offset_ok_for_immed(byte_offset, Address::IDT_INT))
    // It fits; no need for any heroics
    return Address(base, byte_offset);

  // See if we can do this with two 12-bit offsets
  {
    unsigned long masked_offset = byte_offset & ~0xfff;
    if (Address::offset_ok_for_immed(byte_offset - masked_offset, Address::IDT_INT)
        && Assembler::operand_valid_for_add_sub_immediate(masked_offset)) {
      add(Rd, base, masked_offset);
      byte_offset -= masked_offset;
      return Address(Rd, byte_offset);
    }
  }

  // Do it the hard way
  mov(Rd, byte_offset);
  add(Rd, base, Rd);
  return Address(Rd);
}
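
// Illustrative use (hypothetical offset): form_address(rscratch1, r1, 0x12345, 2)
// emits "add rscratch1, r1, #0x12000" and returns Address(rscratch1, 0x345),
// which a following ldr/str can then encode directly.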

// scans count 4 byte words at [addr] for occurrence of value,
// generic
/*void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  b(Lexit, EQ);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}*/

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan helper uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by the array-length load below

  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r14))   pushed_registers += r14;

  if (super_klass != r0) {
    if (!IS_A_TEMP(r0))  pushed_registers += r0;
  }

  push(pushed_registers, sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r14, secondary_supers_addr);
  // Load the array length.
  ldr(r2, Address(r14, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r14, r14, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, 0); // Clear Z flag; SP is never zero
  // Scan R2 words at [R14] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r14, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  b(*L_failure, Assembler::NE);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register name and message to the verify_oop subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  stmdb(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());

  mov(r0, reg);
  mov(rscratch1, (address)b);
  mrs(r1);
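  // mrs/msr save and restore the CPSR so the caller's condition flags
  // survive the subroutine call.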

  // call indirectly to solve generation ordering problem
  reg_printf("Verify oop entry, sp = %p, rfp = %p\n", sp, rfp);
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  bl(rscratch2);
  reg_printf("Verify oop exit,  sp = %p, rfp = %p\n", sp, rfp);

  msr(r1);
  ldmia(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());

  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stmdb(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());
  mrs(r1);

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    lea(r0, addr);
    ldr(r0, Address(r0, 5 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  bl(rscratch2);

  msr(r1);
  ldmia(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());

  BLOCK_COMMENT("} verify_oop_addr");
}

Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  if (arg_slot.is_constant()) {
    return Address(sp, arg_slot.as_constant() * stackElementSize
                   + offset);
  } else {
    add(rscratch1, sp, arg_slot.as_register(),
        lsl(exact_log2(stackElementSize)));
    return Address(rscratch1, offset);
  }
}

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments,
                                       Label *retaddr) {
  Label E, L;

  // FIXME: do this alignment in a more elegant way
  mov(rscratch2, sp);
  sub(sp, sp, wordSize);
  bic(sp, sp, 2 * wordSize - 1); // Align to eight bytes
  str(rscratch2, Address(sp));
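  // The old sp is stored in the first slot below the newly aligned sp,
  // so the single "ldr(sp, Address(sp))" below undoes the alignment.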

  // FIXME Do we need to preserve rscratch2?
  //str(rscratch2, Address(pre(sp, -wordSize)));

  mov(rscratch2, entry_point);
  reg_printf("\tJust about to call into the VM, rfp = %p\n", rfp);
  bl(rscratch2);
  if (retaddr)
    bind(*retaddr);
  reg_printf("\tReturned from call into the VM, rfp = %p\n", rfp);

  //ldr(rscratch2, Address(post(sp, wordSize)));

  // Undo the alignment
  ldr(sp, Address(sp));

  maybe_isb();
}

void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
                                  Register arg_1, Register arg_2) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  pass_arg2(this, arg_2);
  call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert(arg_0 != c_rarg3, "smashed arg");
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}
1375 
1376 // Clobbers rscratch1
1377 void MacroAssembler::null_check(Register reg, int offset) {
1378   if (needs_explicit_null_check(offset)) {
1379     // provoke an OS NULL exception if reg == NULL by loading from
1380     // M[reg]; the load lands in rscratch1, hence the clobber noted above
1381     // NOTE: a single load is plenty to provoke a SEGV
1382     reg_printf("Generating OS check null with ptr = %p\n", reg);
1383     assert(reg != rscratch1, "can't be");
1384     ldr(rscratch1, Address(reg));
1385   } else {
1386     // nothing to do; a (later) access of M[reg + offset]
1387     // will provoke an OS NULL exception if reg == NULL
1388   }
1389 }
1390 
1391 // MacroAssembler protected routines needed to implement
1392 // public methods
1393 
1394 void MacroAssembler::mov(Register r, Address dest, Condition cond) {
1395   code_section()->relocate(pc(), dest.rspec());
1396   uint32_t imm32 = (uint32_t)dest.target();
1397   movptr(r, imm32, cond);
1398 }
1399 
1400 // Move a constant pointer into r.  On aarch32 the address space is
1401 // 32 bits in size, so a pointer can be encoded in two mov
1402 // instructions.
1403 void MacroAssembler::movptr(Register r, uintptr_t imm32, Condition cond) {
1404 #ifndef PRODUCT
1405   {
1406     char buffer[64];
1407     snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1408     block_comment(buffer);
1409   }
1410 #endif
1411   Assembler::mov_immediate32(r, imm32, cond, false);
1412 }
1413 
1414 void MacroAssembler::ret(Register reg) {
1415   assert(reg == lr, "Can do return only to LR");
1416   b(lr);
1417 }
1418 
1419 void MacroAssembler::atomic_inc(Register counter_addr, Register tmp) {
1420   Label retry_load;
1421   bind(retry_load);
1422   // flush and load exclusive from the memory location
1423   ldrex(tmp, counter_addr);
1424   add(tmp, tmp, 1);
1425   // if we store+flush with no intervening write, tmp will be zero
1426   strex(tmp, tmp, counter_addr);
1427   cmp(tmp, 0);
1428   b(retry_load, Assembler::NE);
1429 }
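     // Hedged C-style sketch of atomic_inc; the pseudo-intrinsics
     // load_exclusive/store_exclusive stand in for the ldrex/strex emitted
     // above:
     //
     //   do {
     //     tmp = load_exclusive(counter_addr);         // ldrex
     //     tmp = tmp + 1;
     //   } while (store_exclusive(tmp, counter_addr)); // strex: 0 on success
     //
     // Note that tmp is reused: strex stores the incremented value, then
     // overwrites tmp with the 0/1 status flag tested by the cmp/b.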
1430 
1431 
1432 // MacroAssembler routines found actually to be needed
1433 
1434 void MacroAssembler::push(Register src)
1435 {
1436   str(src, Address(pre(sp, -1 * wordSize)));
1437 }
1438 
1439 void MacroAssembler::pop(Register dst)
1440 {
1441   ldr(dst, Address(post(sp, 1 * wordSize)));
1442 }
1443 
1444 // Note: load_unsigned_short used to be called load_unsigned_word.
1445 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1446   int off = offset();
1447   ldrh(dst, src);
1448   return off;
1449 }
1450 
1451 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1452   int off = offset();
1453   ldrb(dst, src);
1454   return off;
1455 }
1456 
1457 int MacroAssembler::load_signed_short(Register dst, Address src) {
1458   int off = offset();
1459   ldrsh(dst, src);
1460   return off;
1461 }
1462 
1463 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1464   int off = offset();
1465   ldrsb(dst, src);
1466   return off;
1467 }
1468 
1469 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1470   switch (size_in_bytes) {
1471   //case  8:  ldr(dst, src); break;
1472   case  4:  ldr(dst, src); break;
1473   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1474   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1475   default:  ShouldNotReachHere();
1476   }
1477 }
1478 
1479 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1480   switch (size_in_bytes) {
1481   //case  8:  str(src, dst); break;
1482   case  4:  str(src, dst); break;
1483   case  2:  strh(src, dst); break;
1484   case  1:  strb(src, dst); break;
1485   default:  ShouldNotReachHere();
1486   }
1487 }
1488 
1489 void MacroAssembler::decrement(Register reg, int value) {
1490   if (value < 0) {
1491     increment(reg, -value);
1492     return;
1493   }
1494   if (value == 0) {
1495     return;
1496   }
1497   if (operand_valid_for_add_sub_immediate(value)) {
1498     sub(reg, reg, value);
1499     return;
1500   }
1501   assert(reg != rscratch2, "invalid register for decrement");
1502   mov(rscratch2, (unsigned int) value);
1503   sub(reg, reg, rscratch2);
1504 }
1505 
1506 void MacroAssembler::decrement(Address dst, int value) {
1507   assert(!dst.uses(rscratch1), "invalid address for decrement");
1508   ldr(rscratch1, dst);
1509   decrement(rscratch1, value);
1510   str(rscratch1, dst);
1511 }
1512 
1513 void MacroAssembler::increment(Register reg, int value) {
1514   if (value < 0) {
1515     decrement(reg, -value);
1516     return;
1517   }
1518   if (value == 0) {
1519     return;
1520   }
1521   if (operand_valid_for_add_sub_immediate(value)) {
1522     add(reg, reg, value);
1523     return;
1524   }
1525   assert(reg != rscratch2, "invalid register for increment");
1526   mov(rscratch2, (unsigned int) value);
1527   add(reg, reg, rscratch2);
1528 }
1529 
1530 void MacroAssembler::increment(Address dst, int value) {
1531   assert(!dst.uses(rscratch1), "invalid address for increment");
1532   ldr(rscratch1, dst);
1533   increment(rscratch1, value);
1534   str(rscratch1, dst);
1535 }
1536 
1537 // Loads and stores everything except the pc and sp
1538 void MacroAssembler::pusha() {
1539   unsigned regset = 0b0101111111111111;
1540   stmdb(sp, regset);
1541 }
1542 void MacroAssembler::popa() {
1543   unsigned regset = 0b0101111111111111;
1544   ldmia(sp, regset);
1545 }
1546 
1547 static void multiple_reg_check(unsigned int bitset, Register stack) {
1548   const unsigned int pcbit = 1 << r15_pc->encoding();
1549   const unsigned int lrbit = 1 << lr->encoding();
1550   const unsigned int spbit = 1 << sp->encoding();
1551   const unsigned int stackbit = 1 << stack->encoding();
1552   assert(!(bitset & spbit), "The SP must not be in the list: ARM "
1553       "deprecates using these instructions with SP in the list.");
1554   assert(!(bitset & pcbit) || !(bitset & lrbit),
1555       "ARM deprecates using these instructions with both "
1556       "the LR and the PC in the list.");
1557   assert(!(bitset & stackbit), "Instructions with the base register "
1558       "in the list and ! specified are only available before ARMv7, "
1559       "and ARM deprecates the use of such instructions. "
1560       "The value of the base register after such an instruction is UNKNOWN");
1561 }
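     // Bit i of such a bitset selects register r_i; for example
     // (illustrative), RegSet::of(r0, r1, lr).bits() ==
     // (1 << 0) | (1 << 1) | (1 << 14). The asserts above therefore reject
     // deprecated encodings such as one including sp (bit 13), or both
     // lr (bit 14) and r15_pc (bit 15) at once.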
1562 
1563 // Push lots of registers in the bit set supplied.  Don't push sp.
1564 // Return the number of words pushed
1565 int MacroAssembler::push(unsigned int bitset, Register stack) {
1566   multiple_reg_check(bitset, stack);
1567   unsigned bc = bitset, count = 0, i;
1568   for(i = 0; i <= 15; i++) {
1569     if (1 & bc) count++;
1570     bc >>= 1;
1571   }
1572   // TODO Why did this previously only handle even quantities of registers?
1573   stmdb(stack, bitset);
1574   return count;
1575 }
1576 
1577 int MacroAssembler::pop(unsigned int bitset, Register stack) {
1578   multiple_reg_check(bitset, stack);
1579   unsigned bc = bitset, count = 0, i;
1580   for(i = 0; i <= 15; i++) {
1581     if (1 & bc) count++;
1582     bc >>= 1;
1583   }
1584   // TODO Why did this previously only handle even quantities of registers?
1585   ldmia(stack, bitset);
1586   return count;
1587 }
1588 
1589 void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
1590   Label done, not_weak;
1591   cbz(value, done);           // Use NULL as-is.
1592 
1593   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
1594   tbz(value, 0, not_weak);    // Test for jweak tag.
1595 
1596   // Resolve jweak.
1597 
1598   access_load_word_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
1599               value, Address(value, -JNIHandles::weak_tag_value), tmp, noreg);
1600   verify_oop(value);
1601   b(done);
1602 
1603 
1604   bind(not_weak);
1605   // Resolve (untagged) jobject.
1606   access_load_word_at(T_OBJECT, IN_NATIVE, value, Address(value), tmp, noreg);
1607   verify_oop(value);
1608   bind(done);
1609 }
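     // Tag arithmetic, by way of example: with weak_tag_mask == 1, a jweak
     // is a handle with bit 0 set, so the real slot of a weak handle lives
     // at value - JNIHandles::weak_tag_value (i.e. value - 1), while an
     // untagged jobject is dereferenced as-is.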
1610 
1611 void MacroAssembler::stop(const char* msg) {
1612   pusha();
1613   // Save old sp value
1614   add(rscratch2, sp, 14 * wordSize);
1615   str(rscratch2, Address(pre(sp, -4)));
1616   mov(c_rarg0, (address)msg);
1617   mov(c_rarg1, r15_pc);
1618   sub(c_rarg1, c_rarg1, 8); // Restore to actual value
1619   mov(c_rarg2, sp);
1620   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug32));
1621   bl(c_rarg3);
1622   hlt(0);
1623 }
1624 
1625 void MacroAssembler::unimplemented(const char* what) {
1626   const char* buf = NULL;
1627   {
1628     ResourceMark rm;
1629     stringStream ss;
1630     ss.print("unimplemented: %s", what);
1631     buf = code_string(ss.as_string());
1632   }
1633   stop(buf);
1634 }
1635 
1636 // this simulates the behaviour of the x86 cmpxchg instruction using a
1637 // load linked/store conditional pair. we use the acquire/release
1638 // versions of these instructions so that we flush pending writes as
1639 // per Java semantics.
1640 
1641 // n.b the x86 version assumes the old value to be compared against is
1642 // in rax and updates rax with the value located in memory if the
1643 // cmpxchg fails. we supply a register for the old value explicitly
1644 
1645 // the aarch32 load linked/store conditional instructions do not
1646 // accept an offset. so, unlike x86, we must provide a plain register
1647 // to identify the memory word to be compared/exchanged rather than a
1648 // register+offset Address.
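     // hedged C-style sketch of the loop below (tmp doubles as the loaded
     // value and as the strex status flag):
     //
     //   retry:
     //     tmp = load_exclusive(addr);                  // ldrex
     //     if (tmp != oldv) { membar(); oldv = tmp; goto fail; }
     //     if (store_exclusive(newv, addr)) goto retry; // lost exclusivity
     //     goto succeed;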
1649 
1650 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
1651                                 Label &succeed, Label *fail) {
1652   // oldv holds comparison value
1653   // newv holds value to write in exchange
1654   // addr identifies memory word to compare against/update
1655   // tmp returns 0/1 for success/failure
1656   Label retry_load, nope;
1657 
1658   bind(retry_load);
1659   // flush and load exclusive from the memory location
1660   // and fail if it is not what we expect
1661   ldrex(tmp, addr);
1662   cmp(tmp, oldv);
1663   b(nope, Assembler::NE);
1664   // if we store+flush with no intervening write, tmp will be zero
1665   strex(tmp, newv, addr);
1666   cmp(tmp, 0);
1667   b(succeed, Assembler::EQ);
1668   // retry so we only ever return after a load fails to compare
1669   // ensures we don't return a stale value after a failed write.
1670   b(retry_load);
1671   // if the memory word differs we return it in oldv and signal a fail
1672   bind(nope);
1673   membar(AnyAny);
1674   mov(oldv, tmp);
1675   if (fail)
1676     b(*fail);
1677 }
1678 
1679 void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
1680                                         Label &succeed, Label *fail) {
1681   assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
1682   cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
1683 }
1684 
1685 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
1686                                 Label &succeed, Label *fail) {
1687   // oldv holds comparison value
1688   // newv holds value to write in exchange
1689   // addr identifies memory word to compare against/update
1690   // tmp returns 0/1 for success/failure
1691   Label retry_load, nope;
1692 
1693   bind(retry_load);
1694   // flush and load exclusive from the memory location
1695   // and fail if it is not what we expect
1696   ldrex(tmp, addr);
1697   cmp(tmp, oldv);
1698   b(nope, Assembler::NE);
1699   // if we store+flush with no intervening write, tmp will be zero
1700   strex(tmp, newv, addr);
1701   cmp(tmp, 0);
1702   b(succeed, Assembler::EQ);
1703   // retry so we only ever return after a load fails to compare
1704   // ensures we don't return a stale value after a failed write.
1705   b(retry_load);
1706   // if the memory word differs we return it in oldv and signal a fail
1707   bind(nope);
1708   membar(AnyAny);
1709   mov(oldv, tmp);
1710   if (fail)
1711     b(*fail);
1712 }
1713 
1714 #ifndef PRODUCT
1715 extern "C" void findpc(intptr_t x);
1716 #endif
1717 
1718 void MacroAssembler::debug32(char* msg, int32_t pc, int32_t regs[])
1719 {
1720   print_unseen_bytecodes();
1721   // In order to get locks to work, we need to fake an in_VM state
1722   if (ShowMessageBoxOnError) {
1723     JavaThread* thread = JavaThread::current();
1724     JavaThreadState saved_state = thread->thread_state();
1725     thread->set_thread_state(_thread_in_vm);
1726 #ifndef PRODUCT
1727     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
1728       ttyLocker ttyl;
1729       BytecodeCounter::print();
1730     }
1731 #endif
1732     if (os::message_box(msg, "Execution stopped, print registers?")) {
1733       ttyLocker ttyl;
1734       tty->print_cr(" pc = 0x%016x", pc);
1735 #ifndef PRODUCT
1736       tty->cr();
1737       findpc(pc);
1738       tty->cr();
1739 #endif
1740       tty->print_cr("THIS IS WRONG!");
1741       tty->print_cr(" r0 = 0x%016x", regs[0]);
1742       tty->print_cr(" r1 = 0x%016x", regs[1]);
1743       tty->print_cr(" r2 = 0x%016x", regs[2]);
1744       tty->print_cr(" r3 = 0x%016x", regs[3]);
1745       tty->print_cr(" r4 = 0x%016x", regs[4]);
1746       tty->print_cr(" r5 = 0x%016x", regs[5]);
1747       tty->print_cr(" r6 = 0x%016x", regs[6]);
1748       tty->print_cr(" r7 = 0x%016x", regs[7]);
1749       tty->print_cr(" r8 = 0x%016x", regs[8]);
1750       tty->print_cr(" r9 = 0x%016x", regs[9]);
1751       tty->print_cr("r10 = 0x%016x", regs[10]);
1752       tty->print_cr("r11 = 0x%016x", regs[11]);
1753       tty->print_cr("r12 = 0x%016x", regs[12]);
1754       tty->print_cr("r13 = 0x%016x", regs[13]);
1755       tty->print_cr("r14 = 0x%016x", regs[14]);
1756       tty->print_cr("r15 = 0x%016x", regs[15]);
1757       BREAKPOINT;
1758     }
1759     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
1760   } else {
1761     {
1762     ttyLocker ttyl;
1763     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================", msg);
1764     ::tty->print_cr(" r0 [   arg0    ] = 0x%08x", regs[1]);
1765     ::tty->print_cr(" r1 [   arg1    ] = 0x%08x", regs[2]);
1766     ::tty->print_cr(" r2 [   arg2    ] = 0x%08x", regs[3]);
1767     ::tty->print_cr(" r3 [   arg3    ] = 0x%08x", regs[4]);
1768     ::tty->print_cr(" r4 [ rdispatch ] = 0x%08x", regs[5]);
1769     ::tty->print_cr(" r5 [   rbcp    ] = 0x%08x", regs[6]);
1770     ::tty->print_cr(" r6 [  rlocals  ] = 0x%08x", regs[7]);
1771     ::tty->print_cr(" r7 [  rcpool   ] = 0x%08x", regs[8]);
1772     ::tty->print_cr(" r8 [  rmethod  ] = 0x%08x", regs[9]);
1773     ::tty->print_cr(" r9 [ rscratch1 ] = 0x%08x", regs[10]);
1774     ::tty->print_cr("r10 [  rthread  ] = 0x%08x", regs[11]);
1775     ::tty->print_cr("r11 [    rfp    ] = 0x%08x", regs[12]);
1776     ::tty->print_cr("r12 [ rscratch2 ] = 0x%08x", regs[13]);
1777     ::tty->print_cr("r13 [    sp     ] = 0x%08x", regs[0]);
1778     ::tty->print_cr("r14 [    lr     ] = 0x%08x", regs[14]);
1779     ::tty->print_cr("r15 [    pc     ] = 0x%08x", pc);
1780     }
1781     assert(false, "DEBUG MESSAGE: %s", msg);
1782   }
1783 }
1784 
1785 void MacroAssembler::push_call_clobbered_registers() {
1786   push(RegSet::range(r0, r3), sp);
1787   if(hasFPU()) {
1788     const int nfloat = 16; // number of call-clobbered 32-bit float registers (s0-s15)
1789     vstmdb_f64(sp, (1 << nfloat/2) - 1);
1790   }
1791 }
1792 
1793 void MacroAssembler::pop_call_clobbered_registers() {
1794   if(hasFPU()) {
1795     const int nfloat = 16; // number of call-clobbered 32-bit float registers (s0-s15)
1796     vldmia_f64(sp, (1 << nfloat/2) - 1);
1797   }
1798   pop(RegSet::range(r0, r3), sp);
1799 }
1800 
1801 void MacroAssembler::push_CPU_state() {
1802   // If you change this, also update RegisterSaved::save_live_registers and its map
1803   push(0x5fff, sp); // integer registers except sp & (aarch32 pc)
1804 
1805   if(hasFPU()) {
1806     const int nfloat = FPUStateSizeInWords / 2; // saved by pairs
1807     vstmdb_f64(sp, (1 << nfloat) - 1);
1808   } else {
1809     sub(sp, sp, FPUStateSizeInWords * wordSize);
1810   }
1811 }
1812 
1813 void MacroAssembler::pop_CPU_state() {
1814   if(hasFPU()) {
1815     const int nfloat = FloatRegisterImpl::number_of_registers / 2;
1816     vldmia_f64(sp, (1 << nfloat) - 1);
1817   } else {
1818     add(sp, sp, FPUStateSizeInWords * wordSize);
1819   }
1820 
1821   pop(0x5fff, sp); // integer registers except sp & (aarch32 pc)
1822 }
1823 
1824 // Rounds reg up to the next multiple of modulus, which must be a power of two.
1825 void MacroAssembler::round_to(Register reg, int modulus) {
1826   // from x86
1827   add(reg, reg, modulus - 1);
1828   bic(reg, reg, modulus - 1); // and( reg, -modulus)
1829 }
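     // Worked example: round_to(reg, 8) computes (reg + 7) & ~7, so 13
     // becomes (13 + 7) & ~7 == 16, and an already-aligned 16 stays 16.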
1830 
1831 SkipIfEqual::SkipIfEqual(
1832     MacroAssembler* masm, const bool* flag_addr, bool value) {
1833   _masm = masm;
1834   _masm->mov(rscratch1, ExternalAddress((address)flag_addr));
1835   _masm->ldrb(rscratch1, rscratch1);
1836   _masm->cmp(rscratch1, 0);
1837   _masm->b(_label, value ? Assembler::NE : Assembler::EQ);
1838 }
1839 
1840 SkipIfEqual::~SkipIfEqual() {
1841   _masm->bind(_label);
1842 }
1843 
1844 void MacroAssembler::cmpptr(Register src1, Address src2) {
1845   mov(rscratch1, src2);
1846   ldr(rscratch1, Address(rscratch1));
1847   cmp(src1, rscratch1);
1848 }
1849 
1850 void MacroAssembler::cmpoop(Register obj1, Register obj2) {
1851   BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
1852   bs->obj_equals(this, obj1, obj2);
1853 }
1854 
1855 void MacroAssembler::load_klass(Register dst, Register src) {
1856   ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
1857 }
1858 
1859 // ((OopHandle)result).resolve();
1860 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
1861   // OopHandle::resolve is an indirection.
1862   access_load_word_at(T_OBJECT, IN_NATIVE, result, Address(result), tmp, noreg);
1863 }
1864 
1865 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) {
1866   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
1867   ldr(dst, Address(rmethod, Method::const_offset()));
1868   ldr(dst, Address(dst, ConstMethod::constants_offset()));
1869   ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes()));
1870   ldr(dst, Address(dst, mirror_offset));
1871   resolve_oop_handle(dst, tmp);
1872 }
1873 
1874 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
1875   ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
1876   cmp(trial_klass, tmp);
1877 }
1878 
1879 void MacroAssembler::load_prototype_header(Register dst, Register src) {
1880   load_klass(dst, src);
1881   ldr(dst, Address(dst, Klass::prototype_header_offset()));
1882 }
1883 
1884 void MacroAssembler::store_klass(Register dst, Register src) {
1885   str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
1886 }
1887 
1888 void MacroAssembler::store_klass_gap(Register dst, Register src) { }
1889 
1890 void MacroAssembler::access_load_word_at(BasicType type, DecoratorSet decorators,
1891                                          Register dst, Address src,
1892                                          Register tmp1, Register thread_tmp) {
1893   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1894   decorators = AccessInternal::decorator_fixup(decorators);
1895   bool as_raw = (decorators & AS_RAW) != 0;
1896   if (as_raw) {
1897     bs->BarrierSetAssembler::load_word_at(this, decorators, type, dst, src, tmp1, thread_tmp);
1898   } else {
1899     bs->load_word_at(this, decorators, type, dst, src, tmp1, thread_tmp);
1900   }
1901 }
1902 
1903 void MacroAssembler::access_store_word_at(BasicType type, DecoratorSet decorators,
1904                                           Address dst, Register src,
1905                                           Register tmp1, Register thread_tmp) {
1906   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1907   decorators = AccessInternal::decorator_fixup(decorators);
1908   bool as_raw = (decorators & AS_RAW) != 0;
1909   if (as_raw) {
1910     bs->BarrierSetAssembler::store_word_at(this, decorators, type, dst, src, tmp1, thread_tmp);
1911   } else {
1912     bs->store_word_at(this, decorators, type, dst, src, tmp1, thread_tmp);
1913   }
1914 }
1915 
1916 void MacroAssembler::access_load_tos_at(BasicType type, DecoratorSet decorators,
1917                                     Address src,
1918                                     Register tmp1, Register thread_tmp) {
1919   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1920   decorators = AccessInternal::decorator_fixup(decorators);
1921   bool as_raw = (decorators & AS_RAW) != 0;
1922   if (as_raw) {
1923     bs->BarrierSetAssembler::load_tos_at(this, decorators, type, src, tmp1, thread_tmp);
1924   } else {
1925     bs->load_tos_at(this, decorators, type, src, tmp1, thread_tmp);
1926   }
1927 }
1928 
1929 void MacroAssembler::access_store_tos_at(BasicType type, DecoratorSet decorators,
1930                                          Address dst,
1931                                          Register tmp1, Register thread_tmp) {
1932   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1933   decorators = AccessInternal::decorator_fixup(decorators);
1934   bool as_raw = (decorators & AS_RAW) != 0;
1935   if (as_raw) {
1936     bs->BarrierSetAssembler::store_tos_at(this, decorators, type, dst, tmp1, thread_tmp);
1937   } else {
1938     bs->store_tos_at(this, decorators, type, dst, tmp1, thread_tmp);
1939   }
1940 }
1941 
1942 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
1943                                    Register thread_tmp, DecoratorSet decorators) {
1944   access_load_word_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
1945 }
1946 
1947 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
1948                                             Register thread_tmp, DecoratorSet decorators) {
1949   access_load_word_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
1950 }
1951 
1952 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
1953                                     Register thread_tmp, DecoratorSet decorators) {
1954   access_store_word_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
1955 }
1956 
1957 // Used for storing NULLs.
1958 void MacroAssembler::store_heap_oop_null(Address dst, Register tmp) {
1959   access_store_word_at(T_OBJECT, IN_HEAP, dst, noreg, tmp, noreg);
1960 }
1961 
1962 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
1963   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
1964   int index = oop_recorder()->allocate_metadata_index(obj);
1965   RelocationHolder rspec = metadata_Relocation::spec(index);
1966   return Address((address)obj, rspec);
1967 }
1968 
1969 // Move an oop into a register.  immediate is true if we want
1970 // immediate instructions, i.e. we are not going to patch this
1971 // instruction while the code is being executed by another thread.  In
1972 // that case we can use move immediates rather than the constant pool.
1973 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
1974   int oop_index;
1975   if (obj == NULL) {
1976     oop_index = oop_recorder()->allocate_oop_index(obj);
1977   } else {
1978 #ifdef ASSERT
1979     {
1980       ThreadInVMfromUnknown tiv;
1981       assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
1982     }
1983 #endif
1984     oop_index = oop_recorder()->find_index(obj);
1985   }
1986   if (! immediate) {
1987     far_load_oop(dst, oop_index);
1988   } else {
1989     RelocationHolder rspec = oop_Relocation::spec(oop_index);
1990     mov(dst, Address((address)obj, rspec));
1991   }
1992 }
1993 
1994 // Move a metadata address into a register.
1995 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
1996   int oop_index;
1997   if (obj == NULL) {
1998     oop_index = oop_recorder()->allocate_metadata_index(obj);
1999   } else {
2000     oop_index = oop_recorder()->find_index(obj);
2001   }
2002   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
2003   mov(dst, Address((address)obj, rspec));
2004 }
2005 
2006 void MacroAssembler::far_load(Register dst, address addr) {
2007   address far_load_addr = pc();
2008   add(dst, r15_pc, 0);
2009   ldr(dst, Address(dst));
2010 
2011   NativeFarLdr* far_load = (NativeFarLdr*) far_load_addr;
2012   far_load->set_data_addr((intptr_t*) addr);
2013 }
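     // The pair emitted above is a pc-relative data load:
     //
     //   add dst, pc, #imm   ; dst = address of a data word (pc reads as .+8)
     //   ldr dst, [dst]      ; dst = the data word itself
     //
     // set_data_addr() is expected to fix up the addend once the final
     // distance between the code and the data word is known (see
     // nativeInst_aarch32 for the authoritative patching logic).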
2014 
2015 void MacroAssembler::far_load_oop(Register dst, int oop_index) {
2016     relocate(oop_Relocation::spec(oop_index));
2017     // no meaningful addr is available here, so pass far_load's own pc
2018     far_load(dst, pc());
2019 }
2020 
2021 void MacroAssembler::far_load_metadata(Register dst, int metadata_index) {
2022     relocate(metadata_Relocation::spec(metadata_index));
2023     // no meaningful addr is available here, so pass far_load's own pc
2024     far_load(dst, pc());
2025 }
2026 
2027 void MacroAssembler::far_load_const(Register dst, address const_addr) {
2028     relocate(section_word_Relocation::spec(const_addr, CodeBuffer::SECT_CONSTS));
2029     far_load(dst, const_addr);
2030 }
2031 
2032 Address MacroAssembler::constant_oop_address(jobject obj) {
2033 #ifdef ASSERT
2034   {
2035     ThreadInVMfromUnknown tiv;
2036     assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
2037     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
2038   }
2039 #endif
2040   int oop_index = oop_recorder()->find_index(obj);
2041   return Address((address)obj, oop_Relocation::spec(oop_index));
2042 }
2043 
2044 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
2045 void MacroAssembler::tlab_allocate(Register obj,
2046                                    Register var_size_in_bytes,
2047                                    int con_size_in_bytes,
2048                                    Register t1,
2049                                    Register t2,
2050                                    Label& slow_case) {
2051   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2052   bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
2053 }
2054 
2055 // Defines obj, preserves var_size_in_bytes
2056 void MacroAssembler::eden_allocate(Register obj,
2057                                    Register var_size_in_bytes,
2058                                    int con_size_in_bytes,
2059                                    Register t1,
2060                                    Label& slow_case) {
2061   BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2062   bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
2063 }
2064 
2065 // Zero words; len is in bytes
2066 // Destroys all registers except addr
2067 // len must be a nonzero multiple of wordSize
2068 void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
2069   assert_different_registers(addr, len, t1, rscratch1, rscratch2);
2070 
2071 #ifdef ASSERT
2072   { Label L;
2073     tst(len, BytesPerWord - 1);
2074     b(L, Assembler::EQ);
2075     stop("len is not a multiple of BytesPerWord");
2076     bind(L);
2077   }
2078 #endif
2079 
2080 #ifndef PRODUCT
2081   block_comment("zero memory");
2082 #endif
2083 
2084   Label loop;
2085   Label entry;
2086 
2087 //  Algorithm:
2088 //
2089 //    scratch1 = cnt & 7;
2090 //    cnt -= scratch1;
2091 //    p += scratch1;
2092 //    switch (scratch1) {
2093 //      do {
2094 //        cnt -= 8;
2095 //          p[-8] = 0;
2096 //        case 7:
2097 //          p[-7] = 0;
2098 //        case 6:
2099 //          p[-6] = 0;
2100 //          // ...
2101 //        case 1:
2102 //          p[-1] = 0;
2103 //        case 0:
2104 //          p += 8;
2105 //      } while (cnt);
2106 //    }
2107 
2108   const int unroll = 8; // Number of str instructions we'll unroll
2109 
2110   lsr(len, len, LogBytesPerWord);
2111   andr(rscratch1, len, unroll - 1);  // rscratch1 = cnt % unroll
2112   sub(len, len, rscratch1);      // cnt -= cnt % unroll
2113   // t1 always points to the end of the region we're about to zero
2114   add(t1, addr, rscratch1, lsl(LogBytesPerWord));
2115   adr(rscratch2, entry);
2116   sub(rscratch2, rscratch2, rscratch1, lsl(2));
2117   mov(rscratch1, 0);
2118   b(rscratch2);
2119   bind(loop);
2120   sub(len, len, unroll);
2121   for (int i = -unroll; i < 0; i++)
2122     str(rscratch1, Address(t1, i * wordSize));
2123   bind(entry);
2124   add(t1, t1, unroll * wordSize);
2125   cbnz(len, loop);
2126 }
2127 
2128 void MacroAssembler::verify_tlab() {
2129 #ifdef ASSERT
2130   if (UseTLAB && VerifyOops) {
2131     Label next, ok;
2132 
2133     strd(rscratch2, rscratch1, Address(pre(sp, -16)));
2134 
2135     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
2136     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
2137     cmp(rscratch2, rscratch1);
2138     b(next, Assembler::HS);
2139     STOP("assert(top >= start)");
2140     should_not_reach_here();
2141 
2142     bind(next);
2143     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
2144     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
2145     cmp(rscratch2, rscratch1);
2146     b(ok, Assembler::HS);
2147     STOP("assert(top <= end)");
2148     should_not_reach_here();
2149 
2150     bind(ok);
2151     ldrd(rscratch2, rscratch1, Address(post(sp, 16)));
2152   }
2153 #endif
2154 }
2155 
2156 // Writes to stack successive pages until offset reached to check for
2157 // stack overflow + shadow pages.  This clobbers tmp.
2158 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
2159   assert_different_registers(tmp, size, rscratch1);
2160   mov(tmp, sp);
2161   // Bang stack for total size given plus shadow page size.
2162   // Bang one page at a time because large size can bang beyond yellow and
2163   // red zones.
2164   Label loop;
2165   mov(rscratch1, os::vm_page_size());
2166   bind(loop);
2167   lea(tmp, Address(tmp, -os::vm_page_size()));
2168   subs(size, size, rscratch1);
2169   str(size, Address(tmp));
2170   b(loop, Assembler::GT);
2171 
2172   // Bang down shadow pages too.
2173   // At this point, (tmp-0) is the last address touched, so don't
2174   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
2175   // was post-decremented.)  Skip this address by starting at i=1, and
2176   // touch a few more pages below.  N.B.  It is important to touch all
2177   // the way down to and including i=StackShadowPages.
2178   for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) {
2179     // This could be any sized move, but since it can serve as a
2180     // debugging crumb, the bigger the better.
2181     lea(tmp, Address(tmp, -os::vm_page_size()));
2182     str(size, Address(tmp));
2183   }
2184 }
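     // Hedged C-style sketch of the banging scheme above, with page ==
     // os::vm_page_size(); each store either succeeds or faults into the
     // guard zone, which is the point:
     //
     //   char* p = (char*)sp;
     //   do {
     //     p -= page; size -= page;
     //     *(intptr_t*)p = size;           // bang one page
     //   } while (size > 0);
     //   for (int i = 1; i < StackShadowPages; i++) {
     //     p -= page;
     //     *(intptr_t*)p = size;           // then the shadow pages
     //   }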
2185 
2186 
2187 // Move the address of the polling page into dest.
2188 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) {
2189   if (SafepointMechanism::uses_thread_local_poll()) {
2190     ldr(dest, Address(rthread, Thread::polling_page_offset()));
2191   } else {
2192     mov(dest, Address(page, rtype));
2193   }
2194 }
2195 
2196 // Move the address of the polling page into r, then read the polling
2197 // page.
2198 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
2199   get_polling_page(r, page, rtype);
2200   return read_polling_page(r, rtype);
2201 }
2202 
2203 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
2204   InstructionMark im(this);
2205   code_section()->relocate(inst_mark(), rtype);
2206   // It's ok to load to reg from reg + off (without write-back)
2207   ldr(r, Address(r, 0));
2208   return inst_mark();
2209 }
2210 
2211 // Helper functions for 64-bit multiplication, division and remainder
2212 // does <Rd+1:Rd> = <Rn+1:Rn> * <Rm+1:Rm>
2213 void MacroAssembler::mult_long(Register Rd, Register Rn, Register Rm) {
2214   Register Rdh = (Register)(Rd->encoding_nocheck() + 1);
2215   Register Rnh = (Register)(Rn->encoding_nocheck() + 1);
2216   Register Rmh = (Register)(Rm->encoding_nocheck() + 1);
2217 
2218   mult_long(Rd, Rdh, Rn, Rnh, Rm, Rmh);
2219 }
2220 
2221 // does <Rdh:Rd> = <Rnh:Rn> * <Rmh:Rm>
2222 void MacroAssembler::mult_long(Register Rd, Register Rdh, Register Rn, Register Rnh, Register Rm, Register Rmh) {
2223   assert_different_registers(Rn, Rnh);
2224   assert_different_registers(Rm, Rmh);
2225   assert_different_registers(Rd, Rdh); // umull restriction
2226   const Register t = rscratch1;
2227 
2228   mul(t, Rm, Rnh);
2229   mla(t, Rn, Rmh, t);
2230   umull(Rd, Rdh, Rm, Rn);
2231   add(Rdh, t, Rdh);
2232 }
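     // Derivation: splitting each operand into 32-bit halves,
     //
     //   <Rnh:Rn> * <Rmh:Rm> = Rn*Rm + (Rn*Rmh + Rnh*Rm) * 2^32
     //                                + Rnh*Rmh * 2^64
     //                       = Rn*Rm + ((Rn*Rmh + Rnh*Rm) << 32)  (mod 2^64)
     //
     // umull supplies the full 64-bit Rn*Rm; the mul/mla pair computes the
     // two cross terms, which only affect the high word, hence the final
     // add into Rdh.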
2233 
2234 
2235 int64_t internal_ldiv(int64_t a, int64_t b) {
2236   return a / b;
2237 }
2238 
2239 int64_t internal_lmod(int64_t a, int64_t b) {
2240   return a % b;
2241 }
2242 
2243 void MacroAssembler::divide32(Register res, Register num, Register den, bool want_mod) {
2244     Register cnt = rscratch1;
2245     Register mod = rscratch2;
2246     Register sign = r14;
2247     assert_different_registers(num, den, rscratch1, rscratch2, r14);
2248 
2249     // FIXME This works by first converting any negative values to positive ones,
2250     // but |INT_MIN| is not representable as a positive 32-bit int. Need to fix this.
2251 
2252     //Convert to positive values
2253     mov(sign, 0);
2254 
2255     cmp(num, 0);
2256     mov(sign, 1, MI);
2257     rsb(num, num, 0, MI);
2258 
2259     cmp(den, 0);
2260     if(!want_mod) eor(sign, sign, 1, MI);
2261     rsb(den, den, 0, MI);
2262 
2263     // Algorithm from
2264     // http://www.chiark.greenend.org.uk/~theom/riscos/docs/ultimate/a252div.txt
2265     // Graeme Williams
2266     mov(cnt, 28);
2267     mov(mod, num, lsr(4));
2268     cmp(den, mod, lsr(12));
2269     sub(cnt, cnt, 16, Assembler::LE);
2270     mov(mod, mod, lsr(16), Assembler::LE);
2271     cmp(den, mod, lsr(4));
2272     sub(cnt, cnt, 8, Assembler::LE);
2273     mov(mod, mod, lsr(8), Assembler::LE);
2274     cmp(den, mod);
2275     sub(cnt, cnt, 4, Assembler::LE);
2276     mov(mod, mod, lsr(4), Assembler::LE);
2277     mov(num, num, lsl(cnt));
2278     rsb(den, den, 0);
2279 
2280     adds(num, num, num);
2281     //Now skip over cnt copies of the 3 instr. loop.
2282     add(cnt, cnt, cnt, lsl(1));
2283     add(r15_pc, r15_pc, cnt, lsl(2));
2284     mov(r0, r0);
2285 
2286     for(int i = 0; i < 32; i++) {
2287         adcs(mod, den, mod, lsl(1));
2288         sub(mod, mod, den, Assembler::LO);
2289         adcs(num, num, num);
2290     }
2291 
2292     cmp(sign, 0);
2293     rsb(res, want_mod? mod : num, 0, NE);
2294     mov(res, want_mod? mod : num, EQ);
2295 }
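     // Hedged C-style sketch of the classic restoring division the loop
     // above implements, once both operands are positive and num has been
     // pre-shifted by cnt (the adcs/sub pairs fuse the compare-and-subtract,
     // with den negated; the add to r15_pc branches past iterations that
     // cannot produce quotient bits, and mov(r0, r0) is just a nop pad for
     // that pc offset):
     //
     //   uint32_t mod = 0, quo = num;
     //   for (int i = 0; i < 32; i++) {
     //     mod = (mod << 1) | (quo >> 31);  // bring down the next bit
     //     quo <<= 1;
     //     if (mod >= den) { mod -= den; quo |= 1; }
     //   }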
2296 
2297 
2298 // <Rd+1:Rd> = <Rn+1:Rn> / <Rm+1:Rm>
2299 // <Rd+1:Rd> = <Rn+1:Rn> % <Rm+1:Rm>
2300 // <Rd> = <Rn> / <Rm>
2301 // <Rd> = <Rn> % <Rm>
2302 void MacroAssembler::divide(Register Rd, Register Rn, Register Rm, int width, bool want_remainder) {
2303   //Dispatch to best possible
2304   Register Rdh = (Register)(Rd->encoding_nocheck() + 1);
2305   Register Rnh = (Register)(Rn->encoding_nocheck() + 1);
2306   Register Rmh = (Register)(Rm->encoding_nocheck() + 1);
2307 
2308   assert(32 == width || 64 == width, "Invalid width");
2309   bool is64b = 64 == width;
2310 
2311   if(is64b) {
2312     assert_different_registers(Rn, Rnh, Rm, Rmh, rscratch1, rscratch2);
2313   }
2314 
2315   if(!is64b && VM_Version::features() & FT_HW_DIVIDE) {
2316     // Emit a hw instruction sequence.
2317     if(want_remainder) {
2318       sdiv(rscratch1, Rn, Rm);
2319       mls(Rd, rscratch1, Rm, Rn);
2320     } else {
2321       sdiv(Rd, Rn, Rm);
2322     }
2323   } else if(!is64b) {
2324     // Fall back to assembly software routine
2325     divide32(Rd, Rn, Rm, want_remainder);
2326   } else {
2327     // Fall back to C software routine for
2328     // 64 bit divide/mod
2329     if(Rn != r0) {
2330       mov(rscratch1, Rm);
2331       mov(rscratch2, Rmh);
2332 
2333       mov(r0, Rn);
2334       mov(r1, Rnh);
2335 
2336       mov(r2, rscratch1);
2337       mov(r3, rscratch2);
2338     } else if(Rm != r2) {
2339       mov(r2, Rm);
2340       mov(r3, Rmh);
2341     }
2342     address function;
2343     if(want_remainder) function = (address)internal_lmod;
2344     else               function = (address)internal_ldiv;
2345 
2346     mov(rscratch1, function);
2347     bl(rscratch1);
2348     if(Rd != r0) {
2349       mov(Rd, r0);
2350       if(is64b) mov(Rdh, r1);
2351     }
2352   }
2353 }
2354 
2355 void MacroAssembler::extract_bits(Register dest, Register source, int lsb, int width) {
2356   assert(lsb >= 0 && lsb + width <= 32 && width != 0, "Invalid lsb/width");
2357   // Dispatch to the best sequence
2358   if(0 == (lsb & 7) && (width == 8 || width == 16 || width == 32)) {
2359     // Can use extend X
2360     switch(width){
2361       case 8:  uxtb(dest, source, ror(lsb)); break;
2362       case 16: uxth(dest, source, ror(lsb)); break;
2363       default: mov(dest, source);            break; // width == 32, hence lsb == 0
2364     }
2365   } else if(VM_Version::features() & (FT_ARMV7 | FT_ARMV6T2)) {
2366     ubfx(dest, source, lsb, width);
2367   } else {
2368     // Do two shifts
2369     lsl(dest, source, 32 - (width + lsb));
2370     lsr(dest, dest, 32 - width);
2371   }
2372 }
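     // Worked example for the two-shift fallback: extract_bits(dst, src, 3, 5)
     // emits lsl #24 then lsr #27, i.e.
     // dst = (src << (32 - (5 + 3))) >> (32 - 5) == (src >> 3) & 0x1f.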
2373 
2374 
2375 void MacroAssembler::atomic_ldrd(Register Rt, Register Rt2, Register Rbase) {
2376   assert(Rt->encoding_nocheck() % 2 == 0, "Must be an even register");
2377   assert((Register) (Rt + 1) == Rt2, "Must be contiguous");
2378   if(VM_Version::features() & FT_SINGLE_CORE) {
2379     ldrd(Rt, Rbase);
2380   } else if (VM_Version::features() & (FT_ARMV7 | FT_ARMV6K)) {
2381 #ifdef ASSERT
2382     Label lbl;
2383     tst(Rbase, 7);
2384     b(lbl, EQ);
2385     stop("atomic_ldrd is not doubleword aligned!");
2386     bind(lbl);
2387 #endif // ASSERT
2388 
2389     ldrexd(Rt, Rbase);
2390   } else {
2391     // TODO: Find Java way of logging
2392     static bool warning_printed = false;
2393     if(!warning_printed) {
2394       fprintf(stderr, "Unable to provide atomic doubleword load.\n");
2395       warning_printed = true;
2396     }
2397     ldrd(Rt, Rbase);
2398   }
2399 }
2400 
2401 void MacroAssembler::atomic_strd(Register Rt, Register Rt2, Register Rbase,
2402                                  Register temp, Register temp2) {
2403   assert(Rt->encoding_nocheck() % 2 == 0, "Must be an even register");
2404   assert((Register) (Rt + 1) == Rt2, "Must be contiguous");
2405   assert((Register) (temp + 1) == temp2, "Must be contiguous");
2406   assert_different_registers(temp, Rt, Rbase, temp2);
2407   if(VM_Version::features() & FT_SINGLE_CORE) {
2408     strd(Rt, Rbase);
2409   } else if (VM_Version::features() & (FT_ARMV7 | FT_ARMV6K)) {
2410     // First need to gain exclusive access
2411     Label retry;
2412 
2413 #ifdef ASSERT
2414     tst(Rbase, 7);
2415     b(retry, EQ);
2416     stop("atomic_strd is not doubleword aligned!");
2417 #endif // ASSERT
2418 
2419     bind(retry);
2420     ldrexd(temp, Rbase);
2421     strexd(temp, Rt, Rbase);
2422     cmp(temp, 0);
2423     b(retry, NE);
2424   } else {
2425     // TODO: Find Java way of logging
2426     static bool warning_printed = false;
2427     if(!warning_printed) {
2428       fprintf(stderr, "Unable to provide atomic doubleword store.\n");
2429       warning_printed = true;
2430     }
2431     strd(Rt, Rbase);
2432   }
2433 }
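     // Note on the ldrexd/strexd pairing above: strexd can only succeed
     // while this core holds the exclusive monitor, which the preceding
     // ldrexd claims; the value it loads is deliberately discarded into temp.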
2434 
2435 
2436 #define ENABLE_DEBUGGING 0
2437 // Helloworld is 2,482,397
2438 uint32_t MacroAssembler::bytecodes_until_print = 2400000; //13795328; //6888000L; //6881772L;
2439 
2440 uint32_t MacroAssembler::bytecodes_executed = 0;
2441 
2442 int MacroAssembler::enable_debug = 0;
2443 int MacroAssembler::enable_method_debug = 0;
2444 int MacroAssembler::enable_debugging_static = ENABLE_DEBUGGING;
2445 
2446 #define N_J_BYTECODES 238
2447 const char* j_bytecodes[N_J_BYTECODES] = {"nop", "aconstnull", "iconstm1", "iconst0", "iconst1", "iconst2", "iconst3", "iconst4", "iconst5", "lconst0",
2448 "lconst1", "fconst0", "fconst1", "fconst2", "dconst0", "dconst1", "bipush", "sipush", "ldc", "ldcw", "ldc2w",
2449 "iload", "lload", "fload", "dload", "aload", "iload0", "iload1", "iload2", "iload3", "lload0", "lload1", "lload2",
2450 "lload3", "fload0", "fload1", "fload2", "fload3", "dload0", "dload1", "dload2", "dload3", "aload0", "aload1", "aload2",
2451 "aload3", "iaload", "laload", "faload", "daload", "aaload", "baload", "caload", "saload", "istore", "lstore", "fstore",
2452 "dstore", "astore", "istore0", "istore1", "istore2", "istore3", "lstore0", "lstore1", "lstore2", "lstore3", "fstore0",
2453 "fstore1", "fstore2", "fstore3", "dstore0", "dstore1", "dstore2", "dstore3", "astore0", "astore1", "astore2", "astore3",
2454 "iastore", "lastore", "fastore", "dastore", "aastore", "bastore", "castore", "sastore", "pop", "pop2", "dup", "dupx1",
2455 "dupx2", "dup2", "dup2x1", "dup2x2", "swap", "iadd", "ladd", "fadd", "dadd", "isub", "lsub", "fsub", "dsub", "imul",
2456 "lmul", "fmul", "dmul", "idiv", "ldiv", "fdiv", "ddiv", "irem", "lrem", "frem", "drem", "ineg", "lneg", "fneg", "dneg",
2457 "ishl", "lshl", "ishr", "lshr", "iushr", "lushr", "iand", "land", "ior", "lor", "ixor", "lxor", "iinc", "i2l", "i2f",
2458 "i2d", "l2i", "l2f", "l2d", "f2i", "f2l", "f2d", "d2i", "d2l", "d2f", "i2b", "i2c", "i2s", "lcmp", "fcmpl", "fcmpg",
2459 "dcmpl", "dcmpg", "ifeq", "ifne", "iflt", "ifge", "ifgt", "ifle", "ificmpeq", "ificmpne", "ificmplt", "ificmpge",
2460 "ificmpgt", "ificmple", "ifacmpeq", "ifacmpne", "goto", "jsr", "ret", "tableswitch", "lookupswitch", "ireturn",
2461 "lreturn", "freturn", "dreturn", "areturn", "return", "getstatic", "putstatic", "getfield", "putfield",
2462 "invokevirtual", "invokespecial", "invokestatic", "invokeinterface", "invokedynamic", "new", "newarray",
2463 "anewarray", "arraylength", "athrow", "checkcast", "instanceof", "monitorenter", "monitorexit", "wide",
2464 "multianewarray", "ifnull", "ifnonnull", "gotow", "jsrw", "breakpoint", "fast_agetfield", "fast_bgetfield",
2465 "fast_cgetfield", "fast_dgetfield", "fast_fgetfield", "fast_igetfield", "fast_lgetfield", "fast_sgetfield",
2466 "fast_aputfield", "fast_bputfield", "fast_cputfield", "fast_dputfield", "fast_fputfield", "fast_iputfield",
2467 "fast_lputfield", "fast_sputfield", "fast_aload_0", "fast_iaccess_0", "fast_aaccess_0", "fast_faccess_0",
2468 "fast_iload", "fast_iload2", "fast_icaload", "fast_invokevfinal", "fast_linearswitch", "fast_binaryswitch",
2469 "fast_aldc", "fast_aldc_w", "return_register_finalizer", "invokehandle", "nofast_getfield", "nofast_putfield",
2470 "nofast_aload_0", "nofast_iload", "INVALID"};
2471 
2472 int bytecodes_seen[256];
2473 
2474 void MacroAssembler::init_unseen_bytecodes() {
2475   for(int i = 0; i < 256; i++ ) {
2476     bytecodes_seen[i] = 0;
2477   }
2478 }
2479 
2480 void MacroAssembler::bytecode_seen(Register bc_reg, Register scratch) {
2481   if(ENABLE_DEBUGGING) {
2482     mov(scratch, (address)bytecodes_seen);
2483     add(scratch, scratch, bc_reg, lsl(2));
2484     add(bc_reg, bc_reg, 1);
2485     str(bc_reg, Address(scratch));
2486     sub(bc_reg, bc_reg, 1);
2487   }
2488 }
2489 
2490 void MacroAssembler::print_unseen_bytecodes() {
2491   if(ENABLE_DEBUGGING) {
2492     printf("=== Unseen bytecodes ===\n");
2493     for(int i = 0; i < N_J_BYTECODES; i++) {
2494       if(0 == bytecodes_seen[i]) {
2495         printf("\t%s\n", j_bytecodes[i]);
2496       }
2497     }
2498     printf("=== End unseen ===\n");
2499   } else {
2500     printf("Not tracked; enable debugging to view this info\n");
2501   }
2502   fflush(stdout);
2503 }
2504 
2505 int machine_state_regset = 0b0101111111111111;
2506 int machine_state_float_regset = 0b11;
2507 
2508 void MacroAssembler::save_machine_state() {
2509     stmdb(sp, machine_state_regset);
2510     if(hasFPU()) {
2511         vstmdb_f64(sp, machine_state_float_regset);
2512     }
2513     enter();
2514 }
2515 
2516 void MacroAssembler::restore_machine_state() {
2517     leave();
2518     if(hasFPU()) {
2519         vldmia_f64(sp, machine_state_float_regset);
2520     }
2521     ldmia(sp, machine_state_regset);
2522 }
2523 
2524 void internal_internal_printf(const char *fmt, ...) {
2525   va_list args;
2526   va_start (args, fmt);
2527   vprintf (fmt, args);
2528   fflush(stdout);
2529   va_end(args);
2530 }
2531 
2532 void internal_printf(const char *format, uint32_t a, uint32_t b, uint32_t c) {
2533   char buf[2048];
2534   char fmt[2048];
2535   buf[0] = '\0';
2536   const char *thread_str = "THREAD 0x%08x : ";
2537   int id = pthread_self();
2538   strcpy(fmt, format);
2539 
2540   char *str = strtok(fmt, "\n");
2542   while(str) {
2543     strcpy(buf, thread_str);
2544     strcat(buf, str);
2545     strcat(buf, "\n");
2546     internal_internal_printf((const char*)buf, id, a, b, c);
2547     str = strtok(NULL, "\n");
2548   }
2549 }
2550 
2551 void MacroAssembler::get_bytecode(Register dst, Register bc) {
2552   if(ENABLE_DEBUGGING) {
2553     int nbytecodes = N_J_BYTECODES;
2554     mov(dst, (address)j_bytecodes);
2555     cmp(bc, nbytecodes);
2556 
2557     ldr(dst, Address(dst, bc, lsl(2)), Assembler::LT);
2558     ldr(dst, Address(dst, wordSize * nbytecodes), Assembler::GE);
2559   }
2560 }
2561 
2562 int invocation_depth_count = -1; //TODO remove this with debugging info
2563 
2564 #define MAX_FCALL_DEPTH 4096
2565 struct thread_method_record{
2566   int thread_id;
2567   char names[MAX_FCALL_DEPTH][512];
2568   int invocation_depth_count;
2569 };
2570 int ntmrs = 0;
2571 #define MAX_TMRS 10
2572 thread_method_record tmr_list[MAX_TMRS];
2573 
2574 void push_tmr(Method *meth, int *thread_id, int *invocation_depth_count, char **name) {
2575   int id = pthread_self();
2576   *thread_id = id;
2577   for(int i = 0; i < ntmrs; i++) {
2578     thread_method_record *tmr = &tmr_list[i];
2579     if(id == tmr->thread_id) {
2580       // Add a new frame
2581       if(tmr->invocation_depth_count >= -1 &&
2582         tmr->invocation_depth_count < (MAX_FCALL_DEPTH - 1)) {
2583         *invocation_depth_count = ++(tmr->invocation_depth_count);
2584         *name = tmr->names[tmr->invocation_depth_count];
2585         meth->name_and_sig_as_C_string(tmr->names[tmr->invocation_depth_count], 512);
2586         return;
2587       } else {
2588         fprintf(stderr, "%s : Invalid fcall depth index, %d\n", __FUNCTION__, tmr->invocation_depth_count);
2589         exit(1);
2590       }
2591     }
2592   }
2593   // Add a new thread
2594   if(ntmrs >= MAX_TMRS) {
2595     fprintf(stderr, "Too many tmrs\n");
2596     exit(1);
2597   }
2598   //Create a new tmr
2599   tmr_list[ntmrs].thread_id = id;
2600   tmr_list[ntmrs].invocation_depth_count = 0;
2601   meth->name_and_sig_as_C_string(tmr_list[ntmrs].names[0], 512);
2602   *invocation_depth_count = 0;
2603   *name = tmr_list[ntmrs].names[0];
2604   ntmrs++;
2605 }
2606 
2607 void pop_tmr(int *thread_id, int *invocation_depth_count, char **name) {
2608   int id = pthread_self();
2609   *thread_id = id;
2610   for(int i = 0; i < ntmrs; i++) {
2611     thread_method_record *tmr = &tmr_list[i];
2612     if(id == tmr->thread_id) {
2613       if(tmr->invocation_depth_count >= 0 &&
2614         tmr->invocation_depth_count < MAX_FCALL_DEPTH) {
2615         // Pop frame
2616         *name = tmr->names[tmr->invocation_depth_count];
2617         *invocation_depth_count = (tmr->invocation_depth_count)--;
2618         return;
2619       } else if ( -1 == tmr->invocation_depth_count) {
2620         *name = (char*)"JVM-EXCEPTION-EXIT:(NOT-REALLY-A-FRAME)";
2621         *invocation_depth_count = 0;
2622         return;
2623       } else {
2624         fprintf(stderr, "%s : Invalid fcall depth index, %d\n", __FUNCTION__, tmr->invocation_depth_count);
2625         exit(1);
2626       }
2627     }
2628   }
2629   fprintf(stderr, "Unable to find suitable tmr\n");
2630   exit(1);
2631 }
2632 
2633 void prepare_entry_exit_prefix(char *buf, int id, int invocation_depth_count) {
2634   sprintf(buf, "THREAD 0x%08x : ", id);
2635   for(int i = 0; i < invocation_depth_count; i++) {
2636     strcat(buf, "  ");
2637   }
2638 }
2639 
2640 
2641 void print_entry(Method *meth, int native) {
2642   char *name;
2643   int invocation_depth_count, id;
2644   push_tmr(meth, &id, &invocation_depth_count, &name);
2645 
2646   if(MacroAssembler::enable_method_debug) {
2647     char buf[4096], buf_b[2048];
2648     prepare_entry_exit_prefix(buf, id, invocation_depth_count);
2649     if(native) {
2650       sprintf(buf_b, "CALL NATIVE : %s\n", name);
2651     } else {
2652       sprintf(buf_b, "CALL JAVA   : %s\n", name);
2653     }
2654     strcat(buf, buf_b);
2655     printf("%s", buf);
2656     fflush(stdout);
2657   }
2658 }
2659 
2660 void print_exit(bool normal) {
2661   char *name;
2662   int invocation_depth_count, id;
2663   pop_tmr(&id, &invocation_depth_count, &name);
2664 
2665   if(MacroAssembler::enable_method_debug) {
2666     char buf[4096], buf_b[2048];
2667     prepare_entry_exit_prefix(buf, id, invocation_depth_count);
2668     sprintf(buf_b, normal ? "EXIT        : %s\n" : "EXCPN EXIT  : %s\n", name);
2669     strcat(buf, buf_b);
2670     printf("%s", buf);
2671     fflush(stdout);
2672   }
2673 }
2674 
2675 void MacroAssembler::print_method_entry(Register rmethod, bool native) {
2676   if(ENABLE_DEBUGGING) {
2677     save_machine_state();
2678 
2679     bic(sp, sp, 7); // 8-byte align stack
2680     mov(rscratch2, (address)print_entry);
2681     mov(r0, rmethod);
2682     mov(r1, native);
2683     bl(rscratch2);
2684 
2685     restore_machine_state();
2686   }
2687 }
2688 
2689 void MacroAssembler::print_method_exit(bool normal) {
2690   if(ENABLE_DEBUGGING) {
2691     save_machine_state();
2692 
2693     bic(sp, sp, 7); // 8-byte align stack
2694     mov(rscratch2, (address)print_exit);
2695     mov(r0, normal);
2696     bl(rscratch2);
2697 
2698     restore_machine_state();
2699   }
2700 }
2701 
2702 void MacroAssembler::reg_printf_internal(bool important, const char *fmt, Register ra, Register rb, Register rc) {
2703   if(ENABLE_DEBUGGING) {
2704     Label skip;
2705     save_machine_state();
2706 
2707         mov(rscratch1, ra);
2708         str(rscratch1, Address(pre(sp, -wordSize)));
2709         mov(rscratch1, rb);
2710         str(rscratch1, Address(pre(sp, -wordSize)));
2711         mov(rscratch1, rc);
2712         str(rscratch1, Address(pre(sp, -wordSize)));
2713 
2714         if(!important) {
2715             mov(r0, (address)&enable_debug);
2716             ldr(r0, Address(r0));
2717             cmp(r0, 0);
2718             b(skip, Assembler::EQ);
2719         }
2720 
2721         int sp_difference = wordSize * (count_bits(machine_state_regset) +
2722                                         2 * count_bits(machine_state_float_regset) +
2723                                         2 + 3); // enter() frame (2 words) plus the 3 words saved above
2724 
2725         mov(r0, (address)fmt);
2726         if(ra != sp) ldr(r1, Address(sp, 2 * wordSize));
2727         else         add(r1, sp, sp_difference);
2728 
2729         if(rb != sp) ldr(r2, Address(sp, wordSize));
2730         else         add(r2, sp, sp_difference);
2731 
2732         if(rc != sp) ldr(r3, Address(sp));
2733         else         add(r3, sp, sp_difference);
2734 
2735         bic(sp, sp, 7); // 8-byte align stack
2736 
2737         mov(rscratch2, (address)internal_printf);
2738         bl(rscratch2);
2739 
2740         bind(skip);
2741         restore_machine_state();
2742     }
2743 }
2744 
2745 void MacroAssembler::reg_printf(const char *fmt, Register ra, Register rb, Register rc) {
2746     reg_printf_internal(false, fmt, ra, rb, rc);
2747 }
2748 
2749 void MacroAssembler::reg_printf_important(const char *fmt, Register ra, Register rb, Register rc) {
2750   reg_printf_internal(true, fmt, ra, rb, rc);
2751 }
2752 
2753 // When debugging, set the break on bkpnt
2754 void bkpnt() { return; }
2755 void MacroAssembler::create_breakpoint() {
2756     if(ENABLE_DEBUGGING) {
2757         save_machine_state();
2758         bic(sp, sp, 7); // 8-byte align stack
2759 
2760         mov(rscratch2, (address) bkpnt);
2761         bl(rscratch2);
2762 
2763         restore_machine_state();
2764     }
2765 }
2766 
2767 
2768 void MacroAssembler::print_cpool(InstanceKlass *klass) {
2769     ttyLocker ttyl;
2770     klass->constants()->print_on(tty);
2771 }
2772 
2773 int MacroAssembler::ldrd(Register Rt, Register Rt2, const Address& adr, Register Rtmp, Condition cond) {
2774     if((0 == Rt->encoding_nocheck() % 2 &&
2775          (Rt->encoding_nocheck() + 1 == Rt2->encoding_nocheck())) &&
2776       (uabs(adr.offset()) < (1 << 8))) {
2777       /* Good to go with a ldrd */
2778       ldrd(Rt, adr, cond);
2779       return 0x0;
2780     } else {
2781       return double_ld_failed_dispatch(Rt, Rt2, adr, &Assembler::ldm,
2782                                 &Assembler::ldr, Rtmp, cond);
2783     }
2784 }
2785 
2786 int MacroAssembler::strd(Register Rt, Register Rt2, const Address& adr, Condition cond) {
2787     if((0 == Rt->encoding_nocheck() % 2 &&
2788          (Rt->encoding_nocheck() + 1 == Rt2->encoding_nocheck())) &&
2789       (uabs(adr.offset()) < (1 << 8))) {
2790       /* Good to go with a strd */
2791       strd(Rt, adr, cond);
2792     } else {
2793       double_ldst_failed_dispatch(Rt, Rt2, adr, &Assembler::stm, &Assembler::str, cond);
2794     }
2795     return 0x0;
2796 }
2797 
2798 int MacroAssembler::double_ld_failed_dispatch(Register Rt, Register Rt2, const Address& adr,
2799         void (Assembler::* mul)(unsigned, const Address&, Condition),
2800         void (Assembler::* sgl)(Register, const Address&, Condition),
2801         Register Rtmp, Condition cond) {
2802   if (can_ldst_multiple(RegSet::of(Rt, Rt2).bits(), adr) &&
2803           (Rt->encoding_nocheck() < Rt2->encoding_nocheck())) {
2804     /* Do a load or store multiple instruction */
2805     (this->*mul)(RegSet::of(Rt, Rt2).bits(), adr, cond);
2806   } else if (!adr.uses(Rt)) {
2807     double_ldst_failed_dispatch(Rt, Rt2, adr, mul, sgl, cond);
2808   } else {
2809     // need to reshuffle operation, otherwise write to Rt destroys adr
2810     if (adr.get_mode() != Address::reg) {
2811       // offset-based addressing, hence adr cannot use Rt2
2812       if (adr.get_wb_mode() == Address::pre) {
2813         (this->*sgl)(Rt2, Address(pre(adr.base(), adr.offset() + wordSize)), cond);
2814         (this->*sgl)(Rt, Address(pre(adr.base(), -wordSize)), cond);
2815       } else if (adr.get_wb_mode() == Address::post) {
2816         (this->*sgl)(Rt2, Address(adr.base(), adr.offset() + wordSize), cond);
2817         (this->*sgl)(Rt, adr, cond);
2818       } else if (adr.get_wb_mode() == Address::off) {
2819         (this->*sgl)(Rt2, Address(adr.base(), adr.offset() + wordSize), cond);
2820         (this->*sgl)(Rt, adr, cond);
2821       } else {
2822         ShouldNotReachHere();
2823       }
2824     } else {
2825       // index-based addressing. both Rt and Rt2 could be used by adr
2826       // hence temp register is necessary
2827       adr.lea(this, Rtmp);
2828       double_ldst_failed_dispatch(Rt, Rt2, Address(Rtmp), mul, sgl, cond);
      // adr.lea performs only address arithmetic and cannot trap.
      // The first instruction that can raise an NPE is inside
      // double_ldst_failed_dispatch, so shift the reported offset accordingly.
2832       return 0x4;
2833     }
2834   }
2835   return 0x0;
2836 }
2837 
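// Illustrative sketch of the reshuffled pre-indexed case above (assuming
// Rt == r2 is also the base register and Rt2 == r3):
//   ldrd r2, r3, [r2, #8]!    // desired effect
// becomes
//   ldr  r3, [r2, #12]!       // Rt2 first: base + offset + 4, with writeback
//   ldr  r2, [r2, #-4]!       // then Rt: rewinds to base + offset; the base
//                             // register is only clobbered by the last load
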
2838 void MacroAssembler::double_ldst_failed_dispatch(Register Rt, Register Rt2, const Address& adr,
2839         void (Assembler::* mul)(unsigned, const Address&, Condition),
2840         void (Assembler::* sgl)(Register, const Address&, Condition),
2841         Condition cond) {
2842   if (can_ldst_multiple(RegSet::of(Rt, Rt2).bits(), adr) &&
2843           (Rt->encoding_nocheck() < Rt2->encoding_nocheck())) {
    /* Do a load or store multiple instruction */
2845     (this->*mul)(RegSet::of(Rt, Rt2).bits(), adr, cond);
2846   } else {
2847     if (adr.get_mode() != Address::reg) {
2848       // offset-based addressing
2849       if (adr.get_wb_mode() == Address::pre) {
2850         (this->*sgl)(Rt, adr, cond);
2851         (this->*sgl)(Rt2, Address(adr.base(), wordSize), cond);
2852       } else if (adr.get_wb_mode() == Address::post) {
2853         (this->*sgl)(Rt, adr, cond);
2854         (this->*sgl)(Rt2, Address(adr.base(), wordSize - adr.offset()), cond);
2855       } else if (adr.get_wb_mode() == Address::off) {
2856         (this->*sgl)(Rt, adr, cond);
2857         (this->*sgl)(Rt2, Address(adr.base(), adr.offset() + wordSize), cond);
2858       } else {
2859         ShouldNotReachHere();
2860       }
2861     } else {
2862       // index-based addressing
2863       if (adr.get_wb_mode() == Address::pre) {
2864         // current implementation does not use Address::pre for indexed access
2865         ShouldNotReachHere();
2866       } else if (adr.get_wb_mode() == Address::post) {
        // current implementation does not use Address::post for indexed access
2868         // enable the code below and implement proper post() method if it is required
2869 #if 0
2870         (this->*sgl)(Rt, Address(post(adr.base(), wordSize)), cond);
2871         (this->*sgl)(Rt2, Address(post(adr.base(), adr.index(), adr.shift())), cond);
2872         sub(adr.base(), wordSize, cond);
2873 #endif
2874         ShouldNotReachHere();
2875       } else if (adr.get_wb_mode() == Address::off) {
2876         (this->*sgl)(Rt, Address(pre(adr.base(), adr.index(), adr.shift(), adr.op())), cond);
2877         (this->*sgl)(Rt2, Address(adr.base(), wordSize), cond);
2878         compensate_addr_offset(adr, cond);
2879       } else {
2880         ShouldNotReachHere();
2881       }
2882     }
2883   }
2884 }
2885 
2886 #ifdef ASSERT
2887 void MacroAssembler::verify_stack_alignment() {
2888   if (StackAlignmentInBytes > 4) {
2889     Label x;
2890     tst(sp, StackAlignmentInBytes-1);
2891     b(x, EQ);
2892     stop("stack unaligned");
2893     bind(x);
2894   }
2895 }
2896 #endif
2897 
/**
 * Code for the BigInteger::multiplyToLen() intrinsic.
 *
 * r0: x
 * r1: xlen
 * r2: y
 * r3: ylen
 * r4: z
 * r5: zlen
 *
 */
2909 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen,
2910                                      Register z, Register zlen,
2911                                      Register tmp1, Register tmp2, Register tmp3, Register tmp4,
2912                                      Register tmp5, Register tmp6) {
2913 
2914   assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
2915 
2916   const Register xc = xlen;
2917   const Register yc = tmp1;
2918   const Register zc = tmp2;
2919 
2920   const Register vz = tmp3;
2921   const Register carry = tmp4;
2922   const Register vx = tmp5;
2923   const Register vy = tmp6;
2924 
  // ensure y (the inner loop) is not longer than x (the outer loop); in theory this uses the CPU caches more effectively
2926   Label L_x_longer;
2927   cmp(xlen, ylen);
2928   b(L_x_longer, Assembler::GE);
2929 #define SWP(X, Y) \
2930   mov(tmp1, Y); \
2931   mov(Y, X); \
2932   mov(X, tmp1)
2933   SWP(x, y);
2934   SWP(xlen, ylen);
2935   bind(L_x_longer);
2936 
2937   lea(xc, Address(x, xlen, lsl(LogBytesPerInt))); // x[xstart]
2938   lea(yc, Address(y, ylen, lsl(LogBytesPerInt))); // y[idx]
2939   lea(zc, Address(z, zlen, lsl(LogBytesPerInt))); // z[kdx]
2940 
2941   // First Loop.
2942   //
2943   //  final static long LONG_MASK = 0xffffffffL;
2944   //  int xstart = xlen - 1;
2945   //  int ystart = ylen - 1;
2946   //  long carry = 0;
2947   //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
2948   //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
2949   //    z[kdx] = (int)product;
2950   //    carry = product >>> 32;
2951   //  }
2952   //  z[xstart] = (int)carry;
2953   //
2954 
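  // The loops below rely on the ARM UMAAL instruction:
  //   umaal(RdLo, RdHi, Rn, Rm)  computes  {RdHi:RdLo} = Rn * Rm + RdLo + RdHi
  // so with vz preloaded (0 here, z[k] in the third loop) it yields exactly
  // the 64-bit 'product' above: the low word lands in vz, the carry in 'carry'.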
2955   ldr(vx, Assembler::pre(xc, -BytesPerInt));
2956   mov(carry, 0);
2957 
2958   Label L_loop_1;
2959   bind(L_loop_1);
2960   ldr(vy, Assembler::pre(yc, -BytesPerInt));
2961   mov(vz, 0);
2962   umaal(vz, carry, vx, vy);
2963   str(vz, Assembler::pre(zc, -BytesPerInt));
2964   cmp(yc, y);
2965   b(L_loop_1, Assembler::GT);
2966 
2967   str(carry, Address(zc, -BytesPerInt));
2968 
2969   // Second and third (nested) loops.
2970   //
2971   // for (int i = xstart-1; i >= 0; i--) { // Second loop
2972   //   carry = 0;
2973   //   for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop
2974   //     long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) +
2975   //                    (z[k] & LONG_MASK) + carry;
2976   //     z[k] = (int)product;
2977   //     carry = product >>> 32;
2978   //   }
2979   //   z[i] = (int)carry;
2980   // }
2981   //
2982   Label L_loop_2, L_loop_3;
2983   bind(L_loop_2);
2984 
2985   sub(zlen, zlen, 1);
2986   lea(yc, Address(y, ylen, lsl(LogBytesPerInt))); // y[jdx]
2987   lea(zc, Address(z, zlen, lsl(LogBytesPerInt))); // z[k]
2988 
2989   ldr(vx, Assembler::pre(xc, -BytesPerInt));
2990   mov(carry, 0);
2991 
2992   bind(L_loop_3);
2993   ldr(vy, Assembler::pre(yc, -BytesPerInt));
2994   ldr(vz, Assembler::pre(zc, -BytesPerInt)); // r1 is vz, r2 is carry
2995   umaal(vz, carry, vx, vy);
2996   str(vz, Address(zc));
2997   cmp(yc, y);
2998   b(L_loop_3, Assembler::GT);
2999 
3000   str(carry, Address(zc, -BytesPerInt));
3001   cmp(xc, x);
3002   b(L_loop_2, Assembler::GT);
3003 }
3004 
/**
 * Code for the BigInteger::mulAdd() intrinsic.
 *
 * r0: out
 * r1: in
 * r2: offset
 * r3: len
 * r4: k
 */
3014 void MacroAssembler::mul_add(Register out, Register in, Register offset, Register len, Register k,
3015                               Register tmp1, Register tmp2, Register tmp3) {
3016 
3017   assert_different_registers(out, in, offset, len, k, tmp1, tmp2, tmp3);
3018 
3019   Register vin = tmp1;
3020   Register vout = tmp2;
3021   Register carry = tmp3;
3022   Register result = r0;
3023 
3024 //        long kLong = k & LONG_MASK;
3025 //        long carry = 0;
3026 //
3027 //        offset = out.length-offset - 1;
3028 //        for (int j=len-1; j >= 0; j--) {
3029 //            long product = (in[j] & LONG_MASK) * kLong +
3030 //                           (out[offset] & LONG_MASK) + carry;
3031 //            out[offset--] = (int)product;
3032 //            carry = product >>> 32;
3033 //        }
3034 //        return (int)carry;
3035 
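  // As in multiply_to_len, a single UMAAL computes
  //   {carry:vout} = vin * k + vout + carry
  // which is precisely the 64-bit 'product' of the Java code above.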
3036   lea(in, Address(in, len, lsl(LogBytesPerInt)));
3037   lea(out, Address(out, offset, lsl(LogBytesPerInt)));
3038   mov(carry, 0);
3039 
3040   Label L_loop;
3041   bind(L_loop);
3042   ldr(vin, Assembler::pre(in, -BytesPerInt));
3043   ldr(vout, Assembler::pre(out, -BytesPerInt));
3044   umaal(vout, carry, vin, k);
3045   str(vout, Address(out));
3046   subs(len, len, 1);
3047   b(L_loop, Assembler::GT);
3048 
3049   mov(result, carry);
3050 }
3051 
3052 /**
3053  * Emits code to update CRC-32 with a byte value according to constants in table
3054  *
3055  * @param [in,out]crc   Register containing the crc.
3056  * @param [in]val       Register containing the byte to fold into the CRC.
3057  * @param [in]table     Register containing the table of crc constants.
3058  *
3059  * uint32_t crc;
3060  * val = crc_table[(val ^ crc) & 0xFF];
3061  * crc = val ^ (crc >> 8);
3062  *
3063  */
3064 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3065   eor(val, val, crc);
3066   andr(val, val, 0xff);
3067   ldr(val, Address(table, val, lsl(2)));
3068   eor(crc, val, crc, Assembler::lsr(8));
3069 }
3070 
3071 /**
3072  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3073  *
3074  * @param [in,out]crc   Register containing the crc.
3075  * @param [in]v         Register containing the 32-bit to fold into the CRC.
3076  * @param [in]table0    Register containing table 0 of crc constants.
3077  * @param [in]table1    Register containing table 1 of crc constants.
3078  * @param [in]table2    Register containing table 2 of crc constants.
3079  * @param [in]table3    Register containing table 3 of crc constants.
3080  *
3081  * uint32_t crc;
3082  *   v = crc ^ v
3083  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3084  *
3085  */
3086 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3087         Register tmp2, Register table0, Register table1, Register table2, Register table3) {
3088   eor(v, crc, v);
3089   uxtb(tmp, v);
3090   uxtb(tmp2, v, ror(8));
3091   ldr(crc, Address(table3, tmp, lsl(2)));
3092   ldr(tmp2, Address(table2, tmp2, lsl(2)));
3093   uxtb(tmp, v, ror(16));
3094   eor(crc, crc, tmp2);
3095   uxtb(tmp2, v, ror(24));
3096   ldr(tmp, Address(table1, tmp, lsl(2)));
3097   ldr(tmp2, Address(table0, tmp2, lsl(2)));
3098   eor(crc, crc, tmp);
3099   eor(crc, crc, tmp2);
3100 }
3101 
/**
 * @param crc       register containing the existing CRC (32-bit)
 * @param buf       register pointing to the input byte buffer (byte*)
 * @param len       register containing the number of bytes
 * @param table0..table3  registers that will hold the addresses of the four CRC tables
 * @param tmp, tmp2, tmp3  scratch registers
 * @param is_crc32c non-zero to compute CRC-32C instead of CRC-32
 */
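//
// With UseCRC32 the hardware crc32(c)w/crc32(c)b instructions are used
// directly; otherwise the code is table-driven "slicing-by-4". As a reference
// model (a sketch, not the emitted code), one word-sized step is:
//
//   uint32_t v = crc ^ load_le32(buf);  buf += 4;
//   crc = t3[v & 0xff] ^ t2[(v >> 8) & 0xff]
//       ^ t1[(v >> 16) & 0xff] ^ t0[v >> 24];
//
// with t0..t3 the four 256-entry tables set up in table0..table3 below.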
3109 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3110         Register table0, Register table1, Register table2, Register table3,
3111         Register tmp, Register tmp2, Register tmp3, int is_crc32c) {
3112   Label L_cpu, L_by8_loop, L_by1, L_by1_loop, L_align_by1_loop, L_align_exit, L_exit;
3113 
3114   if (!is_crc32c)
3115     inv(crc, crc);
3116   if (UseCRC32) {
3117     Label CRC_by4_loop, CRC_by1_loop;
3118 
3119       subs(len, len, 4);
3120       b(CRC_by4_loop, Assembler::GE);
3121       adds(len, len, 4);
3122       b(CRC_by1_loop, Assembler::GT);
3123       b(L_exit);
3124 
3125     BIND(CRC_by4_loop);
3126       ldr(tmp, Address(post(buf, 4)));
3127       subs(len, len, 4);
3128       if (!is_crc32c)
3129         crc32w(crc, crc, tmp);
3130       else // is_crc32c
3131         crc32cw(crc, crc, tmp);
3132       b(CRC_by4_loop, Assembler::GE);
3133       adds(len, len, 4);
3134       b(L_exit, Assembler::LE);
3135     BIND(CRC_by1_loop);
3136       ldrb(tmp, Address(post(buf, 1)));
3137       subs(len, len, 1);
3138       if (!is_crc32c)
3139         crc32b(crc, crc, tmp);
3140       else // is_crc32c
3141         crc32cb(crc, crc, tmp);
3142       b(CRC_by1_loop, Assembler::GT);
3143     BIND(L_exit);
3144       if (!is_crc32c)
3145         inv(crc, crc);
3146       return;
3147   }
3148     lea(table0, ExternalAddress(
3149         !is_crc32c ?
3150             StubRoutines::crc_table_addr() :
3151             StubRoutines::crc32c_table_addr() ));
3152     add(table1, table0, 1*256*sizeof(juint));
3153     add(table2, table0, 2*256*sizeof(juint));
3154     add(table3, table0, 3*256*sizeof(juint));
3155 
3156   BIND(L_align_by1_loop);
3157     tst(buf, 3);
3158     b(L_align_exit, Assembler::EQ);
3159     cmp(len, 0);
3160     b(L_exit, Assembler::EQ);
3161     sub(len, len, 1);
3162     ldrb(tmp, Address(post(buf, 1)));
3163     update_byte_crc32(crc, tmp, table0);
3164     b(L_align_by1_loop);
3165 
3166   BIND(L_align_exit);
3167 
  if (VM_Version::features() & FT_AdvSIMD) {
    if (UseNeon) {
3170       cmp(len, 32+12); // account for possible need for alignment
3171       b(L_cpu, Assembler::LT);
3172 
3173     Label L_fold, L_align_by4_loop, L_align_by4_exit;
3174 
3175     BIND(L_align_by4_loop);
3176       tst(buf, 0xf);
3177       b(L_align_by4_exit, Assembler::EQ);
3178       ldr(tmp, Address(post(buf, 4)));
3179       update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
3180       sub(len, len, 4);
3181       b(L_align_by4_loop);
3182 
3183     BIND(L_align_by4_exit);
3184 
3185       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3186 
3187       vld1_64(d0, d1, post(buf, 16), Assembler::ALIGN_128);
3188       vld1_64(d4, post(tmp, 8), Assembler::ALIGN_64);
3189       vld1_64(d5, post(tmp, 8), Assembler::ALIGN_64);
3190       vld1_64(d6, post(tmp, 8), Assembler::ALIGN_64);
3191       vld1_64(d7, post(tmp, 8), Assembler::ALIGN_64);
3192       veor_64(d16, d16, d16);
3193       vmov_32(d16, 0, crc);
3194 
3195       veor_64(d0, d0, d16);
3196       sub(len, len, 32);
3197 
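      // Folding loop: each pass carryless-multiplies the 16-byte accumulator
      // (q0) by precomputed folding constants (d4-d7, stored after the four
      // tables) using 8-bit polynomial multiplies, then recombines the partial
      // products, advancing the CRC by 16 input bytes per iteration; the
      // leftover words are finished through the tables after the loop.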
3198     BIND(L_fold);
3199       vmullp_8(q8, d0, d5);
3200       vmullp_8(q9, d0, d7);
3201       vmullp_8(q10, d0, d4);
3202       vmullp_8(q11, d0, d6);
3203 
3204       vmullp_8(q12, d1, d5);
3205       vmullp_8(q13, d1, d7);
3206       vmullp_8(q14, d1, d4);
3207       vmullp_8(q15, d1, d6);
3208 
3209       vuzp_128_16(q9, q8);
3210       veor_128(q8, q8, q9);
3211 
3212       vuzp_128_16(q13, q12);
3213       veor_128(q12, q12, q13);
3214 
3215       vshll_16u(q9, d16, 8);
3216       vshll_16u(q8, d17, 8);
3217 
3218       vshll_16u(q13, d24, 8);
3219       vshll_16u(q12, d25, 8);
3220 
3221       veor_128(q8, q8, q10);
3222       veor_128(q12, q12, q14);
3223       veor_128(q9, q9, q11);
3224       veor_128(q13, q13, q15);
3225 
3226       veor_64(d19, d19, d18);
3227       veor_64(d18, d27, d26);
3228 
3229       vshll_32u(q13, d18, 16);
3230       vshll_32u(q9, d19, 16);
3231 
3232       veor_128(q9, q8, q9);
3233       veor_128(q13, q12, q13);
3234 
3235       veor_64(d31, d26, d27);
3236       veor_64(d30, d18, d19);
3237 
3238       vshl_128_64(q15, q15, 1);
3239       vld1_64(d0, d1, post(buf, 16), Assembler::ALIGN_128);
3240       veor_128(q0, q0, q15);
3241 
3242       subs(len, len, 16);
3243       b(L_fold, Assembler::GE);
3244 
3245       vmov_32(tmp, d0, 0);
3246       mov(crc, 0);
3247       update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
3248       vmov_32(tmp, d0, 1);
3249       update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
3250       vmov_32(tmp, d1, 0);
3251       update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
3252       vmov_32(tmp, d1, 1);
3253       update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
3254 
3255       add(len, len, 16);
    } // if UseNeon
  } // if FT_AdvSIMD
3258 
3259   BIND(L_cpu);
3260     subs(len, len, 8);
3261     b(L_by8_loop, Assembler::GE);
3262     adds(len, len, 8);
3263     b(L_by1_loop, Assembler::GT);
3264     b(L_exit);
3265 
3266   BIND(L_by8_loop);
3267     ldr(tmp, Address(post(buf, 4)));
3268     update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
3269     ldr(tmp, Address(post(buf, 4)));
3270     update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
3271     subs(len, len, 8);
3272     b(L_by8_loop, Assembler::GE);
3273     adds(len, len, 8);
3274     b(L_exit, Assembler::LE);
3275   BIND(L_by1_loop);
3276     subs(len, len, 1);
3277     ldrb(tmp, Address(post(buf, 1)));
3278     update_byte_crc32(crc, tmp, table0);
3279     b(L_by1_loop, Assembler::GT);
3280 
3281   BIND(L_exit);
3282     if (!is_crc32c)
3283       inv(crc, crc);
3284 }
3285 
3286 /**
 * First AES round: round-key addition (CPU implementation)
3288  * @param in   register containing address of input data (plain or cipher text)
3289  * @param key  register containing address of the key data
3290  * @param t0   output register t0
3291  * @param t1   output register t1
3292  * @param t2   output register t2
3293  * @param t3   output register t3
3294  * @param t4   temporary register
3295  * @param t5   temporary register
3296  * @param t6   temporary register
3297  * @param t7   temporary register
3298  */
3299 void MacroAssembler::kernel_aescrypt_firstRound(Register in, Register key,
3300         Register t0, Register t1, Register t2, Register t3,
3301         Register t4, Register t5, Register t6, Register t7) {
3302 
3303   ldr(t4, Address(post(key, 4)));
3304   ldr(t5, Address(post(key, 4)));
3305   ldr(t6, Address(post(key, 4)));
3306   ldr(t7, Address(post(key, 4)));
3307   ldr(t0, Address(post(in, 4)));
3308   ldr(t1, Address(post(in, 4)));
3309   ldr(t2, Address(post(in, 4)));
3310   ldr(t3, Address(post(in, 4)));
3311   rev(t0, t0);
3312   rev(t1, t1);
3313   rev(t2, t2);
3314   rev(t3, t3);
3315   eor(t0, t0, t4);
3316   eor(t1, t1, t5);
3317   eor(t2, t2, t6);
3318   eor(t3, t3, t7);
3319 }
3320 
3321 /**
3322  * AES ECB Round
 * @param table_te  register containing the address of the AES T-table
3324  * @param key   register containing address of the key data
3325  * @param t0    Register for input value t0
3326  * @param t1    Register for input value t1
3327  * @param t2    Register for input value t2
3328  * @param t3    Register for input value t3
3329  * @param a     Register for output value
3330  * @param tmp1  Temporary register 1
3331  * @param tmp2  Temporary register 2
3332  */
3333 void MacroAssembler::kernel_aescrypt_round(Register table_te, Register key,
3334         Register t0, Register t1, Register t2, Register t3,
3335         Register a, Register tmp1, Register tmp2) {
3336 
3337   ldr(a, Address(post(key, 4))); // K
3338   uxtb(tmp1, t0, ror(24));
3339   ldr(tmp1, Address(table_te, tmp1, lsl(2))); // T1
3340   uxtb(tmp2, t1, ror(16));
3341   eor(a, a, tmp1);
3342   ldr(tmp2, Address(table_te, tmp2, lsl(2))); // T2
3343   uxtb(tmp1, t2, ror(8));
3344   eor(a, a, tmp2, ror(8));
3345   ldr(tmp1, Address(table_te, tmp1, lsl(2))); // T3
3346   uxtb(tmp2, t3);
3347   eor(a, a, tmp1, ror(16));
3348   ldr(tmp2, Address(table_te, tmp2, lsl(2))); // T4
3349   eor(a, a, tmp2, ror(24)); // a0
3350 };
3351 
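// In scalar terms the round above computes one output column as
//   a = K[k++] ^ Te[t0 >>> 24]
//             ^ ror(Te[(t1 >>> 16) & 0xff],  8)
//             ^ ror(Te[(t2 >>>  8) & 0xff], 16)
//             ^ ror(Te[(t3       ) & 0xff], 24)
// i.e. a single table Te plus rotations stands in for four rotated T-tables.
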
3352 /**
3353  *
 *  Last AES encryption round (4 bytes)
3355  * @param table_te
3356  * @param key
3357  * @param to
3358  * @param t0
3359  * @param t1
3360  * @param t2
3361  * @param t3
3362  * @param t4
3363  * @param t5
3364  * @param t6
3365  * @param t7
3366  *
3367  *           int tt = K[keyOffset++];
3368  *           out[outOffset++] = (byte)(S[(t0 >>> 24)       ] ^ (tt >>> 24));
3369  *           out[outOffset++] = (byte)(S[(t1 >>> 16) & 0xFF] ^ (tt >>> 16));
3370  *           out[outOffset++] = (byte)(S[(t2 >>>  8) & 0xFF] ^ (tt >>>  8));
3371  *           out[outOffset++] = (byte)(S[(t3       ) & 0xFF] ^ (tt       ));
3372  */
3373 void MacroAssembler::kernel_aescrypt_lastRound(
3374         Register table_te, Register key, Register to,
3375         Register t0, Register t1, Register t2, Register t3,
3376         Register t4, Register t5, Register t6, Register t7) {
3377 
3378   ldr(t7, Address(post(key, 4))); // tt
3379 
3380   uxtb(t5, t0, ror(24));
3381   ldr(t4, Address(table_te, t5, lsl(2))); // S[]
3382   uxtb(t6, t1, ror(16));
3383   eor(t4, t4, t7, lsr(24));
3384   ldr(t6, Address(table_te, t6, lsl(2))); // S[]
3385   uxtb(t5, t2, ror(8));
3386   eor(t6, t6, t7, lsr(16));
3387   uxtb(t6, t6);
3388   add(t4, t4, t6, lsl(8));
3389   ldr(t5, Address(table_te, t5, lsl(2))); // S[]
3390   uxtb(t6, t3);
3391   eor(t5, t5, t7, lsr(8));
3392   uxtb(t5, t5);
3393   add(t4, t4, t5, lsl(16));
3394   ldr(t6, Address(table_te, t6, lsl(2))); // S[]
3395   eor(t6, t6, t7);
3396   uxtb(t6, t6);
3397   add(t4, t4, t6, lsl(24));
3398 
3399   str(t4, Address(post(to, 4)));
3400 
3401 }
3402 
/**
 * Last AES round, CBC variant (4 bytes): substitutes the four state bytes
 * through the S-box and packs the results into t4. Unlike
 * kernel_aescrypt_lastRound, no round-key word is XORed in here; the caller
 * applies the final key material (and the CBC chaining) afterwards with NEON.
 *
 * @param table_te  register containing the S-box address
 * @param t0..t3    input state words
 * @param t4        receives the packed output word
 * @param t5, t6    temporary registers
 *
 *           t4  =  S[(t0 >>> 24)       ]
 *               | (S[(t1 >>> 16) & 0xFF] <<  8)
 *               | (S[(t2 >>>  8) & 0xFF] << 16)
 *               | (S[(t3       ) & 0xFF] << 24)
 */
3424 void MacroAssembler::kernel_aescrypt_lastRound_cbc(
3425         Register table_te,
3426         Register t0, Register t1, Register t2, Register t3,
3427         Register t4, Register t5, Register t6) {
3428 
3429   uxtb(t5, t0, ror(24));
3430   ldr(t4, Address(table_te, t5, lsl(2))); // S[]
3431   uxtb(t6, t1, ror(16));
3432   ldr(t6, Address(table_te, t6, lsl(2))); // S[]
3433   uxtb(t5, t2, ror(8));
3434   add(t4, t4, t6, lsl(8));
3435   ldr(t5, Address(table_te, t5, lsl(2))); // S[]
3436   uxtb(t6, t3);
3437   add(t4, t4, t5, lsl(16));
3438   ldr(t6, Address(table_te, t6, lsl(2))); // S[]
3439   add(t4, t4, t6, lsl(24));
3440 }
3441 
3442 /**
3443  * AES ECB encryption
3444  *
3445  * @param from      register pointing to source array address
3446  * @param to        register pointing to destination array address
3447  * @param key       register pointing to key
3448  * @param keylen    register containing key len in bytes
3449  */
3450 void MacroAssembler::kernel_aescrypt_encryptBlock(Register from, Register to,
3451         Register key, Register keylen, Register table_te,
3452         Register t0, Register t1, Register t2, Register t3,
3453         Register t4, Register t5, Register t6, Register t7) {
3454   Label L_loop;
3455   lea(table_te, ExternalAddress(StubRoutines::aes_table_te_addr()));
3456 
3457   ldr(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() -
3458           arrayOopDesc::base_offset_in_bytes(T_INT)));
3459 
3460 
3461   kernel_aescrypt_firstRound(from, key,
3462           t0, t1, t2, t3, t4, t5, t6, t7);
3463 
3464   sub(keylen, keylen, 8);
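  // keylen now counts the remaining round-key words: the key array length
  // minus the 4 words consumed by the first round and the 4 reserved for the
  // last round; each pass of the loop below consumes 4 words (one full round).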
3465   BIND(L_loop);
3466 
3467   kernel_aescrypt_round(table_te, key,
3468           t0, t1, t2, t3, t4, t7, from);
3469   kernel_aescrypt_round(table_te, key,
3470           t1, t2, t3, t0, t5, t7, from);
3471   kernel_aescrypt_round(table_te, key,
3472           t2, t3, t0, t1, t6, t7, from);
3473 
3474   uxtb(t7, t3, ror(24));
3475   ldr(t3, Address(table_te, t7, lsl(2))); // T1
3476   uxtb(t7, t0, ror(16));
3477   ldr(t7, Address(table_te, t7, lsl(2))); // T2
3478   mov(t0, t4); // t0=a0
3479   eor(t3, t3, t7, ror(8));
3480   uxtb(t7, t1, ror(8));
3481   ldr(t7, Address(table_te, t7, lsl(2))); // T3
3482   mov(t1, t5); // t1=a1
3483   eor(t3, t3, t7, ror(16));
3484   uxtb(t7, t2);
3485   ldr(t7, Address(table_te, t7, lsl(2))); // T4
3486   mov(t2, t6); // t2=a2
3487   eor(t3, t3, t7, ror(24));
3488   ldr(t7, Address(post(key, 4))); // K
3489   eor(t3, t3, t7); // t3 = a3
3490 
3491   subs(keylen, keylen, 4);
3492   b(L_loop, Assembler::NE);
3493 
3494   // last round is special
  add(table_te, table_te, 4 * 256); // advance past the Te table to the S-box
3496 
3497   kernel_aescrypt_lastRound(
3498           table_te, key, to,
3499           t0, t1, t2, t3,
3500           t4, t5, t6, t7);
3501 
3502   kernel_aescrypt_lastRound(
3503           table_te, key, to,
3504           t1, t2, t3, t0,
3505           t4, t5, t6, t7);
3506 
3507   kernel_aescrypt_lastRound(
3508           table_te, key, to,
3509           t2, t3, t0, t1,
3510           t4, t5, t6, t7);
3511 
3512   kernel_aescrypt_lastRound(
3513           table_te, key, to,
3514           t3, t0, t1, t2,
3515           t4, t5, t6, t7);
3516 }
3517 
3518 /**
3519  * AES ECB decryption
3520  * @param from      register pointing to source array address
3521  * @param to        register pointing to destination array address
3522  * @param key       register pointing to key
3523  * @param keylen    register containing key len in bytes
3524  */
3525 void MacroAssembler::kernel_aescrypt_decryptBlock(Register from, Register to,
3526         Register key, Register keylen, Register table_te,
3527         Register t0, Register t1, Register t2, Register t3,
3528         Register t4, Register t5, Register t6, Register t7) {
3529   Label L_loop;
3530   lea(table_te, ExternalAddress(StubRoutines::aes_table_td_addr()));
3531 
3532   ldr(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() -
3533           arrayOopDesc::base_offset_in_bytes(T_INT)));
3534 
3535   push(key, sp);
3536 
3537   add(key, key, 16);
3538   kernel_aescrypt_firstRound(from, key,
3539           t0, t1, t2, t3, t4, t5, t6, t7);
3540 
3541   sub(keylen, keylen, 8);
3542   BIND(L_loop);
3543 
3544   kernel_aescrypt_round(table_te, key,
3545           t0, t3, t2, t1, t4, t7, from);
3546   kernel_aescrypt_round(table_te, key,
3547           t1, t0, t3, t2, t5, t7, from);
3548   kernel_aescrypt_round(table_te, key,
3549           t2, t1, t0, t3, t6, t7, from);
3550 
3551   uxtb(t7, t3, ror(24));
3552   ldr(t3, Address(table_te, t7, lsl(2))); // T1
3553   uxtb(t7, t2, ror(16));
3554   ldr(t7, Address(table_te, t7, lsl(2))); // T2
3555   mov(t2, t6); // t2=a2
3556   eor(t3, t3, t7, ror(8));
3557   uxtb(t7, t1, ror(8));
3558   ldr(t7, Address(table_te, t7, lsl(2))); // T3
3559   mov(t1, t5); // t1=a1
3560   eor(t3, t3, t7, ror(16));
3561   uxtb(t7, t0);
3562   ldr(t7, Address(table_te, t7, lsl(2))); // T4
3563   mov(t0, t4); // t0=a0
3564   eor(t3, t3, t7, ror(24));
3565   ldr(t7, Address(post(key, 4))); // K
3566   eor(t3, t3, t7); // t3 = a3
3567 
3568   subs(keylen, keylen, 4);
3569   b(L_loop, Assembler::NE);
3570 
3571   pop(key, sp);
3572   // last round is special
3573   add(table_te, table_te, 4 * 256); //S
3574 
3575   kernel_aescrypt_lastRound(
3576           table_te, key, to,
3577           t0, t3, t2, t1,
3578           t4, t5, t6, t7);
3579 
3580   kernel_aescrypt_lastRound(
3581           table_te, key, to,
3582           t1, t0, t3, t2,
3583           t4, t5, t6, t7);
3584 
3585   kernel_aescrypt_lastRound(
3586           table_te, key, to,
3587           t2, t1, t0, t3,
3588           t4, t5, t6, t7);
3589 
3590   kernel_aescrypt_lastRound(
3591           table_te, key, to,
3592           t3, t2, t1, t0,
3593           t4, t5, t6, t7);
3594 }
3595 
3596 /**
3597  * AES CBC encryption
3598  *
3599  * @param from      register pointing to source array address
3600  * @param to        register pointing to destination array address
3601  * @param key       register pointing to key
3602  * @param rvec      register pointing to roundkey vector
3603  * @param len       register containing source len in bytes
3604  */
3605 void MacroAssembler::kernel_aescrypt_encrypt(Register from, Register to,
3606         Register key, Register rvec, Register len, Register keylen, Register table_te,
3607         Register t0, Register t1, Register t2, Register t3,
3608         Register t4, Register t5, Register t6) {
3609   Label L_loop, L_loop2;
3610   lea(table_te, ExternalAddress(StubRoutines::aes_table_te_addr()));
3611   ldr(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() -
3612           arrayOopDesc::base_offset_in_bytes(T_INT)));
3613 
3614   vld1_64(d4, d5, Address(rvec), Assembler::ALIGN_STD); // read rvec bytes to q2
3615   vld1_64(d2, d3, Address(post(key, 16)), Assembler::ALIGN_STD); // read key to q1
3616   sub(keylen, keylen, 8);
3617 
3618   add(t4, key, keylen, lsl(2));
3619   vld1_64(d8, d9, Address(t4), Assembler::ALIGN_STD); // read last key bytes to q4
3620   vrev32_128_8(q4, q4);
3621 
3622   push(to, sp);
3623   BIND(L_loop2);
3624   // get round key and first round
3625   vld1_64(d0, d1, Address(post(from, 16)), Assembler::ALIGN_STD); // read 16 bytes to q0
3626   veor_128(q0, q0, q2);
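  // CBC chaining: q2 holds the previous ciphertext block (initialised from
  // rvec), so each plaintext block is XORed with it before being encrypted.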
3627   vrev32_128_8(q0, q0);
3628   veor_128(q0, q0, q1);
3629   vmov_f64(t0, t1, d0);
3630   vmov_f64(t2, t3, d1);
3631 
3632   push(RegSet::of(key, from), sp);
3633   push(RegSet::of(to, keylen), sp);
3634 
3635   BIND(L_loop);
3636 
3637   kernel_aescrypt_round(table_te, key,
3638           t0, t1, t2, t3, t4, to, from);
3639   kernel_aescrypt_round(table_te, key,
3640           t1, t2, t3, t0, t5, to, from);
3641   kernel_aescrypt_round(table_te, key,
3642           t2, t3, t0, t1, t6, to, from);
3643 
3644   uxtb(to, t3, ror(24));
3645   ldr(t3, Address(table_te, to, lsl(2))); // T1
3646   uxtb(to, t0, ror(16));
3647   ldr(to, Address(table_te, to, lsl(2))); // T2
3648   mov(t0, t4); // t0=a0
3649   eor(t3, t3, to, ror(8));
3650   uxtb(to, t1, ror(8));
3651   ldr(to, Address(table_te, to, lsl(2))); // T3
3652   mov(t1, t5); // t1=a1
3653   eor(t3, t3, to, ror(16));
3654   uxtb(to, t2);
3655   ldr(to, Address(table_te, to, lsl(2))); // T4
3656   mov(t2, t6); // t2=a2
3657   eor(t3, t3, to, ror(24));
3658   ldr(to, Address(post(key, 4))); // K
3659   eor(t3, t3, to); // t3 = a3
3660 
3661   subs(keylen, keylen, 4);
3662   b(L_loop, Assembler::NE);
3663 
3664   // last round is special
3665   add(table_te, table_te, 4 * 256); //S
3666   kernel_aescrypt_lastRound_cbc(
3667           table_te,
3668           t0, t1, t2, t3,
3669           t4, t5, t6);
3670 
3671   kernel_aescrypt_lastRound_cbc(
3672           table_te,
3673           t1, t2, t3, t0,
3674           t5, t6, from);
3675   vmov_f64(d6, t4, t5);
3676 
3677   kernel_aescrypt_lastRound_cbc(
3678           table_te,
3679           t2, t3, t0, t1,
3680           t4, t5, t6);
3681 
3682   kernel_aescrypt_lastRound_cbc(
3683           table_te,
3684           t3, t0, t1, t2,
3685           t5, t6, from);
3686   vmov_f64(d7, t4, t5);
3687   veor_128(q2, q4, q3);
3688 
3689   pop(RegSet::of(to, keylen), sp);
3690   sub(table_te, table_te, 4 * 256); //Te
3691   vst1_64(d4, Address(post(to, 8)), Assembler::ALIGN_STD);
3692   pop(RegSet::of(key, from), sp);
3693   vst1_64(d5, Address(post(to, 8)), Assembler::ALIGN_STD);
3694 
3695   subs(len, len, 16);
3696   b(L_loop2, Assembler::NE);
3697   vstr_f64(d4, Address(rvec));
3698   vstr_f64(d5, Address(rvec, 8));
3699   mov(r0, to);
3700   pop(to, sp);
3701   sub(r0, r0, to);
3702 };
3703 
3704 /**
3705  * AES CBC decryption
3706  *
3707  * @param from      register pointing to source array address
3708  * @param to        register pointing to destination array address
3709  * @param key       register pointing to key
3710  * @param rvec      register pointing to roundkey vector
3711  * @param len       register containing source len in bytes
3712  */
3713 void MacroAssembler::kernel_aescrypt_decrypt(Register from, Register to,
3714         Register key, Register rvec, Register len, Register keylen, Register table_te,
3715         Register t0, Register t1, Register t2, Register t3,
3716         Register t4, Register t5, Register t6) {
3717   Label L_loop, L_loop2;
3718   lea(table_te, ExternalAddress(StubRoutines::aes_table_td_addr()));
3719 
3720   ldr(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() -
3721           arrayOopDesc::base_offset_in_bytes(T_INT)));
3722 
3723   vld1_64(d2, d3, Address(post(key, 16)), Assembler::ALIGN_STD); // read key to q1
3724   vld1_64(d4, d5, Address(rvec), Assembler::ALIGN_STD); // read rvec bytes to q2
3725   vld1_64(d10, d11, Address(post(key, 16)), Assembler::ALIGN_STD); // read key to q5
3726   vrev32_128_8(q1, q1);
3727   sub(keylen, keylen, 8);
3728 
3729   push(to, sp);
3730   BIND(L_loop2);
3731   // get round key and first round
3732   vld1_64(d8, d9, Address(post(from, 16)), Assembler::ALIGN_STD); // read 16 bytes to q4
3733 
3734   push(RegSet::of(to, key, from, keylen), sp);
3735   vrev32_128_8(q0, q4);
3736   veor_128(q0, q0, q5);
3737   vmov_f64(t0, t1, d0);
3738   vmov_f64(t2, t3, d1);
3739 
3740   BIND(L_loop);
3741 
3742   kernel_aescrypt_round(table_te, key,
3743           t0, t3, t2, t1, t4, to, from);
3744   kernel_aescrypt_round(table_te, key,
3745           t1, t0, t3, t2, t5, to, from);
3746   kernel_aescrypt_round(table_te, key,
3747           t2, t1, t0, t3, t6, to, from);
3748 
3749   uxtb(to, t3, ror(24));
3750   ldr(t3, Address(table_te, to, lsl(2))); // T1
3751   uxtb(to, t2, ror(16));
3752   ldr(to, Address(table_te, to, lsl(2))); // T2
3753   mov(t2, t6); // t2=a2
3754   eor(t3, t3, to, ror(8));
3755   uxtb(to, t1, ror(8));
3756   ldr(to, Address(table_te, to, lsl(2))); // T3
3757   mov(t1, t5); // t1=a1
3758   eor(t3, t3, to, ror(16));
3759   uxtb(to, t0);
3760   ldr(to, Address(table_te, to, lsl(2))); // T4
3761   mov(t0, t4); // t0=a0
3762   eor(t3, t3, to, ror(24));
3763   ldr(to, Address(post(key, 4))); // K
3764   eor(t3, t3, to); // t3 = a3
3765 
3766   subs(keylen, keylen, 4);
3767   b(L_loop, Assembler::NE);
3768 
3769   // last round is special
3770   add(table_te, table_te, 4 * 256); //S
3771 
3772   kernel_aescrypt_lastRound_cbc(
3773           table_te,
3774           t0, t3, t2, t1,
3775           t4, t5, t6);
3776 
3777   kernel_aescrypt_lastRound_cbc(
3778           table_te,
3779           t1, t0, t3, t2,
3780           t5, t6, to);
3781   vmov_f64(d6, t4, t5); //q3
3782 
3783   kernel_aescrypt_lastRound_cbc(
3784           table_te,
3785           t2, t1, t0, t3,
3786           t4, t5, t6);
3787 
3788   kernel_aescrypt_lastRound_cbc(
3789           table_te,
3790           t3, t2, t1, t0,
3791           t5, t6, to);
3792   vmov_f64(d7, t4, t5); //q3
3793   pop(RegSet::of(to, key, from, keylen), sp);
3794   veor_128(q3, q1, q3);
3795   veor_128(q3, q3, q2);
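  // CBC chaining on decryption: the decrypted block in q3 (already combined
  // with the key material kept in q1) is XORed with the previous ciphertext
  // block (q2); q4 still holds this iteration's ciphertext and is copied into
  // q2 just below for the next iteration.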
3796   vshl_128_64(q2, q4, 0);
3797 
3798   sub(table_te, table_te, 4 * 256); //Te
3799 
3800   vst1_64(d6, Address(post(to, 8)), Assembler::ALIGN_STD);
3801   subs(len, len, 16);
3802   vst1_64(d7, Address(post(to, 8)), Assembler::ALIGN_STD);
3803 
3804   b(L_loop2, Assembler::NE);
3805 
3806   vstr_f64(d4, Address(rvec));
3807   vstr_f64(d5, Address(rvec, 8));
3808   mov(r0, to);
3809   pop(to, sp);
3810   sub(r0, r0, to);
3811 };
3812 
/*
 * First round (iterations 0-19) of the SHA1 algorithm:
 * F = (b and c) or ((not b) and d), computed as ((c xor d) and b) xor d.
 * A non-zero sh rotates c left by sh bits on the fly.
 */
3816 void MacroAssembler::sha_round1(Register st_b, Register st_c, Register st_d,
3817         Register tmp, Register st_f, int sh) {
3818   if (sh) {
3819     eor(st_f, st_d, st_c, ror(32-sh));
3820   } else {
3821     eor(st_f, st_d, st_c);
3822   }
3823   andr(st_f, st_f, st_b);
3824   eor(st_f, st_f, st_d);
3825 }
3826 
/*
 * Second and fourth rounds (iterations 20-39 and 60-79) of the SHA1 algorithm:
 * F = b xor c xor d
 */
3830 void MacroAssembler::sha_round2(Register st_b, Register st_c, Register st_d,
3831         Register tmp, Register st_f, int sh) {
3832   if (sh) {
3833     eor(st_f, st_b, st_c, ror(32-sh));
3834   } else {
3835     eor(st_f, st_b, st_c);
3836   }
3837   eor(st_f, st_f, st_d);
3838 }
3839 
/*
 * Third round (iterations 40-59) of the SHA1 algorithm:
 * F = (b and c) or (b and d) or (c and d), computed as (b and c) or (d and (b or c))
 */
3843 void MacroAssembler::sha_round3(Register st_b, Register st_c, Register st_d,
3844         Register tmp, Register st_f, int sh) {
3845   if (sh) {
3846     andr(st_f, st_b, st_c, ror(32-sh));
3847     orr(tmp, st_b, st_c, ror(32-sh));
3848   } else {
3849     andr(st_f, st_b, st_c);
3850     orr(tmp, st_b, st_c);
3851   }
3852   andr(tmp, st_d, tmp);
3853   orr(st_f, st_f, tmp);
3854 }
3855 
3856 /*
3857  * Calculate Deltas w[i] and w[i+1]
3858  * w[i] = (w[i-3] xor w[i-8] xor w[i-14] xor w[i-16]) rotl 1
3859  */
3860 void MacroAssembler::sha_w0(FloatRegister w16, FloatRegister w14,
3861         FloatRegister w8, FloatRegister w4, FloatRegister w2,
3862         FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4,
3863         FloatRegister st_k, FloatRegister st_kw, bool update) {
3864   vadd_64_32(st_kw, st_k, w16);
3865   if(update) {
3866     veor_64(tmp1, w16, w14);
3867     vext_64(tmp2, w2, w4, 4);
3868     veor_64(tmp3, tmp1, w8);
3869     veor_64(tmp4, tmp3, tmp2);
3870 
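    // rotl(x, 1) realised as (x >> 31) | (x << 1) on each 32-bit lane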
3871     vshr_64_u32(tmp1, tmp4, 31);
3872     vshl_64_32(tmp2, tmp4, 1);
3873     vorr_64(w16, tmp1, tmp2);
3874   }
3875 }
/*
 * Calculate deltas w[i] and w[i+1]; dispatches on (counter mod 8) because the
 * eight w registers form a ring whose roles advance by one slot per call.
 */
3879 void MacroAssembler::sha_w(FloatRegister w16, FloatRegister w14,
3880         FloatRegister w12, FloatRegister w10, FloatRegister w8,
3881         FloatRegister w6, FloatRegister w4, FloatRegister w2,
3882         FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4,
3883         FloatRegister st_k, FloatRegister st_kw, Register counter, Register rtmp,
3884         bool update) {
3885   Label L_7, L_6, L_5, L_4, L_3, L_2, L_1, L_done;
3886   andr(rtmp, counter, 0x7);
3887   add(counter, counter, 1);
3888   cmp(rtmp, 7);
3889   b(L_7, Assembler::EQ);
3890   cmp(rtmp, 6);
3891   b(L_6, Assembler::EQ);
3892   cmp(rtmp, 5);
3893   b(L_5, Assembler::EQ);
3894   cmp(rtmp, 4);
3895   b(L_4, Assembler::EQ);
3896   cmp(rtmp, 3);
3897   b(L_3, Assembler::EQ);
3898   cmp(rtmp, 2);
3899   b(L_2, Assembler::EQ);
3900   cmp(rtmp, 1);
3901   b(L_1, Assembler::EQ);
3902     sha_w0(w16, w14, w8, w4, w2, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update);
3903     b(L_done);
3904   BIND(L_1); {
3905     sha_w0(w14, w12, w6, w2, w16, tmp1, tmp2, tmp3, tmp4,  st_k, st_kw, update);
3906     b(L_done);
3907   }
3908   BIND(L_2); {
3909     sha_w0(w12, w10, w4, w16, w14, tmp1, tmp2, tmp3, tmp4,  st_k, st_kw, update);
3910     b(L_done);
3911   }
3912   BIND(L_3); {
3913     sha_w0(w10, w8, w2, w14, w12, tmp1, tmp2, tmp3, tmp4,  st_k, st_kw, update);
3914     b(L_done);
3915   }
3916   BIND(L_4); {
3917     sha_w0(w8, w6, w16, w12, w10, tmp1, tmp2, tmp3, tmp4,  st_k, st_kw, update);
3918     b(L_done);
3919   }
3920   BIND(L_5); {
3921     sha_w0(w6, w4, w14, w10, w8, tmp1, tmp2, tmp3, tmp4,  st_k, st_kw, update);
3922     b(L_done);
3923   }
3924   BIND(L_6); {
3925     sha_w0(w4, w2, w12, w8, w6, tmp1, tmp2, tmp3, tmp4,  st_k, st_kw, update);
3926     b(L_done);
3927   }
3928   BIND(L_7); {
3929     sha_w0(w2, w16, w10, w6, w4, tmp1, tmp2, tmp3, tmp4,  st_k, st_kw, update);
3930   }
3931   BIND(L_done);
3932 }
3933 
3934 /**
3935  * SHA1 digest
3936  *
3937  * @param from      register pointing to source array address
3938  * @param state     register pointing to state array address
3939  */
3940 void MacroAssembler::kernel_sha_implCompress(Register from, Register state,
3941         Register counter, Register table_k,
3942         Register st_a, Register st_b,
3943         Register st_c, Register st_d, Register st_e,
3944         Register tmp, Register counter2, Register st_new_a, Register st_w) {
3945   Label L_round_1, L_round_2, L_round_3, L_round_4, L_round_4_cont, L_hash_no_w;
3946 
3947   FloatRegister w16 = d0;  //q0-q7
3948   FloatRegister w14 = w16->successor(FloatRegisterImpl::DOUBLE);
3949   FloatRegister w12 = w14->successor(FloatRegisterImpl::DOUBLE);
3950   FloatRegister w10 = w12->successor(FloatRegisterImpl::DOUBLE);
3951   FloatRegister w8  = w10->successor(FloatRegisterImpl::DOUBLE);
3952   FloatRegister w6  = w8->successor(FloatRegisterImpl::DOUBLE);
3953   FloatRegister w4  = w6->successor(FloatRegisterImpl::DOUBLE);
3954   FloatRegister w2  = w4->successor(FloatRegisterImpl::DOUBLE);
3955   FloatRegister wtmp1  = w2->successor(FloatRegisterImpl::DOUBLE);
3956   FloatRegister wtmp2  = wtmp1->successor(FloatRegisterImpl::DOUBLE);
3957   FloatRegister wtmp3  = wtmp2->successor(FloatRegisterImpl::DOUBLE);
3958   FloatRegister wtmp4  = wtmp3->successor(FloatRegisterImpl::DOUBLE);
3959   FloatRegister st_k1  = wtmp4->successor(FloatRegisterImpl::DOUBLE);
3960   FloatRegister st_k2  = st_k1->successor(FloatRegisterImpl::DOUBLE);
3961   FloatRegister st_k   = st_k2->successor(FloatRegisterImpl::DOUBLE);
3962   FloatRegister st_kw  = st_k->successor(FloatRegisterImpl::DOUBLE);
3963 
3964 
3965   assert_different_registers(st_a,st_b,st_c,st_d,st_e,tmp,counter2, st_new_a, st_w);
3966   assert_different_registers(w2,w4,w6,w8,w10,w12,w14,w16);
3967 
3968   lea(table_k, ExternalAddress(StubRoutines::sha1_table_addr()));
3969 
3970   // read initial 16 W elements
3971   vld1_64(w16,  w14,  w12,  w10,  Address(post(from, 32)), Assembler::ALIGN_STD);
3972   vld1_64(w8,   w6,   w4,   w2,   Address(from), Assembler::ALIGN_STD);
3973 
  // byte-reverse W (the message words are big-endian)
3975   vrev64_128_8(w16, w16);
3976   vrev64_128_8(w12, w12);
3977   vrev64_128_8(w8,  w8);
3978   vrev64_128_8(w4,  w4);
3979   // load state
3980   ldr(st_a, Address(post(state, 4)));
3981   ldr(st_b, Address(post(state, 4)));
3982   ldr(st_c, Address(post(state, 4)));
3983   ldr(st_d, Address(post(state, 4)));
3984   ldr(st_e, Address(state));
3985   sub(state, state, 16);
3986 
3987   mov(counter2, 0);
3988   mov(counter, 10);
3989   // first round
3990   vld1_64(st_k1, st_k2, Address(table_k), Assembler::ALIGN_128);
3991   vdup_64_32(st_k, st_k1, 0);
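  // Each pass of L_round_1 performs two SHA-1 iterations (sha_w leaves w[i]
  // and w[i+1] in st_kw), so 10 passes cover iterations 0..19 of this round;
  // the three rounds below follow the same two-per-pass pattern.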
3992 
3993   BIND(L_round_1); {
3994     sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp);
3995 
3996     sha_round1(st_b, st_c, st_d, tmp, st_new_a, 0);
3997     vmov_32(st_w, st_kw, 1);
3998     add(st_new_a, st_new_a, st_a, ror(32-5));
3999     add(st_new_a, st_new_a, st_e);
4000     add(st_new_a, st_new_a, st_w);
4001 
4002     vmov_32(st_w, st_kw, 0);
4003     sha_round1(st_a, st_b, st_c, tmp, st_e, 30);
4004 
4005     add(tmp, st_e, st_new_a, ror(32-5));
4006     add(tmp, tmp, st_d);
4007 
4008     mov(st_e, st_c);
4009     mov(st_d, st_b, ror(32-30));
4010     mov(st_c, st_a, ror(32-30));
4011     mov(st_b, st_new_a);
4012     add(st_a, tmp, st_w);
4013 
4014     sub(counter, counter, 1);
  } cbnz(counter, L_round_1);
4016 
4017   mov(counter, 10);
4018   // second round
4019   vdup_64_32(st_k, st_k1, 1);
4020 
4021   BIND(L_round_2); {
4022     sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp);
4023 
4024     sha_round2(st_b, st_c, st_d, tmp, st_new_a, 0);
4025     vmov_32(st_w, st_kw, 1);
4026     add(st_new_a, st_new_a, st_a, ror(32-5));
4027     add(st_new_a, st_new_a, st_e);
4028     add(st_new_a, st_new_a, st_w);
4029 
4030     vmov_32(st_w, st_kw, 0);
4031     sha_round2(st_a, st_b, st_c, tmp, st_e, 30);
4032 
4033     add(tmp, st_e, st_new_a, ror(32-5));
4034     add(tmp, tmp, st_d);
4035 
4036     mov(st_e, st_c);
4037     mov(st_d, st_b, ror(32-30));
4038     mov(st_c, st_a, ror(32-30));
4039     mov(st_b, st_new_a);
4040     add(st_a, tmp, st_w);
4041 
4042     sub(counter, counter, 1);
  } cbnz(counter, L_round_2);
4044 
4045   mov(counter, 10);
4046   vdup_64_32(st_k, st_k2, 0);
4047   // third round
4048 
4049   BIND(L_round_3); {
4050     sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp);
4051 
4052     sha_round3(st_b, st_c, st_d, tmp, st_new_a, 0);
4053     vmov_32(st_w, st_kw, 1);
4054     add(st_new_a, st_new_a, st_a, ror(32-5));
4055     add(st_new_a, st_new_a, st_e);
4056     add(st_new_a, st_new_a, st_w);
4057 
4058     vmov_32(st_w, st_kw, 0);
4059     sha_round3(st_a, st_b, st_c, tmp, st_e, 30);
4060 
4061     add(tmp, st_e, st_new_a, ror(32-5));
4062     add(tmp, tmp, st_d);
4063 
4064     mov(st_e, st_c);
4065     mov(st_d, st_b, ror(32-30));
4066     mov(st_c, st_a, ror(32-30));
4067     mov(st_b, st_new_a);
4068     add(st_a, tmp, st_w);
4069 
4070     sub(counter, counter, 1);
  } cbnz(counter, L_round_3);
4072 
4073   mov(counter, 10);
  // fourth round
4075   vdup_64_32(st_k, st_k2, 1);
4076 
4077   BIND(L_round_4); {
4078     sub(counter, counter, 1);
4079     cmp(counter, 8);
4080     b(L_hash_no_w, Assembler::LO);
4081     sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp);
4082     b(L_round_4_cont);
4083     BIND(L_hash_no_w);
4084     sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp, false);
4085     BIND(L_round_4_cont);
4086 
4087     sha_round2(st_b, st_c, st_d, tmp, st_new_a, 0);
4088     vmov_32(st_w, st_kw, 1);
4089     add(st_new_a, st_new_a, st_a, ror(32-5));
4090     add(st_new_a, st_new_a, st_e);
4091     add(st_new_a, st_new_a, st_w);
4092 
4093     vmov_32(st_w, st_kw, 0);
4094     sha_round2(st_a, st_b, st_c, tmp, st_e, 30);
4095 
4096     add(tmp, st_e, st_new_a, ror(32-5));
4097     add(tmp, tmp, st_d);
4098 
4099     mov(st_e, st_c);
4100     mov(st_d, st_b, ror(32-30));
4101     mov(st_c, st_a, ror(32-30));
4102     mov(st_b, st_new_a);
4103     add(st_a, tmp, st_w);
4104 
  } cbnz(counter, L_round_4);
4106 
  // reload the initial state and add it to the working state
4108   ldr(tmp, Address(post(state, 4)));
4109   add(st_a, st_a, tmp);
4110   ldr(tmp, Address(post(state, 4)));
4111   add(st_b, st_b, tmp);
4112   ldr(tmp, Address(post(state, 4)));
4113   add(st_c, st_c, tmp);
4114   ldr(tmp, Address(post(state, 4)));
4115   add(st_d, st_d, tmp);
4116   ldr(tmp, Address(state));
4117   add(st_e, st_e, tmp);
4118   sub(state, state, 16);
4119 
4120   // save state
4121   str(st_a, Address(post(state, 4)));
4122   str(st_b, Address(post(state, 4)));
4123   str(st_c, Address(post(state, 4)));
4124   str(st_d, Address(post(state, 4)));
4125   str(st_e, Address(state));
4126 }
4127 /**
4128  * One iteration of SHA256 algorithm
4129  * Σ0 := (a rotr 2) xor (a rotr 13) xor (a rotr 22)
 * Maj := (a and b) xor (a and c) xor (b and c)
 * t2 := Σ0 + Maj
4132  * Σ1 := (e rotr 6) xor (e rotr 11) xor (e rotr 25)
4133  * Ch := (e and f) xor ((not e) and g)
4134  * t1 := h + Σ1 + Ch + k[i] + w[i]
4135  * h := g
4136  * g := f
4137  * f := e
4138  * e := d + t1
4139  * d := c
4140  * c := b
4141  * b := a
4142  * a := t1 + t2
4143  */
4144 void MacroAssembler::sha256_implCompress_iter0(
4145       Register Da, Register Db, Register Dc, Register Dd,
4146       Register De, Register Df, Register Dg, Register Dh,
4147       FloatRegister Dkw, int index,
4148       Register Dtmp,
4149       Register Dnew_a, Register Dnew_e
4150         ) {
4151     assert_different_registers(Da, Db, Dc, Dd, De, Df, Dg, Dh);
4152 
4153     //  Σ0 := (a rotr 2) xor (a rotr 13) xor (a rotr 22)
4154     //  Σ1 := (e rotr 6) xor (e rotr 11) xor (e rotr 25)
4155     andr(Dnew_a, Da, Db);
4156     andr(Dnew_e, Da, Dc);
4157     eor(Dnew_a, Dnew_a, Dnew_e);
4158     andr(Dnew_e, Db, Dc);
4159     eor(Dnew_e, Dnew_a, Dnew_e); //Ma
4160 
4161     mov(Dnew_a, Da, ror(2));
4162     eor(Dnew_a, Dnew_a, Da, ror(13));
4163     eor(Dnew_a, Dnew_a, Da, ror(22)); //Σ0
4164 
4165     add(Dnew_a, Dnew_a, Dnew_e); //t2
4166 
4167     andr(Dnew_e, De, Df);
4168     mvn(Dtmp, De);
4169     andr(Dtmp, Dtmp, Dg);
4170     eor(Dtmp, Dnew_e, Dtmp); //Ch
4171 
4172     mov(Dnew_e, De, ror(6));
4173     eor(Dnew_e, Dnew_e, De, ror(11));
4174     eor(Dnew_e, Dnew_e, De, ror(25)); //Σ1
4175 
4176     add(Dnew_e, Dnew_e, Dtmp);
4177     vmov_32(Dtmp, Dkw, index);
4178     add(Dnew_e, Dnew_e, Dh);
4179 
4180     add(Dtmp, Dnew_e, Dtmp); //t1
4181 
4182     add(Dnew_e, Dtmp, Dd); //new_e
4183     add(Dnew_a, Dtmp, Dnew_a); //new_a
4184 };
4185 /**
4186  * Four iterations of SHA256 algorithm
4187  */
4188 void MacroAssembler::sha256_implCompress_iter(
4189       Register ra, Register rb, Register rc, Register rd,
4190       Register re, Register rf, Register rg, Register rh,
4191       FloatRegister Dkw1, FloatRegister Dkw2,
4192       Register step,
4193       Register tmp,
4194       Register ra2, Register re2
4195         ) {
4196   Label L_4, L_3, L_2, L_1, L_done;
4197   cmp(step, 4);
4198   b(L_4, Assembler::EQ);
4199   cmp(step, 3);
4200   b(L_3, Assembler::EQ);
4201   cmp(step, 2);
4202   b(L_2, Assembler::EQ);
4203   cmp(step, 1);
4204   b(L_1, Assembler::EQ);
4205     sha256_implCompress_iter0(ra,  rb,  rc,  rd,  re,  rf,  rg,  rh,  Dkw1, 0, tmp, ra2, re2);
4206     sha256_implCompress_iter0(ra2, ra,  rb,  rc,  re2, re,  rf,  rg,  Dkw1, 1, tmp, rd,  rh);
4207     sha256_implCompress_iter0(rd,  ra2, ra,  rb,  rh,  re2, re,  rf,  Dkw2, 0, tmp, rc,  rg);
4208     sha256_implCompress_iter0(rc,  rd,  ra2, ra,  rg,  rh,  re2, re,  Dkw2, 1, tmp, rb,  rf);
4209     mov(step, 4);
4210     b(L_done);
4211   BIND(L_1); {
4212     sha256_implCompress_iter0(ra2, ra,  rb,  rc,  re2, re,  rf,  rg,  Dkw1, 0, tmp, rd,  rh);
4213     sha256_implCompress_iter0(rd,  ra2, ra,  rb,  rh,  re2, re,  rf,  Dkw1, 1, tmp, rc,  rg);
4214     sha256_implCompress_iter0(rc,  rd,  ra2, ra,  rg,  rh,  re2, re,  Dkw2, 0, tmp, rb,  rf);
4215     sha256_implCompress_iter0(rb,  rc,  rd,  ra2, rf,  rg,  rh,  re2, Dkw2, 1, tmp, ra,  re);
4216     mov(step, 0);
4217     b(L_done);
4218   }
4219   BIND(L_2); {
4220     sha256_implCompress_iter0(rd,  ra2, ra,  rb,  rh,  re2, re,  rf,  Dkw1, 0, tmp, rc,  rg);
4221     sha256_implCompress_iter0(rc,  rd,  ra2, ra,  rg,  rh,  re2, re,  Dkw1, 1, tmp, rb,  rf);
4222     sha256_implCompress_iter0(rb,  rc,  rd,  ra2, rf,  rg,  rh,  re2, Dkw2, 0, tmp, ra,  re);
4223     sha256_implCompress_iter0(ra,  rb,  rc,  rd,  re,  rf,  rg,  rh,  Dkw2, 1, tmp, ra2, re2);
4224     mov(step, 1);
4225     b(L_done);
4226   }
4227   BIND(L_3); {
4228     sha256_implCompress_iter0(rc,  rd,  ra2, ra,  rg,  rh,  re2, re,  Dkw1, 0, tmp, rb,  rf);
4229     sha256_implCompress_iter0(rb,  rc,  rd,  ra2, rf,  rg,  rh,  re2, Dkw1, 1, tmp, ra,  re);
4230     sha256_implCompress_iter0(ra,  rb,  rc,  rd,  re,  rf,  rg,  rh,  Dkw2, 0, tmp, ra2, re2);
4231     sha256_implCompress_iter0(ra2, ra,  rb,  rc,  re2, re,  rf,  rg,  Dkw2, 1, tmp, rd,  rh);
4232     mov(step, 2);
4233     b(L_done);
4234   }
4235   BIND(L_4); {
4236     sha256_implCompress_iter0(rb,  rc,  rd,  ra2, rf,  rg,  rh,  re2, Dkw1, 0, tmp, ra,  re);
4237     sha256_implCompress_iter0(ra,  rb,  rc,  rd,  re,  rf,  rg,  rh,  Dkw1, 1, tmp, ra2, re2);
4238     sha256_implCompress_iter0(ra2, ra,  rb,  rc,  re2, re,  rf,  rg,  Dkw2, 0, tmp, rd,  rh);
4239     sha256_implCompress_iter0(rd,  ra2, ra,  rb,  rh,  re2, re,  rf,  Dkw2, 1, tmp, rc,  rg);
4240     mov(step, 3);
4241   }
4242   BIND(L_done);
4243 };
4244 
4245   /*
4246    * Calculate Deltas w[i] and w[i+1]
4247    * s0 := (w[i-15] rotr 7) xor (w[i-15] rotr 18) xor (w[i-15] shr 3)
4248    * s1 := (w[i-2] rotr 17) xor (w[i-2] rotr 19) xor (w[i-2] shr 10)
4249    * w[i] := w[i-16] + s0 + w[i-7] + s1
4250    */
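  /*
   * The 32-bit rotates above are emulated on 64-bit NEON lanes: each w value
   * is kept duplicated as {x,x} in a 64-bit lane (see the trailing vdup_64_32
   * calls), so that
   *   {x,x} >> k   yields rotr(x, k) in the low word for k < 32, and
   *   {x,x} >> 35  yields x >> 3   (likewise >> 42 yields x >> 10),
   * which is why the shift counts below are 7, 18, 35 for s0 and 17, 19, 42 for s1.
   */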
4251 void MacroAssembler::sha256_w0(
4252       FloatRegister w_m16, FloatRegister w_m15, FloatRegister w_m14,
4253       FloatRegister w_m7, FloatRegister w_m6,
4254       FloatRegister w_m2,
4255       FloatRegister Qtmp_S0, FloatRegister Qtmp_S1,
4256       FloatRegister Qtmp1){
4257 
4258     vmov_64(Qtmp1, w_m15);
4259     vmov_64(Qtmp1->successor(FloatRegisterImpl::DOUBLE), w_m14);
4260     vshr_128_u64(Qtmp_S0, Qtmp1, 7);
4261     vshr_128_u64(Qtmp_S1, Qtmp1, 18);
4262     veor_128(Qtmp_S0, Qtmp_S0, Qtmp_S1);
4263     vshr_128_u64(Qtmp_S1, Qtmp1, 35);
4264     veor_128(Qtmp_S0, Qtmp_S0, Qtmp_S1);  //S0
4265 
4266     vshr_128_u64(Qtmp_S1, w_m2, 17);
4267     vshr_128_u64(Qtmp1, w_m2, 19);
4268     veor_128(Qtmp_S1, Qtmp_S1, Qtmp1);
4269     vshr_128_u64(Qtmp1, w_m2, 42);
4270     veor_128(Qtmp_S1, Qtmp_S1, Qtmp1);  //S1
4271 
4272     vmov_64(Qtmp1, w_m7);
4273     vmov_64(Qtmp1->successor(FloatRegisterImpl::DOUBLE), w_m6);
4274     vadd_128_32(Qtmp1, Qtmp1, w_m16);
4275     vadd_128_32(Qtmp1, Qtmp1, Qtmp_S0);
4276     vadd_128_32(w_m16, Qtmp1, Qtmp_S1); // w[i/i+1]
4277 
4278     vdup_64_32(w_m16, w_m16, 0);
4279     vdup_64_32(w_m15, w_m15, 0);
4280 }
4281 
4282 /*
4283  * Calculate Deltas w[i] ... w[i+3]
4284  */
4285 void MacroAssembler::sha256_w(FloatRegister w16, FloatRegister w14,
4286         FloatRegister w12, FloatRegister w10, FloatRegister w8,
4287         FloatRegister w6, FloatRegister w4, FloatRegister w2,
4288         FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3,
4289         FloatRegister st_kw, Register counter, Register rtmp) {
4290   FloatRegister w15 = w16->successor(FloatRegisterImpl::DOUBLE);
4291   FloatRegister w13 = w14->successor(FloatRegisterImpl::DOUBLE);
4292   FloatRegister w11 = w12->successor(FloatRegisterImpl::DOUBLE);
4293   FloatRegister w9  = w10->successor(FloatRegisterImpl::DOUBLE);
4294   FloatRegister w7  = w8->successor(FloatRegisterImpl::DOUBLE);
4295   FloatRegister w5  = w6->successor(FloatRegisterImpl::DOUBLE);
4296   FloatRegister w3  = w4->successor(FloatRegisterImpl::DOUBLE);
4297   FloatRegister w1  = w2->successor(FloatRegisterImpl::DOUBLE);
4298 
4299   FloatRegister Dtmp1  = as_FloatRegister(tmp1->encoding());
4300   FloatRegister Dtmp2  = Dtmp1->successor(FloatRegisterImpl::DOUBLE);
4301   Label L_3, L_2, L_1, L_done;
4302 
4303   andr(rtmp, counter, 0x3);
4304   cmp(rtmp, 3);
4305   b(L_3, Assembler::EQ);
4306   cmp(rtmp, 2);
4307   b(L_2, Assembler::EQ);
4308   cmp(rtmp, 1);
4309   b(L_1, Assembler::EQ);
4310     vext_64(Dtmp1, w16, w15, 4);
4311     vext_64(Dtmp2, w14, w13, 4);
4312     vadd_128_32(st_kw, st_kw, tmp1);
4313     cmp(counter, 3);
4314     b(L_done, Assembler::LO);
4315     sha256_w0(w16, w15, w14, w7,  w6,  w2,  tmp1, tmp2, tmp3);
4316     sha256_w0(w14, w13, w12, w5,  w4,  w16, tmp1, tmp2, tmp3);
4317     b(L_done);
4318   BIND(L_3); {
4319     vext_64(Dtmp1, w12, w11, 4);
4320     vext_64(Dtmp2, w10, w9,  4);
4321     vadd_128_32(st_kw, st_kw, tmp1);
4322     cmp(counter, 3);
4323     b(L_done, Assembler::LO);
4324     sha256_w0(w12, w11, w10, w3,  w2,  w14, tmp1, tmp2, tmp3);
4325     sha256_w0(w10, w9,  w8,  w1,  w16, w12, tmp1, tmp2, tmp3);
4326     b(L_done);
4327   }
4328   BIND(L_2); {
4329     vext_64(Dtmp1, w8, w7, 4);
4330     vext_64(Dtmp2, w6, w5, 4);
4331     vadd_128_32(st_kw, st_kw, tmp1);
4332     cmp(counter, 3);
4333     b(L_done, Assembler::LO);
4334     sha256_w0(w8,  w7,  w6,  w15, w14, w10, tmp1, tmp2, tmp3);
4335     sha256_w0(w6,  w5,  w4,  w13, w12, w8,  tmp1, tmp2, tmp3);
4336     b(L_done);
4337   }
4338   BIND(L_1); {
4339     vext_64(Dtmp1, w4, w3, 4);
4340     vext_64(Dtmp2, w2, w1, 4);
4341     vadd_128_32(st_kw, st_kw, tmp1);
4342     cmp(counter, 3);
4343     b(L_done, Assembler::LO);
4344     sha256_w0(w4,  w3,  w2,  w11, w10, w6,  tmp1, tmp2, tmp3);
4345     sha256_w0(w2,  w1,  w16, w9,  w8,  w4,  tmp1, tmp2, tmp3);
4346   }
4347   BIND(L_done);
4348 }
4349 
4350 /**
4351  * SHA256 digest
4352  *
4353  * @param from      register pointing to source array address
4354  * @param state     register pointing to state array address
4355  */
4356 void MacroAssembler::kernel_sha256_implCompress(Register from, Register state,
4357         Register counter, Register table_k,
4358         Register ra, Register rb, Register rc, Register rd, Register re,
4359         Register rf, Register rg, Register rh,
4360         Register ra2, Register re2) {
4361 
4362     Label L_hash_loop, L_hash_loop_done, L_hash_no_w;
4363     lea(table_k, ExternalAddress(StubRoutines::sha256_table_addr()));
4364 
4365     // read next k
4366     vld1_64(d14, d15, Address(post(table_k, 16)), Assembler::ALIGN_128);
4367     // read initial 16 W elements in q8-q11
4368     vld1_64(d16, d17, d18, d19, Address(post(from, 32)), Assembler::ALIGN_STD); // read from
4369     vld1_64(d20, d21, d22, d23, Address(post(from, 32)), Assembler::ALIGN_STD); // read from
    // byte-reverse W (convert the little-endian loads to big-endian message words)
4371     vrev32_128_8(q8,  q8);
4372     vrev32_128_8(q9,  q9);
4373     vrev32_128_8(q10, q10);
4374     vrev32_128_8(q11, q11);
4375 
4376     vadd_128_32(q7, q7, q8); // k + w
4377 
4378     vdup_64_32(d31, d23, 1);  //w1
4379     vdup_64_32(d30, d23, 0);  //w2
4380     vdup_64_32(d29, d22, 1);  //w3
4381     vdup_64_32(d28, d22, 0);  //w4
4382     vdup_64_32(d27, d21, 1);  //w5
4383     vdup_64_32(d26, d21, 0);  //w6
4384     vdup_64_32(d25, d20, 1);  //w7
4385     vdup_64_32(d24, d20, 0);  //w8
4386     vdup_64_32(d23, d19, 1);  //w9
4387     vdup_64_32(d22, d19, 0);  //w10
4388     vdup_64_32(d21, d18, 1);  //w11
4389     vdup_64_32(d20, d18, 0);  //w12
4390     vdup_64_32(d19, d17, 1);  //w13
4391     vdup_64_32(d18, d17, 0);  //w14
4392     vdup_64_32(d17, d16, 1);  //w15
4393     vdup_64_32(d16, d16, 0);  //w16
4394 
4395     mov(counter, 16);
4396     // load state
4397     push(state, sp);
4398     ldr(ra, Address(post(state, 4)));
4399     ldr(rb, Address(post(state, 4)));
4400     ldr(rc, Address(post(state, 4)));
4401     ldr(rd, Address(post(state, 4)));
4402     ldr(re, Address(post(state, 4)));
4403     ldr(rf, Address(post(state, 4)));
4404     ldr(rg, Address(post(state, 4)));
4405     ldr(rh, Address(state));
4406 
4407     const Register tmp = from;
4408     const Register step = state;
4409 
4410     // calculate deltas
4411     sha256_w0(d16, d17, d18, d25,  d26,  d30, q0, q1, q2);
4412     sha256_w0(d18, d19, d20, d27,  d28,  d16, q0, q1, q2);
4413 
4414     mov(step, 0); // use state for internal counter
4415     sub(counter, counter, 1);
4416 
4417     sha256_implCompress_iter(ra, rb, rc, rd, re, rf, rg, rh, d14, d15,
4418         step,
4419         tmp, ra2, re2);
4420 
4421     BIND(L_hash_loop); {
4422       // read next k
4423       vld1_64(d14, d15, Address(post(table_k, 16)), Assembler::ALIGN_128);
4424       //calculate deltas
4425       sha256_w(q8, q9, q10, q11, q12, q13, q14, q15,
4426         q0, q1, q2,
4427         q7,
4428         counter, tmp);
4429 
4430       //calculate state
4431       sha256_implCompress_iter(ra, rb, rc, rd, re, rf, rg, rh, d14, d15,
4432         step,
4433         tmp, ra2, re2);
4434       sub(counter, counter, 1);
4435     } cbnz(counter, L_hash_loop);
4436 
4437     pop(state, sp);
4438 
4439     // load initial state and add to current state
4440     ldr(tmp, Address(post(state, 4)));
4441     add(rb, rb, tmp);
4442     ldr(tmp, Address(post(state, 4)));
4443     add(rc, rc, tmp);
4444     ldr(tmp, Address(post(state, 4)));
4445     add(rd, rd, tmp);
4446     ldr(tmp, Address(post(state, 4)));
4447     add(ra2, ra2, tmp);
4448     ldr(tmp, Address(post(state, 4)));
4449     add(rf, rf, tmp);
4450     ldr(tmp, Address(post(state, 4)));
4451     add(rg, rg, tmp);
4452     ldr(tmp, Address(post(state, 4)));
4453     add(rh, rh, tmp);
4454     ldr(tmp, Address(state));
4455     add(re2, re2, tmp);
4456     sub(state, state, 28);
4457 
4458     // save state
4459     str(rb,  Address(post(state, 4)));
4460     str(rc,  Address(post(state, 4)));
4461     str(rd,  Address(post(state, 4)));
4462     str(ra2, Address(post(state, 4)));
4463     str(rf,  Address(post(state, 4)));
4464     str(rg,  Address(post(state, 4)));
4465     str(rh,  Address(post(state, 4)));
4466     str(re2, Address(post(state, 4)));
4467 }
4468 
4469 /**
4470  * SHA512 Sigma
4471  * Sigma(x) = ROTR(x, sh1) XOR ROTR(x, sh2) XOR ROTR(x, sh3)
4472  */
4473 void MacroAssembler::sha512_sigma(FloatRegister x,
4474         FloatRegister Qtmp, FloatRegister Dsigma, int sh1, int sh2, int sh3) {
4475   FloatRegister Dtmp0 = as_FloatRegister(Qtmp->encoding());
4476   FloatRegister Dtmp1 = Dtmp0->successor(FloatRegisterImpl::DOUBLE);
4477   assert_different_registers(x, Dtmp0, Dtmp1, Dsigma);
4478 
4479   vshr_64_u64(Dtmp0, x, sh1);
4480   vshl_64_64(Dtmp1, x, 64-sh1);
4481   vorr_64(Dsigma, Dtmp0, Dtmp1);
4482 
4483   vshr_64_u64(Dtmp0, x, sh2);
4484   vshl_64_64(Dtmp1, x, 64-sh2);
4485   vorr_64(Dtmp0, Dtmp0, Dtmp1);
4486 
4487   veor_64(Dsigma, Dsigma, Dtmp0);
4488 
4489   vshr_64_u64(Dtmp0, x, sh3);
4490   vshl_64_64(Dtmp1, x, 64-sh3);
4491   vorr_64(Dtmp0, Dtmp0, Dtmp1);
4492 
4493   veor_64(Dsigma, Dsigma, Dtmp0);
4494 }
4495 
4496 /**
4497  * SHA512 Delta
4498  * Delta(x) = ROTR(x, sh1) XOR ROTR(x, sh2) XOR SHR(x, sh3)
4499  */
4500 void MacroAssembler::sha512_delta(FloatRegister x,
4501         FloatRegister Qtmp, FloatRegister Ddelta, int sh1, int sh2, int sh3) {
4502   FloatRegister Dtmp0 = as_FloatRegister(Qtmp->encoding());
4503   FloatRegister Dtmp1 = Dtmp0->successor(FloatRegisterImpl::DOUBLE);
4504   assert_different_registers(x, Dtmp0, Dtmp1, Ddelta);
4505 
4506   vshr_64_u64(Dtmp0, x, sh1);
4507   vshl_64_64(Dtmp1, x, 64-sh1);
4508   vorr_64(Ddelta, Dtmp0, Dtmp1);
4509 
4510   vshr_64_u64(Dtmp0, x, sh2);
4511   vshl_64_64(Dtmp1, x, 64-sh2);
4512   vorr_64(Dtmp0, Dtmp0, Dtmp1);
4513 
4514   veor_64(Ddelta, Ddelta, Dtmp0);
4515 
4516   vshr_64_u64(Dtmp0, x, sh3);
4517 
4518   veor_64(Ddelta, Ddelta, Dtmp0);
4519 }
4520 
4521 /**
4522  * SHA512 Ch
4523  * Ch(x, y, z) = (x AND y) XOR ( NOT x AND z)
4524  */
4525 void MacroAssembler::sha512_ch(FloatRegister x, FloatRegister y, FloatRegister z,
4526         FloatRegister Dtmp, FloatRegister Dch) {
4527   assert_different_registers(x, Dtmp, Dch);
4528 
4529   vmvn_64(Dtmp, x);
4530   vand_64(Dtmp, Dtmp, z);
4531 
4532   vand_64(Dch, x, y);
4533   veor_64(Dch, Dtmp, Dch);
4534 }
4535 
4536 /**
4537  * SHA512 Maj
4538  * Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z)
4539  */
4540 void MacroAssembler::sha512_maj(FloatRegister x, FloatRegister y, FloatRegister z,
4541         FloatRegister Dtmp, FloatRegister Dmaj) {
4542   assert_different_registers(x, Dtmp, Dmaj);
4543 
4544   vand_64(Dmaj, x, y);
4545   vand_64(Dtmp, x, z);
4546   veor_64(Dmaj, Dmaj, Dtmp);
4547   vand_64(Dtmp, y, z);
4548   veor_64(Dmaj, Dmaj, Dtmp);
4549 }
4550 
4551 /**
4552  * SHA512 digest
4553  *
4554  * @param from      register pointing to source array address
4555  * @param state     register pointing to state array address
4556  */
4557 void MacroAssembler::kernel_sha512_implCompress(Register from, Register state,
4558         Register counter, Register table_k) {
4559   Label L_hash_loop, L_hash_no_w;
4560   FloatRegister st_a = d18;  //q9-q12
4561   FloatRegister st_b = st_a->successor(FloatRegisterImpl::DOUBLE);
4562   FloatRegister st_c = st_b->successor(FloatRegisterImpl::DOUBLE);
4563   FloatRegister st_d = st_c->successor(FloatRegisterImpl::DOUBLE);
4564   FloatRegister st_e = st_d->successor(FloatRegisterImpl::DOUBLE);
4565   FloatRegister st_f = st_e->successor(FloatRegisterImpl::DOUBLE);
4566   FloatRegister st_g = st_f->successor(FloatRegisterImpl::DOUBLE);
4567   FloatRegister st_h = st_g->successor(FloatRegisterImpl::DOUBLE);
4568 
4569   FloatRegister w16 = d0;  //q0-q7
4570   FloatRegister w15 = w16->successor(FloatRegisterImpl::DOUBLE);
4571   FloatRegister w14 = w15->successor(FloatRegisterImpl::DOUBLE);
4572   FloatRegister w13 = w14->successor(FloatRegisterImpl::DOUBLE);
4573   FloatRegister w12 = w13->successor(FloatRegisterImpl::DOUBLE);
4574   FloatRegister w11 = w12->successor(FloatRegisterImpl::DOUBLE);
4575   FloatRegister w10 = w11->successor(FloatRegisterImpl::DOUBLE);
4576   FloatRegister w9  = w10->successor(FloatRegisterImpl::DOUBLE);
4577   FloatRegister w8  = w9->successor(FloatRegisterImpl::DOUBLE);
4578   FloatRegister w7  = w8->successor(FloatRegisterImpl::DOUBLE);
4579   FloatRegister w6  = w7->successor(FloatRegisterImpl::DOUBLE);
4580   FloatRegister w5  = w6->successor(FloatRegisterImpl::DOUBLE);
4581   FloatRegister w4  = w5->successor(FloatRegisterImpl::DOUBLE);
4582   FloatRegister w3  = w4->successor(FloatRegisterImpl::DOUBLE);
4583   FloatRegister w2  = w3->successor(FloatRegisterImpl::DOUBLE);
4584   FloatRegister w1  = w2->successor(FloatRegisterImpl::DOUBLE);
4585 
4586   FloatRegister t1  = d26;
4587   FloatRegister t2  = d27;
4588   FloatRegister new_a = st_h;
4589   FloatRegister new_e = st_d;
4590   FloatRegister new_new_a = st_g;
4591   FloatRegister new_new_e = st_c;
4592 
4593   FloatRegister w0  = w1->successor(FloatRegisterImpl::DOUBLE);
4594   assert_different_registers(st_a,st_b,st_c,st_d,st_e,st_f,st_g,st_h);
4595   assert_different_registers(w0,w1,w2,w3,w4,w5,w6,w7);
4596   assert_different_registers(w8,w9,w10,w11,w12,w13,w14,w15,w16);
4597 
4598   lea(table_k, ExternalAddress(StubRoutines::sha512_table_addr()));
4599 
4600   // read initial 16 W elements
4601   vld1_64(w16,  w15,  w14,  w13,  Address(post(from, 32)), Assembler::ALIGN_STD);
4602   vld1_64(w12,  w11,  w10,  w9,   Address(post(from, 32)), Assembler::ALIGN_STD);
4603   vld1_64(w8,   w7,   w6,   w5,   Address(post(from, 32)), Assembler::ALIGN_STD);
4604   vld1_64(w4,   w3,   w2,   w1,   Address(from),           Assembler::ALIGN_STD);
4605   // read initial state to a,b,c,d,e,f,g,h
4606   vld1_64(st_a, st_b, st_c, st_d, Address(post(state, 32)), Assembler::ALIGN_STD);
4607   vld1_64(st_e, st_f, st_g, st_h, Address(state),           Assembler::ALIGN_STD);
4608   sub(state, state, 32);
4609 
  // reverse byte order within each 64-bit word of W (input is big-endian)
4611   vrev64_128_8(w16, w16);
4612   vrev64_128_8(w14, w14);
4613   vrev64_128_8(w12, w12);
4614   vrev64_128_8(w10, w10);
4615   vrev64_128_8(w8,  w8);
4616   vrev64_128_8(w6,  w6);
4617   vrev64_128_8(w4,  w4);
  vrev64_128_8(w2,  w2);

4621   mov(counter, 40);
4622   BIND(L_hash_loop); {
4623     sub(counter, counter, 1);
4624     // first iteration
4625     // calculate T1
4626     // read K
4627     vld1_64(d30, Address(post(table_k, 8)), Assembler::ALIGN_64);
4628     vadd_64_64(d31, st_h, w16);
4629     sha512_ch(st_e, st_f, st_g, t2, t1);
4630     sha512_sigma(st_e, q14, t2, 14, 18, 41);
4631     vadd_128_64(q13, q13, q15);
4632     vadd_64_64(t1, t1, t2);
4633 
4634     // calculate T2
4635     sha512_maj(st_a, st_b, st_c, d30, d31);
4636     sha512_sigma(st_a, q14, t2, 28, 34, 39);
4637     vadd_64_64(t2, t2, d31);
4638 
4639     vadd_64_64(new_a, t1, t2);
4640     vadd_64_64(new_e, st_d,  t1);
4641 
4642     // second iteration
4643     // calculate T1
4644     // read K
4645     vld1_64(d30, Address(post(table_k, 8)), Assembler::ALIGN_64);
4646     vadd_64_64(d31, st_g, w15);
4647     sha512_ch(new_e, st_e, st_f, t2, t1);
4648     sha512_sigma(new_e, q14, t2, 14, 18, 41);
4649     vadd_128_64(q13, q13, q15);
4650     vadd_64_64(t1, t1, t2);
4651 
4652     // calculate T2
4653     sha512_maj(new_a, st_a, st_b, d30, d31);
4654     sha512_sigma(new_a, q14, t2, 28, 34, 39);
4655     vadd_64_64(t2, t2, d31);
4656 
4657     vadd_64_64(new_new_a, t1, t2);
4658     vadd_64_64(new_new_e, st_c,  t1);
4659 
4660     // restore a,b,c,d,e,f,g,h sequence
4661     vswp_128(st_g, st_a);
4662     vswp_128(st_g, st_c);
4663     vswp_128(st_g, st_e);
4664 
4665     cmp(counter, 8);
4666     b(L_hash_no_w, Assembler::LO);
4667 
4668     // calculate W[+1], W[+2]
4669     sha512_delta(w15, q14, t1, 1, 8, 7);
4670     sha512_delta(w2,  q14, d30, 19, 61, 6);
4671     sha512_delta(w14, q14, t2, 1, 8, 7);
4672     sha512_delta(w1,  q14, d31, 19, 61, 6);
4673 
4674     vadd_128_64(w16, w16, t1);
4675     vadd_128_64(w16, w16, q15);
4676     vadd_64_64(w16, w16, w7);
4677     vadd_64_64(w15, w15, w6);
4678 
4679     BIND(L_hash_no_w);
4680 
4681     vswp_128(w16, w14);
4682     vswp_128(w14, w12);
4683     vswp_128(w12, w10);
4684     vswp_128(w10, w8);
4685     vswp_128(w8,  w6);
4686     vswp_128(w6,  w4);
4687     vswp_128(w4,  w2);
4688   } cbnz(counter, L_hash_loop);
4689   // read initial state to w16 - w9
4690   vld1_64(w16, w15, w14, w13, Address(post(state, 32)), Assembler::ALIGN_STD);
4691   vld1_64(w12, w11, w10, w9,  Address(state),           Assembler::ALIGN_STD);
4692   sub(state, state, 32);
4693 
4694   // update state
4695   vadd_128_64(st_a, st_a, w16);
4696   vadd_128_64(st_c, st_c, w14);
4697   vadd_128_64(st_e, st_e, w12);
4698   vadd_128_64(st_g, st_g, w10);
4699 
4700   // store state
4701   vst1_64(st_a, st_b, st_c, st_d, Address(post(state, 32)), Assembler::ALIGN_STD);
4702   vst1_64(st_e, st_f, st_g, st_h, Address(state),           Assembler::ALIGN_STD);
4703 }
4704 
void MacroAssembler::bfc_impl(Register Rd, int lsb, int width, Condition cond) {
  if (width > 15 && lsb == 0) {
    // Field at the bottom of the word: shift it out and back
    lsr(Rd, Rd, width, cond);
    lsl(Rd, Rd, width, cond);
  } else if (width > 15 && lsb + width == 32) {
    // Field at the top of the word: shift it out and back
    lsl(Rd, Rd, 32 - lsb, cond);
    lsr(Rd, Rd, 32 - lsb, cond);
  } else {
    // General case: clear the field with a sequence of BICs. ARM
    // data-processing immediates are an 8-bit value rotated by an even
    // amount, so the first chunk is trimmed to make every subsequent
    // lsb even; after that, 8-bit chunks are always encodable.
    const int lsb1 = (lsb & 1);
    int w1 = width <= 8 - lsb1 ? width : 8 - lsb1;
    while (width) {
      bic(Rd, Rd, ((1 << w1) - 1) << lsb, cond);
      width -= w1;
      lsb += w1;
      w1 = width > 8 ? 8 : width;
    }
  }
}
4723 
// get_thread can be called anywhere inside generated code, so we need
// to save whatever non-callee-save context might get clobbered by the
// call to the C thread-local lookup, or indeed by the call setup code
// itself. x86 appears to save the C argument registers.
4728 
4729 void MacroAssembler::get_thread(Register dst) {
  // Resolve the current Thread* via Thread::current(); on Linux this
  // amounts to a C thread-local lookup, e.g.
  //   void* pthread_getspecific(pthread_key_t key);
4732 
4733   // Save all call-clobbered regs except dst, plus rscratch1 and rscratch2.
4734   RegSet saved_regs = RegSet::range(r0, r3) + rscratch1 + rscratch2 + lr - dst;
4735   push(saved_regs, sp);
4736 
  // Align the stack to the 8 bytes the AAPCS requires at call sites,
  // saving the original sp so it can be restored after the call
4738   mov(c_rarg1, sp);
4739   sub(sp, sp, wordSize);
4740   bic(sp, sp, 7);
4741   str(c_rarg1, Address(sp));
4742 
4743   mov(rscratch2, CAST_FROM_FN_PTR(address, Thread::current));
4744 
4745   bl(rscratch2);
  // undo the alignment: reload the sp saved above
4747   ldr(sp, Address(sp));
4748 
4749   if (dst != c_rarg0) {
4750     mov(dst, c_rarg0);
4751   }
4752 
4753   // restore pushed registers
4754   pop(saved_regs, sp);
4755 }
4756 
4757 #ifdef COMPILER2
4758 // 24-bit word range == 26-bit byte range
4759 bool check26(int offset) {
  // this could be simplified, but it mimics encoding and decoding
  // an actual branch instruction
4762   int off1 = offset << 6 >> 8;
4763   int encoded = off1 & ((1<<24)-1);
4764   int decoded = encoded << 8 >> 6;
4765   return offset == decoded;
4766 }
4767 
4768 // Perform some slight adjustments so the default 32MB code cache
4769 // is fully reachable.
4770 static inline address first_cache_address() {
4771   return CodeCache::low_bound() + sizeof(HeapBlock::Header);
4772 }
4773 static inline address last_cache_address() {
4774   return CodeCache::high_bound() - NativeInstruction::arm_insn_sz;
4775 }
4776 
4777 // Can we reach target using unconditional branch or call from anywhere
4778 // in the code cache (because code can be relocated)?
4779 bool MacroAssembler::_reachable_from_cache(address target) {
4780 #ifdef __thumb__
4781   if ((1 & (intptr_t)target) != 0) {
    // Return false to avoid a plain 'b', which cannot switch to THUMB mode.
4783     return false;
4784   }
4785 #endif
4786 
4787   address cl = first_cache_address();
4788   address ch = last_cache_address();
4789 
4790   if (ForceUnreachable) {
4791     // Only addresses from CodeCache can be treated as reachable.
4792     if (target < CodeCache::low_bound() || CodeCache::high_bound() <= target) {
4793       return false;
4794     }
4795   }
4796 
4797   intptr_t loffset = (intptr_t)target - (intptr_t)cl;
4798   intptr_t hoffset = (intptr_t)target - (intptr_t)ch;
4799 
4800   return check26(loffset - 8) && check26(hoffset - 8);
4801 }
4802 
4803 bool MacroAssembler::_cache_fully_reachable() {
4804   address cl = first_cache_address();
4805   address ch = last_cache_address();
4806   return _reachable_from_cache(cl) && _reachable_from_cache(ch);
4807 }
4808 
4809 bool MacroAssembler::reachable_from_cache(address target) {
4810   assert(CodeCache::contains(pc()), "not supported");
4811   return _reachable_from_cache(target);
4812 }
4813 
4814 bool MacroAssembler::cache_fully_reachable() {
4815   return _cache_fully_reachable();
4816 }
4817 
4818 // IMPORTANT: does not generate mt-safe patchable code
4819 void MacroAssembler::call(address target, RelocationHolder rspec, Condition cond) {
4820   Register scratch = lr;
4821   assert(rspec.type() == relocInfo::runtime_call_type || rspec.type() == relocInfo::none, "not supported");
4822   if (reachable_from_cache(target)) {
4823     relocate(rspec);
4824     bl(target, cond);
4825     return;
4826   }
4827 
4828   mov(scratch, (intptr_t)target, cond);
4829   bl(scratch, cond);
4830 }
4831 
4832 // IMPORTANT: does not generate mt-safe patchable code. C2 only uses this method
4833 // for calls into runtime which do not need mt-safe patching
4834 void MacroAssembler::jump(address target, relocInfo::relocType rtype, Register scratch, Condition cond) {
4835   assert((rtype == relocInfo::runtime_call_type) || (rtype == relocInfo::none), "not supported");
4836   if (reachable_from_cache(target)) {
4837     relocate(rtype);
4838     b(target, cond);
4839     return;
4840   }
4841 
4842   mov(scratch, (intptr_t)target, cond);
4843   b(scratch, cond);
4844 }
4845 
4846 void MacroAssembler::arm_stack_overflow_check(int frame_size_in_bytes, Register tmp) {
4847   // Version of AbstractAssembler::generate_stack_overflow_check optimized for ARM
4848   if (UseStackBanging) {
4849     const int page_size = os::vm_page_size();
4850 
    // Bang the page StackShadowPages below sp, then continue banging
    // downward in 0xff0-byte steps until the whole new frame has been
    // touched. 0xff0 fits strb's immediate offset range and is smaller
    // than a page, so no page in the range is skipped.
    sub(tmp, sp, StackShadowPages*page_size);
    strb(r0, Address(tmp));
4853     for (; frame_size_in_bytes >= page_size; frame_size_in_bytes -= 0xff0) {
4854       strb(r0, pre(tmp, -0xff0));
4855     }
4856   }
4857 }
4858 
// Flatten the result of a floating-point compare down to an integer in
// dst: -1 for less than or unordered, 0 for equal, +1 for greater (the
// fcmpl convention). vmrs copies the FPSCR flags (N,Z,C,V in bits
// 31..28) into dst; the shifts and xor below fold them to -1/0/+1.
void MacroAssembler::floating_cmp(Register dst) {
  vmrs(dst);
  orr(dst, dst, 0x08000000);
  eor(dst, dst, dst, lsl(3));
  mov(dst, dst, asr(30));
}
4865 
4866 void MacroAssembler::fast_lock(Register Roop, Register Rbox, Register Rmark, Register Rscratch, Register Rscratch2) {
4867   assert(Roop != Rscratch, "");
4868   assert(Roop != Rmark, "");
4869   assert(Rbox != Rscratch, "");
4870   assert(Rbox != Rmark, "");
4871 
4872   Label fast_lock, done;
4873 
4874   if (UseBiasedLocking && !UseOptoBiasInlining) {
4875     Label failed;
4876     biased_locking_enter(Roop, Rmark, Rscratch, Rscratch2, false, done, &failed);
4877     bind(failed);
4878   }
4879 
4880   ldr(Rmark, Address(Roop, oopDesc::mark_offset_in_bytes()));
4881   tst(Rmark, markOopDesc::unlocked_value);
4882   b(fast_lock, Assembler::NE);
4883 
4884   // Check for recursive lock
4885   // See comments in InterpreterMacroAssembler::lock_object for
4886   // explanations on the fast recursive locking check.
4887   // -1- test low 2 bits
4888   movs(Rscratch, Rmark, lsl(30));
4889   // -2- test (hdr - SP) if the low two bits are 0
4890   sub(Rscratch, Rmark, sp, Assembler::EQ);
4891   movs(Rscratch, Rscratch, lsr(exact_log2(os::vm_page_size())), Assembler::EQ);
4892   // If still 'eq' then recursive locking OK
  // set to zero if recursive lock, set to non-zero otherwise (see discussion in JDK-8153107)
4894   str(Rscratch, Address(Rbox, BasicLock::displaced_header_offset_in_bytes()));
4895   b(done);
4896 
4897   bind(fast_lock);
4898   str(Rmark, Address(Rbox, BasicLock::displaced_header_offset_in_bytes()));
4899 
  // Acquire the lock with a one-shot LDREX/STREX compare-and-swap: swing
  // the mark word from the unlocked value in Rmark to a pointer to our
  // BasicLock in Rbox.
  membar(StoreStore);
  ldrex(Rscratch, Address(Roop, oopDesc::mark_offset_in_bytes()));
  cmp(Rscratch, Rmark);
  // Only attempt the store if the mark is still the value loaded above
  strex(Rscratch, Rbox, Address(Roop, oopDesc::mark_offset_in_bytes()), Assembler::EQ);
  // Flags are EQ iff the store succeeded, i.e. the lock was acquired
  cmp(Rscratch, 0, Assembler::EQ);
  membar(AnyAny);
4906 
4907   bind(done);
4908 }
4909 
4910 void MacroAssembler::fast_unlock(Register Roop, Register Rbox, Register Rscratch, Register Rscratch2) {
4911   Register Rmark      = Rscratch2;
4912 
4913   assert(Roop != Rscratch, "");
4914   assert(Roop != Rmark, "");
4915   assert(Rbox != Rscratch, "");
4916   assert(Rbox != Rmark, "");
4917 
4918   Label done;
4919 
4920   if (UseBiasedLocking && !UseOptoBiasInlining) {
4921     biased_locking_exit(Roop, Rscratch, done);
4922   }
4923 
4924   ldr(Rmark, Address(Rbox, BasicLock::displaced_header_offset_in_bytes()));
4925   // If hdr is NULL, we've got recursive locking and there's nothing more to do
4926   cmp(Rmark, 0);
4927   b(done, Assembler::EQ);
4928 
  // Restore the object header: CAS the mark word from the pointer to our
  // BasicLock in Rbox back to the displaced header saved in Rmark.
  membar(AnyAny);
  ldrex(Rscratch, Address(Roop, oopDesc::mark_offset_in_bytes()));
  cmp(Rscratch, Rbox);
  strex(Rscratch, Rmark, Address(Roop, oopDesc::mark_offset_in_bytes()), Assembler::EQ);
  // Flags are EQ iff the store succeeded, i.e. the lock was released
  cmp(Rscratch, 0, Assembler::EQ);
4935 
4936   membar(StoreLoad);
4937 
4938   bind(done);
4939 }
4940 
4941 #endif