/*
 * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * Copyright (c) 2015, Linaro Ltd. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "interpreter/interpreter.hpp"

#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_aarch32.hpp"
// This ifdef was introduced so a core build can be built
#ifdef COMPILER2
#include "opto/compile.hpp"
#include "opto/node.hpp"
#endif

#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/sharedRuntime.hpp"

#if INCLUDE_ALL_GCS
#include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
#include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
#include "gc_implementation/g1/heapRegion.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
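
// For example, BIND(L_done) both binds the label and, in non-product builds,
// emits "L_done:" into the disassembly:
//
//   Label L_done;
//   ...
//   BIND(L_done);   // == bind(L_done); BLOCK_COMMENT("L_done:");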

// FIXME: this is not a nice fix; this constant was in a compiler2 header
#define MAX_stubs_size_div2 (128 / 2)
// FIXME END

// Note the PC corrections in the following three routines.
// All literal addressing modes that use the PC need to have the offset adjusted.
// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
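//
// For example (a sketch): a B instruction at address 0x1000 whose target is
// 0x2000 must encode offset = 0x2000 - (0x1000 + 8) = 0xff8, because on ARM
// the PC reads as the current instruction's address plus 8. That is why
// pd_patch_instruction_size below computes
//
//   long offset = target - (branch + 8);
//
// and then scales and splits the offset to suit the instruction class.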

int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  // Note the PC corrections
  int instructions = 1;
  long offset = target - (branch + 8); // correct for the PC reading as this instruction + 2 instructions
  bool add = offset >= 0;
  unsigned insn = *(unsigned*)branch;
  int opc = Instruction_aarch32::extract(insn, 27, 24);

  if (0b1010 == opc || 0b1011 == opc) {
    // Branch or branch with link
    assert(0 == (offset & 3), "not aligned correctly");
    Instruction_aarch32::spatch(branch, 23, 0, offset / 4);
  } else if (0b0011 == opc) {
    // movw/movt or mov, orr, orr, orr
    // patch up an address load into a register (absolute address).
    instructions = patch_oop(branch, target) / NativeInstruction::arm_insn_sz;
  } else if (0b010 == (opc >> 1)) {
    // LDR, LDRB, STR, STRB
    Instruction_aarch32::patch(branch, 11, 0, uabs(offset));
    Instruction_aarch32::patch(branch, 23, 23, add);
  } else if (0b000 == (opc >> 1)) {
    // LDRH, LDRSH, LDRSB, LDRD, STRH, STRD
    offset = uabs(offset);
    Instruction_aarch32::patch(branch, 3, 0, offset & 0xf);
    Instruction_aarch32::patch(branch, 11, 8, offset >> 4);
    Instruction_aarch32::patch(branch, 23, 23, add);
  } else if (0b1101 == opc) {
    // VLDR, VSTR - NOTE VSTR(lit) is deprecated
    offset = uabs(offset);
    assert(0 == (offset & 3), "vldr, vstr can't do unaligned access");
    Instruction_aarch32::patch(branch, 7, 0, offset >> 2);
    Instruction_aarch32::patch(branch, 23, 23, add);
  } else if (0b0010 == opc) {
    // ADR
    Instruction_aarch32::patch(branch, 11, 0, encode_imm12(uabs(offset)));
    Instruction_aarch32::patch(branch, 23, 22, add ? 0b10 : 0b01);
  } else {
    ShouldNotReachHere();
  }
  // aarch64 had something for polling page load?
  return instructions * NativeInstruction::arm_insn_sz;
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  unsigned insn = *(unsigned*)insn_addr;
  int opc = Instruction_aarch32::extract(insn, 27, 21);
  if (0b0011000 == opc) {
    // 32-bit pointer, formed of a movw and a movt
    assert(nativeInstruction_at(insn_addr+4)->is_movt(), "wrong insns in patch");

    uint32_t btm = (uint32_t)o & 0xffff;
    Instruction_aarch32::patch(insn_addr, 19, 16, btm >> 12);
    Instruction_aarch32::patch(insn_addr, 11, 0, btm & 0xfff);
    uint32_t top = (uint32_t)o >> 16;
    Instruction_aarch32::patch(insn_addr + 4, 19, 16, top >> 12);
    Instruction_aarch32::patch(insn_addr + 4, 11, 0, top & 0xfff);
    return 2 * NativeInstruction::arm_insn_sz;
  } else if (0b0011101 == opc) {
    // Alternatively, the 32-bit load sequence uses mov, orr, orr, orr
    assert(nativeInstruction_at(insn_addr+4 )->is_orr(), "wrong insns in patch");
    assert(nativeInstruction_at(insn_addr+8 )->is_orr(), "wrong insns in patch");
    assert(nativeInstruction_at(insn_addr+12)->is_orr(), "wrong insns in patch");
    // FIXME this could carry us outside valid memory

    uint32_t addr = (uint32_t)o;
    Instruction_aarch32::patch(insn_addr + 0,  11, 0, (0b0000 << 8) | ((addr >>  0) & 0xff));
    Instruction_aarch32::patch(insn_addr + 4,  11, 0, (0b1100 << 8) | ((addr >>  8) & 0xff));
    Instruction_aarch32::patch(insn_addr + 8,  11, 0, (0b1000 << 8) | ((addr >> 16) & 0xff));
    Instruction_aarch32::patch(insn_addr + 12, 11, 0, (0b0100 << 8) | ((addr >> 24) & 0xff));
    return 4 * NativeInstruction::arm_insn_sz;
  } else {
    ShouldNotReachHere();
  }
  return 0; // won't reach here
}
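
// The imm12 values patched in the mov/orr sequence above use the ARM
// rotated-immediate encoding: a 4-bit rotation field R and an 8-bit value V,
// denoting V rotated right by 2*R bits. For example (a sketch), for the
// address 0x12345678 the sequence is patched to:
//
//   mov r?, #0x78              // R=0b0000: 0x78 ror  0 -> bits  7:0
//   orr r?, r?, #0x5600        // R=0b1100: 0x56 ror 24 -> bits 15:8
//   orr r?, r?, #0x340000      // R=0b1000: 0x34 ror 16 -> bits 23:16
//   orr r?, r?, #0x12000000    // R=0b0100: 0x12 ror  8 -> bits 31:24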
 148 
 149 address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
 150   long offset = 0;
 151   int opc = Instruction_aarch32::extract(insn, 27, 24);
 152 
 153   if(0b1010 == opc || 0b1011 == opc) {
 154     // Branch or branch with link
 155     offset = Instruction_aarch32::sextract(insn, 23, 0) * 4;
 156   } else if (0b0011 == opc) {
 157     unsigned *insn_buf = (unsigned*)insn_addr;
 158     int opc2 = Instruction_aarch32::extract(insn, 23, 21);
 159     if(0b000 == opc2) {
 160       // movw, movt (only on newer ARMs)
 161       assert(nativeInstruction_at(&insn_buf[1])->is_movt(), "wrong insns in patch");
 162       u_int32_t addr;
 163       addr  = Instruction_aarch32::extract(insn_buf[1], 19, 16) << 28;
 164       addr |= Instruction_aarch32::extract(insn_buf[1], 11, 0) << 16;
 165       addr |= Instruction_aarch32::extract(insn_buf[0], 19, 16) << 12;
 166       addr |= Instruction_aarch32::extract(insn_buf[0], 11, 0);
 167       return address(addr);
 168     } else if(0b101 == opc2) {
 169       // mov, orr, orr, orr
 170       assert(nativeInstruction_at(&insn_buf[1])->is_orr(), "wrong insns in patch");
 171       assert(nativeInstruction_at(&insn_buf[2])->is_orr(), "wrong insns in patch");
 172       assert(nativeInstruction_at(&insn_buf[3])->is_orr(), "wrong insns in patch");
 173       u_int32_t addr;
 174       addr  = Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[0], 11, 0));
 175       addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[1], 11, 0));
 176       addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[2], 11, 0));
 177       addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[3], 11, 0));
 178       return address(addr);
 179     } else {
 180       ShouldNotReachHere();
 181     }
 182   } else if (0b010 == (opc >> 1)) {
 183     // LDR, LDRB, STR, STRB
 184     offset = Instruction_aarch32::extract(insn, 11, 0);
 185     bool add = Instruction_aarch32::extract(insn, 23, 23);
 186     offset = add ? offset : -offset;
 187   } else if (0b000 == (opc >> 1)) {
 188     // LDRH, LDRSH, LDRSB, LDRD, STRH, STRD
 189     offset = Instruction_aarch32::extract(insn, 3, 0);
 190     offset |= Instruction_aarch32::extract(insn, 11, 8) << 4;
 191     bool add = Instruction_aarch32::extract(insn, 23, 23);
 192     offset = add ? offset : -offset;
 193   } else if (0b1101 == opc) {
 194     // VLDR, VSTR - NOTE VSTR(lit) is deprecated
 195     offset = Instruction_aarch32::extract(insn, 7, 0) << 2;
 196     bool add = Instruction_aarch32::extract(insn, 23, 23);
 197     offset = add ? offset : -offset;
 198   } else if (0b0010 == opc) {
 199     // ADR
 200     offset = decode_imm12(Instruction_aarch32::extract(insn, 11, 0));
 201     int code = Instruction_aarch32::extract(insn, 23, 22);
 202     switch(code) {
 203       case 0b01: offset = -offset; break;
 204       case 0b10:                   break;
 205       default: ShouldNotReachHere();
 206     }
 207   } else {
 208     ShouldNotReachHere();
 209   }
 210   //Correct offset for PC
 211   offset -= 8;
 212   return address(((u_int32_t)insn_addr + offset));
 213 }
 214 
 215 
 216 void MacroAssembler::serialize_memory(Register thread, Register tmp) {
 217   dsb(Assembler::SY);
 218 }
 219 
 220 
 221 void MacroAssembler::reset_last_Java_frame(bool clear_fp,
 222                                            bool clear_pc) {
 223   mov(rscratch1, 0);
 224   // we must set sp to zero to clear frame
 225   str(rscratch1, Address(rthread, JavaThread::last_Java_sp_offset()));
 226   // must clear fp, so that compiled frames are not confused; it is
 227   // possible that we need it only for debugging
 228   if (clear_fp) {
 229     str(rscratch1, Address(rthread, JavaThread::last_Java_fp_offset()));
 230   }
 231 
 232   if (clear_pc) {
 233     str(rscratch1, Address(rthread, JavaThread::last_Java_pc_offset()));
 234   }
 235 }
 236 
 237 // Calls to C land
 238 //
 239 // When entering C land, the rfp & sp of the last Java frame have to be recorded
 240 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp
 241 // has to be reset to 0. This is required to allow proper stack traversal.
 242 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 243                                          Register last_java_fp,
 244                                          Register last_java_pc,
 245                                          Register scratch) {
 246 
 247   if (last_java_pc->is_valid()) {
 248       str(last_java_pc, Address(rthread,
 249                                 JavaThread::frame_anchor_offset()
 250                                 + JavaFrameAnchor::last_Java_pc_offset()));
 251     }
 252 
 253   // determine last_java_sp register
 254   if (last_java_sp == sp) {
 255     mov(scratch, sp);
 256     last_java_sp = scratch;
 257   } else if (!last_java_sp->is_valid()) {
 258     last_java_sp = sp;
 259   }
 260 
 261   str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));
 262 
 263   // last_java_fp is optional
 264   if (last_java_fp->is_valid()) {
 265     str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
 266   }
 267 }
 268 
 269 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 270                                          Register last_java_fp,
 271                                          address  last_java_pc,
 272                                          Register scratch) {
 273   if (last_java_pc != NULL) {
 274     adr(scratch, last_java_pc);
 275   } else {
 276     // FIXME: This is almost never correct.  We should delete all
 277     // cases of set_last_Java_frame with last_java_pc=NULL and use the
 278     // correct return address instead.
 279     adr(scratch, pc());
 280   }
 281 
 282   str(scratch, Address(rthread,
 283                        JavaThread::frame_anchor_offset()
 284                        + JavaFrameAnchor::last_Java_pc_offset()));
 285 
 286   set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
 287 }
 288 
 289 void MacroAssembler::set_last_Java_frame(Register last_java_sp,
 290                                          Register last_java_fp,
 291                                          Label &L,
 292                                          Register scratch) {
 293   if (L.is_bound()) {
 294     set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
 295   } else {
 296     InstructionMark im(this);
 297     L.add_patch_at(code(), locator());
 298     set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
 299   }
 300 }
 301 
 302 void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
 303   assert(CodeCache::find_blob(entry.target()) != NULL,
 304          "destination of far call not found in code cache");
 305   // TODO performance issue: if intented to patch later,
 306   // generate mov rX, imm; bl rX far call (to reserve space)
 307   if (entry.rspec().type() != relocInfo::none || far_branches()) {
 308     lea(tmp, entry);
 309     if (cbuf) cbuf->set_insts_mark();
 310     bl(tmp);
 311   } else {
 312     if (cbuf) cbuf->set_insts_mark();
 313     bl(entry);
 314   }
 315 }
 316 
 317 void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
 318   assert(CodeCache::find_blob(entry.target()) != NULL,
 319          "destination of far call not found in code cache");
 320   assert(!external_word_Relocation::is_reloc_index((intptr_t)entry.target()), "can't far jump to reloc index)");
 321   // TODO performance issue: if intented to patch later,
 322   // generate mov rX, imm; bl rX far call (to reserve space)
 323   if (entry.rspec().type() != relocInfo::none || far_branches()) {
 324     lea(tmp, entry);
 325     if (cbuf) cbuf->set_insts_mark();
 326     b(tmp);
 327   } else {
 328     if (cbuf) cbuf->set_insts_mark();
 329     b(entry);
 330   }
 331 }
 332 
 333 int MacroAssembler::biased_locking_enter(Register lock_reg,
 334                                          Register obj_reg,
 335                                          Register swap_reg,
 336                                          Register tmp_reg,
 337                                          bool swap_reg_contains_mark,
 338                                          Label& done,
 339                                          Label* slow_case,
 340                                          BiasedLockingCounters* counters) {
 341   assert(UseBiasedLocking, "why call this otherwise?");
 342   assert_different_registers(lock_reg, obj_reg, swap_reg);
 343 
 344   if (PrintBiasedLockingStatistics && counters == NULL)
 345     counters = BiasedLocking::counters();
 346 
 347   bool need_tmp_reg = false;
 348   if (tmp_reg == noreg) {
 349     tmp_reg = rscratch2;
 350   }
 351   assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1);
 352   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
 353   Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
 354   Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
 355   Address saved_mark_addr(lock_reg, 0);
 356 
 357   // Biased locking
 358   // See whether the lock is currently biased toward our thread and
 359   // whether the epoch is still valid
 360   // Note that the runtime guarantees sufficient alignment of JavaThread
 361   // pointers to allow age to be placed into low bits
 362   // First check to see whether biasing is even enabled for this object
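  //
  // As a reminder, a sketch of the 32-bit mark word layout assumed here
  // (see markOop.hpp):
  //
  //   unlocked object:  [ hash:25 | age:4 | biased_lock:1 | lock:2 ]
  //   biased object:    [ JavaThread*:23 | epoch:2 | age:4 | 1 | 01 ]
  //
  // where biased_lock_pattern is the low three bits "1 01".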
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, markOopDesc::biased_lock_pattern);
  b(cas_label, Assembler::NE);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
//  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  bic(tmp_reg, tmp_reg, markOopDesc::age_mask_in_place);
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_inc(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_inc(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_inc(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_inc(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for the biased locking unlock case, which is a no-op.
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, markOopDesc::biased_lock_pattern);
  b(done, Assembler::EQ);
}


static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = sp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");

  assert(java_thread != oop_result,   "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch2);


  // FIXME - can we save lr in a more elegant way?
  //str(lr, pre(sp, -wordSize));

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  //ldr(lr, post(sp, wordSize));

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true, true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch2, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch2, ok);
    lea(rscratch2, RuntimeAddress(StubRoutines::forward_exception_entry()));
    bl(rscratch2);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small,
// trampolines won't be emitted.

void MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  // FIXME: this block
  bool compile_in_scratch_emit_size = false;
#ifdef COMPILER2
  compile_in_scratch_emit_size = Compile::current()->in_scratch_emit_size();
#endif

  if (cbuf) cbuf->set_insts_mark();
  relocate(entry.rspec());

  // Build the trampoline so that the destination address is a raw 4-byte
  // value; that way it can be patched atomically.
  add(lr, r15_pc, NativeCall::instruction_size - 2 * NativeInstruction::arm_insn_sz);
  ldr(r15_pc, Address(r15_pc, 4)); // Address does correction for offset from pc base
  emit_int32((uintptr_t) entry.target());
  // possibly pad the call to the NativeCall size to make patching happy
  for (int i = NativeCall::instruction_size; i > 3 * NativeInstruction::arm_insn_sz; i -= NativeInstruction::arm_insn_sz)
    nop();
}
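
// Schematically, the sequence emitted by trampoline_call is (a sketch; the
// exact offsets depend on NativeCall::instruction_size):
//
//   add lr, pc, #<ret_off>   // return address points past the literal
//   ldr pc, [pc, #<lit_off>] // jump via the inline literal
//   .word <entry.target()>   // raw 4-byte destination, patchable atomically
//   nop ...                  // padding up to NativeCall::instruction_size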

void MacroAssembler::ic_call(address entry) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc());
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  assert(oop_result != rscratch2, "can't be");
  mov(rscratch2, 0);
  str(rscratch2, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  assert(metadata_result != rscratch2 &&
         java_thread != rscratch2, "can't be");
  mov(rscratch2, 0);
  str(rscratch2, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}
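
// For example, align(8) emits nops until offset() is a multiple of 8 bytes.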

// These are no-ops, overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface) {
  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size() * wordSize;
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldr(scan_temp, Address(recv_klass, InstanceKlass::vtable_length_offset() * wordSize));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, lsl(2)));
  add(scan_temp, scan_temp, vtable_base);
  if (HeapWordsPerLong > 1) {
    // Round up to align_object_offset boundary
    // see code for instanceKlass::start_of_itable!
    round_to(scan_temp, BytesPerLong);
  }

  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
  // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
  lea(recv_klass, itable_index.is_register() ?
          Address(recv_klass, itable_index, lsl(2)) :
          Address(recv_klass, itable_index.as_constant() << 2));
  if (itentry_off)
    add(recv_klass, recv_klass, itentry_off);

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      b(found_method, Assembler::EQ);
    } else {
      b(search, Assembler::NE);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  ldr(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
  ldr(method_result, Address(recv_klass, scan_temp));
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = InstanceKlass::vtable_start_offset() * wordSize;
  //assert(vtableEntry::size() * wordSize == 8,
  //       "adjust the scaling in the code below");
  // FIXME: check what scaling needs changing, as indexes address by one word
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result, Address(recv_klass, vtable_offset_in_bytes));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  b(*L_success, Assembler::EQ);

  // Check the supertype display:
  if (must_load_sco) {
    ldr(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr); // load displayed supertype
  cmp(super_klass, rscratch1);

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    b(*L_success, Assembler::EQ);
    cmp(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      b(*L_slow_path, Assembler::EQ);
    } else {
      b(*L_failure, Assembler::NE);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      b(*L_success, Assembler::EQ);
    } else {
      b(*L_slow_path, Assembler::NE);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      b(*L_success, Assembler::EQ);
    } else {
      b(*L_failure, Assembler::NE);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// Scans count pointer-sized words at [addr] for an occurrence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label loop, fail, found;
  cmp(count, 0);
  b(fail, EQ);

  bind(loop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  b(found, EQ);
  subs(count, count, 1);
  b(loop, NE);

  bind(fail);
  cmp(sp, 0); // sp never zero
  bind(found);
}
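
// On exit from repne_scan the flags hold the result: EQ if value was found,
// NE otherwise -- the cmp(sp, 0) on the fail path guarantees NE, since sp
// is never zero. Callers (e.g. check_klass_subtype_slow_path below) branch
// on these flags rather than on a register result.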

// Form an address from base + offset in Rd.  Rd may or may
// not actually be used: you must use the Address that is returned.
// It is up to you to ensure that the shift provided matches the size
// of your data.
Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
  // A form_address result should only be used together with ldr/str instructions;
  // otherwise provide the exact type instead of IDT_INT, or apply safe_for().
  if (Address::offset_ok_for_immed(byte_offset, Address::IDT_INT))
    // It fits; no need for any heroics
    return Address(base, byte_offset);

  // See if we can do this with two 12-bit offsets
  {
    unsigned long masked_offset = byte_offset & ~0xfff;
    if (Address::offset_ok_for_immed(byte_offset - masked_offset, Address::IDT_INT)
        && Assembler::operand_valid_for_add_sub_immediate(masked_offset)) {
      add(Rd, base, masked_offset);
      byte_offset -= masked_offset;
      return Address(Rd, byte_offset);
    }
  }

  // Do it the hard way
  mov(Rd, byte_offset);
  add(Rd, base, Rd);
  return Address(Rd);
}
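
// For example (a sketch): a byte_offset of 0x12345 does not fit in a 12-bit
// offset, but splits as masked_offset = 0x12000 (a valid add/sub rotated
// immediate) plus a residual 0x345 that does fit, yielding
//
//   add Rd, base, #0x12000
//   -> Address(Rd, 0x345)
//
// Only when neither test passes do we fall back to mov + add + Address(Rd).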

// Scans count 4-byte words at [addr] for an occurrence of value,
// generic
/*void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  b(Lexit, EQ);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}*/

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  // Get super_klass value into r0 (even if it was in r14 or r2).
  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))   pushed_registers += r2;
  if (!IS_A_TEMP(r14))  pushed_registers += r14;

  if (super_klass != r0) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

#ifndef PRODUCT
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r14, secondary_supers_addr);
  // Load the array length.
  ldr(r2, Address(r14, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r14, r14, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, 0); // Clear Z flag; SP is never zero
  // Scan R2 words at [R14] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r14, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  b(*L_failure, Assembler::NE);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  stmdb(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());

  mov(r0, reg);
  mov(rscratch1, (address)b);
  mrs(r1);

  // call indirectly to solve generation ordering problem
  reg_printf("Verify oop entry, sp = %p, rfp = %p\n", sp, rfp);
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  bl(rscratch2);
  reg_printf("Verify oop exit,  sp = %p, rfp = %p\n", sp, rfp);

  msr(r1);
  ldmia(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());

  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stmdb(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());
  mrs(r1);

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    lea(r0, addr);
    ldr(r0, Address(r0, 5 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  bl(rscratch2);

  msr(r1);
  ldmia(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());

  BLOCK_COMMENT("} verify_oop_addr");
}

Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  if (arg_slot.is_constant()) {
    return Address(sp, arg_slot.as_constant() * stackElementSize
                   + offset);
  } else {
    add(rscratch1, sp, arg_slot.as_register(),
        lsl(exact_log2(stackElementSize)));
    return Address(rscratch1, offset);
  }
}

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments,
                                       Label *retaddr) {
  Label E, L;

  // FIXME: do this alignment in a more elegant way
  mov(rscratch2, sp);
  sub(sp, sp, wordSize);
  bic(sp, sp, 2 * wordSize - 1); // Align to eight bytes
  str(rscratch2, Address(sp));

  // FIXME: do we need to preserve rscratch2?
  //str(rscratch2, Address(pre(sp, -wordSize)));

  mov(rscratch2, entry_point);
  reg_printf("\tJust about to call into the VM, rfp = %p\n", rfp);
  bl(rscratch2);
  if (retaddr)
    bind(*retaddr);
  reg_printf("\tReturned from call into the VM, rfp = %p\n", rfp);

  //ldr(rscratch2, Address(post(sp, wordSize)));

  // Undo the alignment
  ldr(sp, Address(sp));

  maybe_isb();
}
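
// The stack alignment in call_VM_leaf_base, schematically:
//
//   mov rscratch2, sp   // remember the original sp
//   sub sp, sp, #4      // make room for the saved sp
//   bic sp, sp, #7      // align sp down to 8 bytes (AAPCS)
//   str rscratch2, [sp] // park the original sp at the new top
//   ...call...
//   ldr sp, [sp]        // a single load undoes the whole adjustment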
1249 
1250 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
1251   call_VM_leaf_base(entry_point, number_of_arguments);
1252 }
1253 
1254 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
1255   pass_arg0(this, arg_0);
1256   call_VM_leaf_base(entry_point, 1);
1257 }
1258 
1259 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1260   pass_arg0(this, arg_0);
1261   pass_arg1(this, arg_1);
1262   call_VM_leaf_base(entry_point, 2);
1263 }
1264 
1265 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
1266                                   Register arg_1, Register arg_2) {
1267   pass_arg0(this, arg_0);
1268   pass_arg1(this, arg_1);
1269   pass_arg2(this, arg_2);
1270   call_VM_leaf_base(entry_point, 3);
1271 }
1272 
1273 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
1274   pass_arg0(this, arg_0);
1275   MacroAssembler::call_VM_leaf_base(entry_point, 1);
1276 }
1277 
1278 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
1279 
1280   assert(arg_0 != c_rarg1, "smashed arg");
1281   pass_arg1(this, arg_1);
1282   pass_arg0(this, arg_0);
1283   MacroAssembler::call_VM_leaf_base(entry_point, 2);
1284 }
1285 
1286 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
1287   assert(arg_0 != c_rarg2, "smashed arg");
1288   assert(arg_1 != c_rarg2, "smashed arg");
1289   pass_arg2(this, arg_2);
1290   assert(arg_0 != c_rarg1, "smashed arg");
1291   pass_arg1(this, arg_1);
1292   pass_arg0(this, arg_0);
1293   MacroAssembler::call_VM_leaf_base(entry_point, 3);
1294 }
1295 
1296 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
1297   assert(arg_0 != c_rarg3, "smashed arg");
1298   assert(arg_1 != c_rarg3, "smashed arg");
1299   assert(arg_2 != c_rarg3, "smashed arg");
1300   pass_arg3(this, arg_3);
1301   assert(arg_0 != c_rarg2, "smashed arg");
1302   assert(arg_1 != c_rarg2, "smashed arg");
1303   pass_arg2(this, arg_2);
1304   assert(arg_0 != c_rarg1, "smashed arg");
1305   pass_arg1(this, arg_1);
1306   pass_arg0(this, arg_0);
1307   MacroAssembler::call_VM_leaf_base(entry_point, 4);
1308 }
1309 
1310 // Clobbers rscratch1
1311 void MacroAssembler::null_check(Register reg, int offset) {
1312   if (needs_explicit_null_check(offset)) {
1313     // provoke OS NULL exception if reg = NULL by
1314     // accessing M[reg] w/o changing any registers
1315     // NOTE: this is plenty to provoke a segv
1316     reg_printf("Generating OS check null with ptr = %p\n", reg);
1317     assert(reg != rscratch1, "can't be");
1318     ldr(rscratch1, Address(reg));
1319   } else {
1320     // nothing to do, (later) access of M[reg + offset]
1321     // will provoke OS NULL exception if reg = NULL
1322   }
1323 }
1324 
1325 // MacroAssembler protected routines needed to implement
1326 // public methods
1327 
1328 void MacroAssembler::mov(Register r, Address dest, Condition cond) {
1329   code_section()->relocate(pc(), dest.rspec());
1330   uint32_t imm32 = (uint32_t)dest.target();
1331   movptr(r, imm32, cond);
1332 }
1333 
// Move a constant pointer into r.  On aarch32 the address space
// is 32 bits in size, so a pointer can be encoded in at most two
// mov instructions.
1337 void MacroAssembler::movptr(Register r, uintptr_t imm32, Condition cond) {
1338 #ifndef PRODUCT
1339   {
1340     char buffer[64];
    snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1342     block_comment(buffer);
1343   }
1344 #endif
1345   Assembler::mov_immediate32(r, imm32, cond, false);
1346 }
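// A note on the expansion (not guaranteed; the exact sequence is up to
// mov_immediate32 and the target architecture): on ARMv7 a 32-bit immediate
// is typically materialised as a movw/movt pair, while pre-v7 targets need a
// longer mov/orr sequence.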
1347 
1348 void MacroAssembler::ret(Register reg) {
1349   assert(reg == lr, "Can do return only to LR");
1350   mov(r15_pc, lr);
1351 }
1352 
1353 void MacroAssembler::atomic_inc(Register counter_addr, Register tmp) {
1354   Label retry_load;
1355   bind(retry_load);
1356   // flush and load exclusive from the memory location
1357   ldrex(tmp, counter_addr);
1358   add(tmp, tmp, 1);
  // if we store+flush with no intervening write, tmp will be zero
1360   strex(tmp, tmp, counter_addr);
1361   cmp(tmp, 0);
1362   b(retry_load, Assembler::NE);
1363 }
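// Illustrative C-style model of atomic_inc above, assuming ACLE-style
// __ldrex/__strex intrinsics (a sketch, not part of the build):
//   uint32_t v;
//   do {
//     v = __ldrex(counter_addr) + 1;           // load-exclusive, bump
//   } while (__strex(v, counter_addr) != 0);   // 0 means the store succeeded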
1364 
1365 
1366 // MacroAssembler routines found actually to be needed
1367 
1368 void MacroAssembler::push(Register src)
1369 {
1370   str(src, Address(pre(sp, -1 * wordSize)));
1371 }
1372 
1373 void MacroAssembler::pop(Register dst)
1374 {
1375   ldr(dst, Address(post(sp, 1 * wordSize)));
1376 }
1377 
1378 // Note: load_unsigned_short used to be called load_unsigned_word.
1379 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1380   int off = offset();
1381   ldrh(dst, src);
1382   return off;
1383 }
1384 
1385 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1386   int off = offset();
1387   ldrb(dst, src);
1388   return off;
1389 }
1390 
1391 int MacroAssembler::load_signed_short(Register dst, Address src) {
1392   int off = offset();
1393   ldrsh(dst, src);
1394   return off;
1395 }
1396 
1397 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1398   int off = offset();
1399   ldrsb(dst, src);
1400   return off;
1401 }
1402 
1403 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1404   switch (size_in_bytes) {
1405   //case  8:  ldr(dst, src); break;
1406   case  4:  ldr(dst, src); break;
1407   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1408   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1409   default:  ShouldNotReachHere();
1410   }
1411 }
1412 
1413 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1414   switch (size_in_bytes) {
1415   //case  8:  str(src, dst); break;
1416   case  4:  str(src, dst); break;
1417   case  2:  strh(src, dst); break;
1418   case  1:  strb(src, dst); break;
1419   default:  ShouldNotReachHere();
1420   }
1421 }
1422 
1423 void MacroAssembler::decrement(Register reg, int value) {
1424   if (value < 0) {
1425     increment(reg, -value);
1426     return;
1427   }
1428   if (value == 0) {
1429     return;
1430   }
1431   if (operand_valid_for_add_sub_immediate(value)) {
1432     sub(reg, reg, value);
1433     return;
1434   }
1435   assert(reg != rscratch2, "invalid register for decrement");
1436   mov(rscratch2, (unsigned int) value);
1437   sub(reg, reg, rscratch2);
1438 }
1439 
1440 void MacroAssembler::decrement(Address dst, int value) {
1441   assert(!dst.uses(rscratch1), "invalid address for decrement");
1442   ldr(rscratch1, dst);
1443   decrement(rscratch1, value);
1444   str(rscratch1, dst);
1445 }
1446 
1447 void MacroAssembler::increment(Register reg, int value) {
1448   if (value < 0) {
1449     decrement(reg, -value);
1450     return;
1451   }
1452   if (value == 0) {
1453     return;
1454   }
1455   if (operand_valid_for_add_sub_immediate(value)) {
1456     add(reg, reg, value);
1457     return;
1458   }
1459   assert(reg != rscratch2, "invalid register for increment");
1460   mov(rscratch2, (unsigned int) value);
1461   add(reg, reg, rscratch2);
1462 }
1463 
1464 void MacroAssembler::increment(Address dst, int value) {
1465   assert(!dst.uses(rscratch1), "invalid address for increment");
1466   ldr(rscratch1, dst);
1467   increment(rscratch1, value);
1468   str(rscratch1, dst);
1469 }
1470 
1471 // Loads and stores everything except the pc and sp
1472 void MacroAssembler::pusha() {
1473   unsigned regset = 0b0101111111111111;
1474   stmdb(sp, regset);
1475 }
1476 void MacroAssembler::popa() {
1477   unsigned regset = 0b0101111111111111;
1478   ldmia(sp, regset);
1479 }
1480 
1481 static void multiple_reg_check(unsigned int bitset, Register stack) {
1482   const unsigned int pcbit = 1 << r15_pc->encoding();
1483   const unsigned int lrbit = 1 << lr->encoding();
1484   const unsigned int spbit = 1 << sp->encoding();
1485   const unsigned int stackbit = 1 << stack->encoding();
  assert(!(bitset & spbit), "The SP must not be in the list. "
      "ARM deprecates using these instructions with SP in the list.");
1488   assert(!(bitset & pcbit) || !(bitset & lrbit),
1489       "ARM deprecates using these instructions with both "
1490       "the LR and the PC in the list.");
1491   assert(!(bitset & stackbit), "Instructions with the base register "
1492       "in the list and ! specified are only available before ARMv7, "
1493       "and ARM deprecates the use of such instructions. "
1494       "The value of the base register after such an instruction is UNKNOWN");
1495 }
1496 
1497 // Push lots of registers in the bit set supplied.  Don't push sp.
1498 // Return the number of words pushed
1499 int MacroAssembler::push(unsigned int bitset, Register stack) {
1500   multiple_reg_check(bitset, stack);
1501   unsigned bc = bitset, count = 0, i;
1502   for(i = 0; i <= 15; i++) {
1503     if (1 & bc) count++;
1504     bc >>= 1;
1505   }
1506   // TODO Also why did it only do even quantities before?
1507   stmdb(stack, bitset);
1508   return count;
1509 }
1510 
1511 int MacroAssembler::pop(unsigned int bitset, Register stack) {
1512   multiple_reg_check(bitset, stack);
1513   unsigned bc = bitset, count = 0, i;
1514   for(i = 0; i <= 15; i++) {
1515     if (1 & bc) count++;
1516     bc >>= 1;
1517   }
1518   // TODO Also why did it only do even quantities before?
1519   ldmia(stack, bitset);
1520   return count;
1521 }
1522 
1523 void MacroAssembler::stop(const char* msg) {
1524   pusha();
1525   // Save old sp value
1526   add(rscratch2, sp, 14 * wordSize);
1527   str(rscratch2, Address(pre(sp, -4)));
1528   mov(c_rarg0, (address)msg);
1529   mov(c_rarg1, r15_pc);
1530   sub(c_rarg1, c_rarg1, 8); // Restore to actual value
1531   mov(c_rarg2, sp);
1532   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug32));
1533   bl(c_rarg3);
1534   hlt(0);
1535 }
1536 
1537 // this simulates the behaviour of the x86 cmpxchg instruction using a
1538 // load linked/store conditional pair. we use the acquire/release
1539 // versions of these instructions so that we flush pending writes as
1540 // per Java semantics.
1541 
1542 // n.b the x86 version assumes the old value to be compared against is
1543 // in rax and updates rax with the value located in memory if the
1544 // cmpxchg fails. we supply a register for the old value explicitly
1545 
1546 // the aarch32 load linked/store conditional instructions do not
1547 // accept an offset. so, unlike x86, we must provide a plain register
1548 // to identify the memory word to be compared/exchanged rather than a
1549 // register+offset Address.
1550 
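// Illustrative C-style model of the two cmpxchg routines below, assuming
// ACLE-style __ldrex/__strex intrinsics (a sketch, not part of the build):
//   for (;;) {
//     tmp = __ldrex(addr);
//     if (tmp != oldv) { membar(); oldv = tmp; goto fail; } // report found value
//     if (__strex(newv, addr) == 0) goto succeed;           // 0 means stored
//     // the store lost the exclusive monitor: reload and retry
//   }
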
1551 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
1552                                 Label &succeed, Label *fail) {
1553   // oldv holds comparison value
1554   // newv holds value to write in exchange
1555   // addr identifies memory word to compare against/update
1556   // tmp returns 0/1 for success/failure
1557   Label retry_load, nope;
1558 
1559   bind(retry_load);
1560   // flush and load exclusive from the memory location
1561   // and fail if it is not what we expect
1562   ldrex(tmp, addr);
1563   cmp(tmp, oldv);
1564   b(nope, Assembler::NE);
  // if we store+flush with no intervening write, tmp will be zero
1566   strex(tmp, newv, addr);
1567   cmp(tmp, 0);
1568   b(succeed, Assembler::EQ);
  // Retry on a failed store so that we only ever exit after a load
  // fails to compare; this ensures we never report a stale value
  // after a failed write.
1571   b(retry_load);
1572   // if the memory word differs we return it in oldv and signal a fail
1573   bind(nope);
1574   membar(AnyAny);
1575   mov(oldv, tmp);
1576   if (fail)
1577     b(*fail);
1578 }
1579 
1580 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
1581                                 Label &succeed, Label *fail) {
1582   // oldv holds comparison value
1583   // newv holds value to write in exchange
1584   // addr identifies memory word to compare against/update
1585   // tmp returns 0/1 for success/failure
1586   Label retry_load, nope;
1587 
1588   bind(retry_load);
1589   // flush and load exclusive from the memory location
1590   // and fail if it is not what we expect
1591   ldrex(tmp, addr);
1592   cmp(tmp, oldv);
1593   b(nope, Assembler::NE);
  // if we store+flush with no intervening write, tmp will be zero
1595   strex(tmp, newv, addr);
1596   cmp(tmp, 0);
1597   b(succeed, Assembler::EQ);
  // Retry on a failed store so that we only ever exit after a load
  // fails to compare; this ensures we never report a stale value
  // after a failed write.
1600   b(retry_load);
1601   // if the memory word differs we return it in oldv and signal a fail
1602   bind(nope);
1603   membar(AnyAny);
1604   mov(oldv, tmp);
1605   if (fail)
1606     b(*fail);
1607 }
1608 
1609 void MacroAssembler::incr_allocated_bytes(Register thread,
1610                                           Register var_size_in_bytes,
1611                                           int con_size_in_bytes,
1612                                           Register t1) {
1613   if (!thread->is_valid()) {
1614     thread = rthread;
1615   }
1616   assert(t1->is_valid(), "need temp reg");
1617 
1618   ldr(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
1619   if (var_size_in_bytes->is_valid()) {
1620     add(t1, t1, var_size_in_bytes);
1621   } else {
1622     add(t1, t1, con_size_in_bytes);
1623   }
1624   str(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
1625 }
1626 
1627 #ifndef PRODUCT
1628 extern "C" void findpc(intptr_t x);
1629 #endif
1630 
1631 void MacroAssembler::debug32(char* msg, int32_t pc, int32_t regs[])
1632 {
1633   print_unseen_bytecodes();
  // In order to get locks to work, we need to fake an in_VM state
1635   if (ShowMessageBoxOnError) {
1636     JavaThread* thread = JavaThread::current();
1637     JavaThreadState saved_state = thread->thread_state();
1638     thread->set_thread_state(_thread_in_vm);
1639 #ifndef PRODUCT
1640     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
1641       ttyLocker ttyl;
1642       BytecodeCounter::print();
1643     }
1644 #endif
1645     if (os::message_box(msg, "Execution stopped, print registers?")) {
1646       ttyLocker ttyl;
      tty->print_cr(" pc = 0x%08x", pc);
#ifndef PRODUCT
      tty->cr();
      findpc(pc);
      tty->cr();
#endif
      tty->print_cr("WARNING: the register mapping below is unverified");
      tty->print_cr(" r0 = 0x%08x", regs[0]);
      tty->print_cr(" r1 = 0x%08x", regs[1]);
      tty->print_cr(" r2 = 0x%08x", regs[2]);
      tty->print_cr(" r3 = 0x%08x", regs[3]);
      tty->print_cr(" r4 = 0x%08x", regs[4]);
      tty->print_cr(" r5 = 0x%08x", regs[5]);
      tty->print_cr(" r6 = 0x%08x", regs[6]);
      tty->print_cr(" r7 = 0x%08x", regs[7]);
      tty->print_cr(" r8 = 0x%08x", regs[8]);
      tty->print_cr(" r9 = 0x%08x", regs[9]);
      tty->print_cr("r10 = 0x%08x", regs[10]);
      tty->print_cr("r11 = 0x%08x", regs[11]);
      tty->print_cr("r12 = 0x%08x", regs[12]);
      tty->print_cr("r13 = 0x%08x", regs[13]);
      tty->print_cr("r14 = 0x%08x", regs[14]);
      tty->print_cr("r15 = 0x%08x", regs[15]);
1670       BREAKPOINT;
1671     }
1672     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
1673   } else {
1674     {
1675     ttyLocker ttyl;
1676     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================", msg);
1677     ::tty->print_cr(" r0 [   arg0    ] = 0x%08x", regs[1]);
1678     ::tty->print_cr(" r1 [   arg1    ] = 0x%08x", regs[2]);
1679     ::tty->print_cr(" r2 [   arg2    ] = 0x%08x", regs[3]);
1680     ::tty->print_cr(" r3 [   arg3    ] = 0x%08x", regs[4]);
1681     ::tty->print_cr(" r4 [ rdispatch ] = 0x%08x", regs[5]);
1682     ::tty->print_cr(" r5 [   rbcp    ] = 0x%08x", regs[6]);
1683     ::tty->print_cr(" r6 [  rlocals  ] = 0x%08x", regs[7]);
1684     ::tty->print_cr(" r7 [  rcpool   ] = 0x%08x", regs[8]);
1685     ::tty->print_cr(" r8 [  rthread  ] = 0x%08x", regs[9]);
1686     ::tty->print_cr(" r9 [ rscratch1 ] = 0x%08x", regs[10]);
1687     ::tty->print_cr("r10 [  rmethod  ] = 0x%08x", regs[11]);
1688     ::tty->print_cr("r11 [    rfp    ] = 0x%08x", regs[12]);
1689     ::tty->print_cr("r12 [ rscratch2 ] = 0x%08x", regs[13]);
1690     ::tty->print_cr("r13 [    sp     ] = 0x%08x", regs[0]);
1691     ::tty->print_cr("r14 [    lr     ] = 0x%08x", regs[14]);
1692     ::tty->print_cr("r15 [    pc     ] = 0x%08x", pc);
1693     }
1694     assert(false, err_msg("DEBUG MESSAGE: %s", msg));
1695   }
1696 }
1697 
1698 void MacroAssembler::push_CPU_state() {
  // ensure the sp is decremented by a multiple of StackAlignmentInBytes
1700   sub(sp, sp, 4);
  // if you fix this, also update RegisterSaved::save_live_registers and its map
1702   push(0x1fff, sp); // integer registers except lr & sp & (aarch32 pc)
1703 
1704   int nfloat = 16;
1705   vstmdb_f64(sp, (1 << nfloat) - 1);
1706 }
1707 
1708 void MacroAssembler::pop_CPU_state() {
1709   int nfloat = 16;
1710   vldmia_f64(sp, (1 << nfloat) - 1);
1711   pop(0x1fff, sp); // integer registers except lr & sp & (aarch32 pc)
1712   add(sp, sp, 4);
1713 }
1714 
// Rounds reg up to the next multiple of modulus (modulus must be a power of two)
1716 void MacroAssembler::round_to(Register reg, int modulus) {
1717   // from x86
1718   add(reg, reg, modulus - 1);
1719   bic(reg, reg, modulus - 1); // and( reg, -modulus)
1720 }
1721 
1722 SkipIfEqual::SkipIfEqual(
1723     MacroAssembler* masm, const bool* flag_addr, bool value) {
1724   _masm = masm;
1725   _masm->mov(rscratch1, ExternalAddress((address)flag_addr));
1726   _masm->ldrb(rscratch1, rscratch1);
1727   _masm->cmp(rscratch1, 0);
1728   _masm->b(_label, value ? Assembler::NE : Assembler::EQ);
1729 }
1730 
1731 SkipIfEqual::~SkipIfEqual() {
1732   _masm->bind(_label);
1733 }
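// Typical use of SkipIfEqual (a hypothetical sketch): code emitted between
// construction and destruction is skipped at runtime when *flag_addr == value,
// e.g.
//   { SkipIfEqual skip(masm, &DTraceMethodProbes, false);
//     ... code emitted only for the DTraceMethodProbes == true case ...
//   } // the destructor binds the skip label here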
1734 
1735 void MacroAssembler::cmpptr(Register src1, Address src2) {
1736   mov(rscratch1, src2);
1737   ldr(rscratch1, Address(rscratch1));
1738   cmp(src1, rscratch1);
1739 }
1740 
1741 void MacroAssembler::store_check(Register obj) {
1742   // Does a store check for the oop in register obj. The content of
1743   // register obj is destroyed afterwards.
1744   store_check_part_1(obj);
1745   store_check_part_2(obj);
1746 }
1747 
1748 void MacroAssembler::store_check(Register obj, Address dst) {
1749   store_check(obj);
1750 }
1751 
1752 
1753 // split the store check operation so that other instructions can be scheduled inbetween
1754 void MacroAssembler::store_check_part_1(Register obj) {
1755   BarrierSet* bs = Universe::heap()->barrier_set();
1756   assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
1757   lsr(obj, obj, CardTableModRefBS::card_shift);
1758 }
1759 
1760 void MacroAssembler::store_check_part_2(Register obj) {
1761   BarrierSet* bs = Universe::heap()->barrier_set();
1762   assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
1763   CardTableModRefBS* ct = (CardTableModRefBS*)bs;
1764   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
1765 
1766   // The calculation for byte_map_base is as follows:
1767   // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
1768   // So this essentially converts an address to a displacement and
1769   // it will never need to be relocated.
1770 
1771   // FIXME: It's not likely that disp will fit into an offset so we
1772   // don't bother to check, but it could save an instruction.
1773   intptr_t disp = (intptr_t) ct->byte_map_base;
1774   mov(rscratch1, disp);
1775   mov(rscratch2, 0);
1776   strb(rscratch2, Address(obj, rscratch1));
1777 }
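// Net effect of the two parts, in C terms (dirty_card_val() is 0, as asserted
// in g1_write_barrier_post below):
//   ct->byte_map_base[(uintptr_t)obj >> card_shift] = 0;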
1778 
1779 void MacroAssembler::load_klass(Register dst, Register src) {
1780   ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
1781 }
1782 
1783 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
1784   ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
1785   cmp(trial_klass, tmp);
1786 }
1787 
1788 void MacroAssembler::load_prototype_header(Register dst, Register src) {
1789   load_klass(dst, src);
1790   ldr(dst, Address(dst, Klass::prototype_header_offset()));
1791 }
1792 
1793 void MacroAssembler::store_klass(Register dst, Register src) {
1794   str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
1795 }
1796 
1797 void MacroAssembler::store_klass_gap(Register dst, Register src) { }
1798 
1799 void MacroAssembler::load_heap_oop(Register dst, Address src)
1800 {
1801   ldr(dst, src);
1802 }
1803 
1804 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src)
1805 {
1806   ldr(dst, src);
1807 }
1808 
1809 void MacroAssembler::store_heap_oop(Address dst, Register src) {
1810   str(src, dst);
1811 }
1812 
1813 // Used for storing NULLs.
1814 void MacroAssembler::store_heap_oop_null(Address dst) {
1815   mov(rscratch1, 0);
1816   str(rscratch1, dst);
1817 }
1818 
1819 
1820 
1821 
1822 
1823 
1824 
1825 
1826 
1827 
1828 
1829 
1830 
1831 
1832 
1833 
1834 
1835 
1836 
1837 
1838 
1839 
1840 
1841 
1842 
1843 
1844 
1845 
1846 
1847 
1848 
1849 
1850 
1851 #if INCLUDE_ALL_GCS
1852 void MacroAssembler::g1_write_barrier_pre(Register obj,
1853                                           Register pre_val,
1854                                           Register thread,
1855                                           Register tmp,
1856                                           bool tosca_live,
1857                                           bool expand_call) {
1858   // If expand_call is true then we expand the call_VM_leaf macro
1859   // directly to skip generating the check by
1860   // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
1861 
1862   assert(thread == rthread, "must be");
1863 
1864   Label done;
1865   Label runtime;
1866 
1867   assert(pre_val != noreg, "check this code");
1868 
1869   if (obj != noreg)
1870     assert_different_registers(obj, pre_val, tmp);
1871 
1872   Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
1873                                        PtrQueue::byte_offset_of_active()));
1874   Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
1875                                        PtrQueue::byte_offset_of_index()));
1876   Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
1877                                        PtrQueue::byte_offset_of_buf()));
1878 
1879 
1880   // Is marking active?
1881   if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
1882     ldr(tmp, in_progress);
1883   } else {
1884     assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
1885     ldrb(tmp, in_progress);
1886   }
1887   cmp(tmp, 0);
1888   b(done, Assembler::EQ);
1889 
1890   // Do we need to load the previous value?
1891   if (obj != noreg) {
1892     load_heap_oop(pre_val, Address(obj, 0));
1893   }
1894 
1895   // Is the previous value null?
1896   cbz(pre_val, done);
1897 
1898   // Can we store original value in the thread's buffer?
1899   // Is index == 0?
1900   // (The index field is typed as size_t.)
1901 
1902   ldr(tmp, index);                      // tmp := *index_adr
1903   cbz(tmp, runtime);                    // tmp == 0?
1904                                         // If yes, goto runtime
1905 
1906   sub(tmp, tmp, wordSize);              // tmp := tmp - wordSize
1907   str(tmp, index);                      // *index_adr := tmp
1908   ldr(rscratch1, buffer);
1909   add(tmp, tmp, rscratch1);             // tmp := tmp + *buffer_adr
1910 
1911   // Record the previous value
1912   str(pre_val, Address(tmp, 0));
1913   b(done);
1914 
1915   bind(runtime);
1916   // save the live input values
1917   push(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
1918 
  // Calling the runtime using the regular call_VM_leaf mechanism generates
  // code (generated by InterpreterMacroAssembler::call_VM_leaf_base)
  // that checks that the *(rfp+frame::interpreter_frame_last_sp) == NULL.
  //
  // If we are generating the pre-barrier without a frame (e.g. in the
  // intrinsified Reference.get() routine) then rfp might be pointing to
  // the caller frame and so this check will most likely fail at runtime.
  //
  // Expanding the call directly bypasses the generation of the check.
  // So when we do not have a full interpreter frame on the stack
  // expand_call should be passed true.
1930 
1931   if (expand_call) {
1932     assert(pre_val != c_rarg1, "smashed arg");
1933     pass_arg1(this, thread);
1934     pass_arg0(this, pre_val);
1935     MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
1936   } else {
1937     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
1938   }
1939 
1940   pop(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
1941 
1942   bind(done);
1943 }
1944 
1945 void MacroAssembler::g1_write_barrier_post(Register store_addr,
1946                                            Register new_val,
1947                                            Register thread,
1948                                            Register tmp,
1949                                            Register tmp2) {
1950   assert(thread == rthread, "must be");
1951 
1952   Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
1953                                        PtrQueue::byte_offset_of_index()));
1954   Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
1955                                        PtrQueue::byte_offset_of_buf()));
1956 
1957   BarrierSet* bs = Universe::heap()->barrier_set();
1958   CardTableModRefBS* ct = (CardTableModRefBS*)bs;
1959   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
1960 
1961   Label done;
1962   Label runtime;
1963 
1964   // Does store cross heap regions?
1965 
1966   eor(tmp, store_addr, new_val);
1967   lsr(tmp, tmp, HeapRegion::LogOfHRGrainBytes);
1968   cbz(tmp, done);
1969 
1970   // crosses regions, storing NULL?
1971 
1972   cbz(new_val, done);
1973 
1974   // storing region crossing non-NULL, is card already dirty?
1975 
1976 
1977   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
1978   const Register card_addr = tmp;
1979 
1980   lsr(card_addr, store_addr, CardTableModRefBS::card_shift);
1981 
1982   //ExternalAddress cardtable((address) ct->byte_map_base);
1983   mov(tmp2, (unsigned)ct->byte_map_base);
1984 
1985   // get the address of the card
1986   add(card_addr, card_addr, tmp2);
1987   ldrb(tmp2, Address(card_addr));
1988   cmp(tmp2, (int)G1SATBCardTableModRefBS::g1_young_card_val());
1989   b(done, Assembler::EQ);
1990 
1991   assert((int)CardTableModRefBS::dirty_card_val() == 0, "must be 0");
1992 
1993   membar(Assembler::StoreLoad);
1994 
1995   ldrb(tmp2, Address(card_addr));
1996   cmp(tmp2, 0);
1997   b(done, Assembler::EQ);
1998 
1999   // storing a region crossing, non-NULL oop, card is clean.
2000   // dirty card and log.
2001   mov(rscratch1, 0);
2002   strb(rscratch1, Address(card_addr));
2003 
2004   ldr(rscratch1, queue_index);
2005   cbz(rscratch1, runtime);
2006   sub(rscratch1, rscratch1, wordSize);
2007   str(rscratch1, queue_index);
2008 
2009   ldr(tmp2, buffer);
2010   str(card_addr, Address(tmp2, rscratch1));
2011   b(done);
2012 
2013   bind(runtime);
2014   // save the live input values
2015   push(store_addr->bit(true) | new_val->bit(true), sp);
2016   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
2017   pop(store_addr->bit(true) | new_val->bit(true), sp);
2018 
2019   bind(done);
2020 }
2021 
2022 #endif // INCLUDE_ALL_GCS
2023 
2024 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
2025   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
2026   int index = oop_recorder()->allocate_metadata_index(obj);
2027   RelocationHolder rspec = metadata_Relocation::spec(index);
2028   return Address((address)obj, rspec);
2029 }
2030 
2031 // Move an oop into a register.  immediate is true if we want
// immediate instructions, i.e. we are not going to patch this
2033 // instruction while the code is being executed by another thread.  In
2034 // that case we can use move immediates rather than the constant pool.
2035 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
2036   int oop_index;
2037   if (obj == NULL) {
2038     oop_index = oop_recorder()->allocate_oop_index(obj);
2039   } else {
2040     oop_index = oop_recorder()->find_index(obj);
2041     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
2042   }
2043   RelocationHolder rspec = oop_Relocation::spec(oop_index);
2044   if (! immediate) {
2045     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
2046     ldr_constant(dst, Address(dummy, rspec));
2047   } else
2048     mov(dst, Address((address)obj, rspec));
2049 }
2050 
2051 // Move a metadata address into a register.
2052 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
2053   int oop_index;
2054   if (obj == NULL) {
2055     oop_index = oop_recorder()->allocate_metadata_index(obj);
2056   } else {
2057     oop_index = oop_recorder()->find_index(obj);
2058   }
2059   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
2060   mov(dst, Address((address)obj, rspec));
2061 }
2062 
2063 Address MacroAssembler::constant_oop_address(jobject obj) {
2064   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
2065   assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
2066   int oop_index = oop_recorder()->find_index(obj);
2067   return Address((address)obj, oop_Relocation::spec(oop_index));
2068 }
2069 
2070 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
2071 void MacroAssembler::tlab_allocate(Register obj,
2072                                    Register var_size_in_bytes,
2073                                    int con_size_in_bytes,
2074                                    Register t1,
2075                                    Register t2,
2076                                    Label& slow_case) {
2077   assert_different_registers(obj, t2);
2078   assert_different_registers(obj, var_size_in_bytes);
2079   Register end = t2;
2080 
2081   // verify_tlab();
2082 
2083   ldr(obj, Address(rthread, JavaThread::tlab_top_offset()));
2084   if (var_size_in_bytes == noreg) {
2085     lea(end, Address(obj, con_size_in_bytes));
2086   } else {
2087     lea(end, Address(obj, var_size_in_bytes));
2088   }
2089   ldr(rscratch1, Address(rthread, JavaThread::tlab_end_offset()));
2090   cmp(end, rscratch1);
2091   b(slow_case, Assembler::HI);
2092 
2093   // update the tlab top pointer
2094   str(end, Address(rthread, JavaThread::tlab_top_offset()));
2095 
2096   // recover var_size_in_bytes if necessary
2097   if (var_size_in_bytes == end) {
2098     sub(var_size_in_bytes, var_size_in_bytes, obj);
2099   }
2100   // verify_tlab();
2101 }
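// Conceptually, tlab_allocate above performs (no synchronisation needed
// since the TLAB is thread-private):
//   obj = thread->tlab_top;
//   end = obj + size;
//   if (end > thread->tlab_end) goto slow_case;
//   thread->tlab_top = end;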
2102 
2103 // Preserves r6, and r3.
2104 Register MacroAssembler::tlab_refill(Label& retry,
2105                                      Label& try_eden,
2106                                      Label& slow_case) {
2107   Register top = r0;
2108   Register t1  = r2;
2109   Register t2  = r4;
2110   assert_different_registers(top, rthread, t1, t2, /* preserve: */ r6, r3);
2111   Label do_refill, discard_tlab;
2112 
2113   if (!Universe::heap()->supports_inline_contig_alloc()) {
2114     // No allocation in the shared eden.
2115     b(slow_case);
2116   }
2117 
2118   ldr(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
2119   ldr(t1,  Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
2120 
2121   // calculate amount of free space
2122   sub(t1, t1, top);
2123   lsr(t1, t1, LogHeapWordSize);
2124 
2125   // Retain tlab and allocate object in shared space if
2126   // the amount free in the tlab is too large to discard.
2127 
2128   ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
2129   cmp(t1, rscratch1);
2130   b(discard_tlab, Assembler::LE);
2131 
2132   // Retain
2133   // ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
2134   mov(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
2135   add(rscratch1, rscratch1, t2);
2136   str(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
2137 
2138   if (TLABStats) {
2139     // increment number of slow_allocations
2140     addmw(Address(rthread, in_bytes(JavaThread::tlab_slow_allocations_offset())),
2141          1, rscratch1);
2142   }
2143   b(try_eden);
2144 
2145   bind(discard_tlab);
2146   if (TLABStats) {
2147     // increment number of refills
2148     addmw(Address(rthread, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1,
2149          rscratch1);
2150     // accumulate wastage -- t1 is amount free in tlab
2151     addmw(Address(rthread, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1,
2152          rscratch1);
2153   }
2154 
2155   // if tlab is currently allocated (top or end != null) then
2156   // fill [top, end + alignment_reserve) with array object
2157   cbz(top, do_refill);
2158 
2159   // set up the mark word
2160   mov(rscratch1, (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
2161   str(rscratch1, Address(top, oopDesc::mark_offset_in_bytes()));
2162   // set the length to the remaining space
2163   sub(t1, t1, typeArrayOopDesc::header_size(T_INT));
2164   add(t1, t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
2165   lsl(t1, t1, log2_intptr(HeapWordSize/sizeof(jint)));
2166   str(t1, Address(top, arrayOopDesc::length_offset_in_bytes()));
2167   // set klass to intArrayKlass
  // dubious reloc: why not an oop reloc?
2169   mov(rscratch1, ExternalAddress((address)Universe::intArrayKlassObj_addr()));
2170   ldr(t1, Address(rscratch1));
2171   // store klass last.  concurrent gcs assumes klass length is valid if
2172   // klass field is not null.
2173   store_klass(top, t1);
2174 
2175   mov(t1, top);
2176   ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
2177   sub(t1, t1, rscratch1);
2178   incr_allocated_bytes(rthread, t1, 0, rscratch1);
2179 
2180   // refill the tlab with an eden allocation
2181   bind(do_refill);
2182   ldr(t1, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
2183   lsl(t1, t1, LogHeapWordSize);
2184   // allocate new tlab, address returned in top
2185   eden_allocate(top, t1, 0, t2, slow_case);
2186 
2187   // Check that t1 was preserved in eden_allocate.
2188 #ifdef ASSERT
2189   if (UseTLAB) {
2190     Label ok;
2191     Register tsize = r4;
2192     assert_different_registers(tsize, rthread, t1);
2193     str(tsize, Address(pre(sp, -16)));
2194     ldr(tsize, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
2195     lsl(tsize, tsize, LogHeapWordSize);
2196     cmp(t1, tsize);
2197     b(ok, Assembler::EQ);
2198     STOP("assert(t1 != tlab size)");
2199     should_not_reach_here();
2200 
2201     bind(ok);
2202     ldr(tsize, Address(post(sp, 16)));
2203   }
2204 #endif
2205   str(top, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
2206   str(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
2207   add(top, top, t1);
2208   sub(top, top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
2209   str(top, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
2210   verify_tlab();
2211   b(retry);
2212 
2213   return rthread; // for use by caller
2214 }
2215 
2216 // Defines obj, preserves var_size_in_bytes
2217 void MacroAssembler::eden_allocate(Register obj,
2218                                    Register var_size_in_bytes,
2219                                    int con_size_in_bytes,
2220                                    Register t1,
2221                                    Label& slow_case) {
2222   assert_different_registers(obj, var_size_in_bytes, t1);
2223   if (!Universe::heap()->supports_inline_contig_alloc()) {
2224     b(slow_case);
2225   } else {
2226     Register end = t1;
2227     Register heap_end = rscratch2;
2228     Label retry;
2229     bind(retry);
2230 
2231     mov(rscratch1, ExternalAddress((address) Universe::heap()->end_addr()));
2232     ldr(heap_end, Address(rscratch1));
2233 
2234     ExternalAddress heap_top((address) Universe::heap()->top_addr());
2235     mov(rscratch1, heap_top);
2236     ldrex(obj, rscratch1);
2237 
    // Adjust it by the size of our new object
2239     if (var_size_in_bytes == noreg) {
2240       lea(end, Address(obj, con_size_in_bytes));
2241     } else {
2242       lea(end, Address(obj, var_size_in_bytes));
2243     }
2244 
2245     // if end < obj then we wrapped around high memory
2246     cmp(end, obj);
2247     b(slow_case, Assembler::LO);
2248 
2249     cmp(end, heap_end);
2250     b(slow_case, Assembler::HI);
2251 
2252     // If heap_top hasn't been changed by some other thread, update it.
2253     mov(rscratch2, rscratch1);
2254     strex(rscratch1, end, rscratch2);
2255     cmp(rscratch1, 0);
2256     b(retry, Assembler::NE);
2257   }
2258 }
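// Illustrative model of the bump-the-pointer loop above, assuming ACLE-style
// __ldrex/__strex intrinsics (a sketch, not part of the build):
//   do {
//     obj = __ldrex(heap_top);
//     end = obj + size;
//     if (end < obj) goto slow_case;        // wrapped around the address space
//     if (end > heap_end) goto slow_case;   // heap exhausted
//   } while (__strex(end, heap_top) != 0);  // raced with another allocator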
2259 
2260 void MacroAssembler::verify_tlab() {
2261 #ifdef ASSERT
2262   if (UseTLAB && VerifyOops) {
2263     Label next, ok;
2264 
2265     strd(rscratch2, rscratch1, Address(pre(sp, -16)));
2266 
2267     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
2268     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
2269     cmp(rscratch2, rscratch1);
2270     b(next, Assembler::HS);
2271     STOP("assert(top >= start)");
2272     should_not_reach_here();
2273 
2274     bind(next);
2275     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
2276     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
2277     cmp(rscratch2, rscratch1);
2278     b(ok, Assembler::HS);
2279     STOP("assert(top <= end)");
2280     should_not_reach_here();
2281 
2282     bind(ok);
2283     ldrd(rscratch2, rscratch1, Address(post(sp, 16)));
2284   }
2285 #endif
2286 }
2287 
// Writes to successive stack pages until the requested size is reached, to
// check for stack overflow + shadow pages.  This clobbers tmp.
2290 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
2291   assert_different_registers(tmp, size, rscratch1);
2292   mov(tmp, sp);
2293   // Bang stack for total size given plus shadow page size.
2294   // Bang one page at a time because large size can bang beyond yellow and
2295   // red zones.
2296   Label loop;
2297   mov(rscratch1, os::vm_page_size());
2298   bind(loop);
2299   lea(tmp, Address(tmp, -os::vm_page_size()));
2300   subs(size, size, rscratch1);
2301   str(size, Address(tmp));
2302   b(loop, Assembler::GT);
2303 
2304   // Bang down shadow pages too.
2305   // At this point, (tmp-0) is the last address touched, so don't
2306   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
2307   // was post-decremented.)  Skip this address by starting at i=1, and
2308   // touch a few more pages below.  N.B.  It is important to touch all
2309   // the way down to and including i=StackShadowPages.
2310   for (int i = 0; i< StackShadowPages-1; i++) {
    // this could be any sized move, but since it can serve as a debugging
    // crumb the bigger the better.
2313     lea(tmp, Address(tmp, -os::vm_page_size()));
2314     str(size, Address(tmp));
2315   }
2316 }
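// In effect (illustrative only, page == os::vm_page_size()):
//   char *p = (char *)sp;
//   do { p -= page; size -= page; *(int *)p = size; } while (size > 0);
//   for (int i = 1; i < StackShadowPages; i++) { p -= page; *(int *)p = size; }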
2317 
2318 
2319 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
2320   mov(r, Address(page, rtype));
2321   InstructionMark im(this);
2322   code_section()->relocate(inst_mark(), rtype);
2323   ldr(r, Address(r));
2324   return inst_mark();
2325 }
2326 
2327 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
2328   InstructionMark im(this);
2329   code_section()->relocate(inst_mark(), rtype);
2330   // It's ok to load to reg from reg + off (without write-back)
2331   ldr(r, Address(r, 0));
2332   return inst_mark();
2333 }
2334 
// Helper functions for 64-bit multiplication, division and remainder
// does <Rd+1:Rd> = <Rn+1:Rn> * <Rm+1:Rm>
2337 void MacroAssembler::mult_long(Register Rd, Register Rn, Register Rm) {
2338   Register Rdh = (Register)(Rd->encoding_nocheck() + 1);
2339   Register Rnh = (Register)(Rn->encoding_nocheck() + 1);
2340   Register Rmh = (Register)(Rm->encoding_nocheck() + 1);
2341 
2342   mult_long(Rd, Rdh, Rn, Rnh, Rm, Rmh);
2343 }
2344 
2345 // does <Rdh:Rd> = <Rnh:Rn> * <Rmh:Rm>
2346 void MacroAssembler::mult_long(Register Rd, Register Rdh, Register Rn, Register Rnh, Register Rm, Register Rmh) {
2347   assert_different_registers(Rn, Rnh);
2348   assert_different_registers(Rm, Rmh);
2349   assert_different_registers(Rd, Rdh); // umull restriction
2350   const Register t = rscratch1;
2351 
2352   mul(t, Rm, Rnh);
2353   mla(t, Rn, Rmh, t);
2354   umull(Rd, Rdh, Rm, Rn);
2355   add(Rdh, t, Rdh);
2356 }
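// The decomposition used above: with A = 2^32*Rnh + Rn and B = 2^32*Rmh + Rm,
//   A*B mod 2^64 = Rn*Rm + 2^32*(Rn*Rmh + Rm*Rnh)
// (the Rnh*Rmh term overflows out of 64 bits), so umull produces the full
// 64-bit Rn*Rm and the two cross products are folded into the high word.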
2357 
2358 
2359 int64_t internal_ldiv(int64_t a, int64_t b) {
2360   return a / b;
2361 }
2362 
2363 int64_t internal_lmod(int64_t a, int64_t b) {
2364   return a % b;
2365 }
2366 
2367 void MacroAssembler::divide32(Register res, Register num, Register den, bool want_mod) {
2368     Register cnt = rscratch1;
2369     Register mod = rscratch2;
2370     Register sign = r14;
2371     assert_different_registers(num, den, rscratch1, rscratch2, r14);
2372 
    // FIXME This works by first converting any negative values to positive
    // ones; however, |INT_MIN| cannot be represented, so that case is
    // mishandled.  Need to fix this.
2375 
2376     //Convert to positive values
2377     mov(sign, 0);
2378 
2379     cmp(num, 0);
2380     mov(sign, 1, MI);
2381     rsb(num, num, 0, MI);
2382 
2383     cmp(den, 0);
2384     if(!want_mod) eor(sign, sign, 1, MI);
2385     rsb(den, den, 0, MI);
2386 
2387     // Algorithm from
2388     // http://www.chiark.greenend.org.uk/~theom/riscos/docs/ultimate/a252div.txt
2389     // Graeme Williams
2390     mov(cnt, 28);
2391     mov(mod, num, lsr(4));
2392     cmp(den, mod, lsr(12));
2393     sub(cnt, cnt, 16, Assembler::LE);
2394     mov(mod, mod, lsr(16), Assembler::LE);
2395     cmp(den, mod, lsr(4));
2396     sub(cnt, cnt, 8, Assembler::LE);
2397     mov(mod, mod, lsr(8), Assembler::LE);
2398     cmp(den, mod);
2399     sub(cnt, cnt, 4, Assembler::LE);
2400     mov(mod, mod, lsr(4), Assembler::LE);
2401     mov(num, num, lsl(cnt));
2402     rsb(den, den, 0);
2403 
2404     adds(num, num, num);
2405     //Now skip over cnt copies of the 3 instr. loop.
2406     add(cnt, cnt, cnt, lsl(1));
2407     add(r15_pc, r15_pc, cnt, lsl(2));
    mov(r0, r0); // nop: always branched over; pads the pc-relative computed jump above
2409 
2410     for(int i = 0; i < 32; i++) {
2411         adcs(mod, den, mod, lsl(1));
2412         sub(mod, mod, den, Assembler::LO);
2413         adcs(num, num, num);
2414     }
2415 
2416     cmp(sign, 0);
2417     rsb(res, want_mod? mod : num, 0, NE);
2418     mov(res, want_mod? mod : num, EQ);
2419 }
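// A plain restoring-division model of the shift/subtract core above (the
// assembly uses a carry-chained variant of the same idea; sketch only,
// assumes den > 0 and num >= 0 after the sign handling):
//   uint32_t rem = 0, quo = 0;
//   for (int i = 31; i >= 0; i--) {
//     rem = (rem << 1) | ((num >> i) & 1);  // shift next dividend bit in
//     quo <<= 1;
//     if (rem >= den) { rem -= den; quo |= 1; }
//   }
//   // quo is the quotient, rem the remainder (both non-negative)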
2420 
2421 
2422 // <Rd+1:Rd> = <Rn+1:Rn> / <Rm+1:Rm>
2423 // <Rd+1:Rd> = <Rn+1:Rn> % <Rm+1:Rm>
2424 // <Rd> = <Rn> / <Rm>
2425 // <Rd> = <Rn> % <Rm>
2426 void MacroAssembler::divide(Register Rd, Register Rn, Register Rm, int width, bool want_remainder) {
2427   //Dispatch to best possible
2428   Register Rdh = (Register)(Rd->encoding_nocheck() + 1);
2429   Register Rnh = (Register)(Rn->encoding_nocheck() + 1);
2430   Register Rmh = (Register)(Rm->encoding_nocheck() + 1);
2431 
2432   assert(32 == width || 64 == width, "Invalid width");
2433   bool is64b = 64 == width;
2434 
2435   if(is64b) {
2436     assert_different_registers(Rn, Rnh, Rm, Rmh, rscratch1, rscratch2);
2437   }
2438 
2439   if(!is64b && VM_Version::features() & FT_HW_DIVIDE) {
    // Emit a hardware divide instruction sequence.
2441     if(want_remainder) {
2442       sdiv(rscratch1, Rn, Rm);
2443       mls(Rd, rscratch1, Rm, Rn);
2444     } else {
2445       sdiv(Rd, Rn, Rm);
2446     }
2447   } else if(!is64b) {
2448     // Fall back to assembly software routine
2449     divide32(Rd, Rn, Rm, want_remainder);
2450   } else {
2451     // Fall back to C software routine for
2452     // 64 bit divide/mod
2453     if(Rn != r0) {
2454       mov(rscratch1, Rm);
2455       mov(rscratch2, Rmh);
2456 
2457       mov(r0, Rn);
2458       mov(r1, Rnh);
2459 
2460       mov(r2, rscratch1);
2461       mov(r3, rscratch2);
2462     } else if(Rm != r2) {
2463       mov(r2, Rm);
2464       mov(r3, Rmh);
2465     }
2466     address function;
2467     if(want_remainder) function = (address)internal_lmod;
2468     else               function = (address)internal_ldiv;
2469 
2470     mov(rscratch1, function);
2471     bl(rscratch1);
2472     if(Rd != r0) {
2473       mov(Rd, r0);
2474       if(is64b) mov(Rdh, r1);
2475     }
2476   }
2477 }
2478 
2479 void MacroAssembler::extract_bits(Register dest, Register source, int lsb, int width) {
2480   assert(lsb >= 0 && lsb + width <= 32 && width != 0, "Invalid lsb/width");
2481   // Dispatch to the best sequence
2482   if(0 == (lsb & 7) && (width == 8 || width == 16 || width == 32)) {
2483     // Can use extend X
2484     switch(width){
2485       case 8:  uxtb(dest, source, ror(lsb)); break;
2486       case 16: uxth(dest, source, ror(lsb)); break;
2487       default:                               break;
2488    }
2489   } else if(VM_Version::features() & (FT_ARMV7 | FT_ARMV6T2)) {
2490     ubfx(dest, source, lsb, width);
2491   } else {
2492     // Do two shifts
2493     lsl(dest, source, 32 - (width + lsb));
2494     lsr(dest, dest, 32 - width);
2495   }
2496 }
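// Example: extract_bits(r0, r1, 8, 8) computes r0 = (r1 >> 8) & 0xff; here
// lsb is byte-aligned and width == 8, so the uxtb-with-rotate form is used.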
2497 
2498 
2499 void MacroAssembler::atomic_ldrd(Register Rt, Register Rt2, Register Rbase) {
2500   assert(Rt->encoding_nocheck() % 2 == 0, "Must be an even register");
2501   assert((Register) (Rt + 1) == Rt2, "Must be contiguous");
2502   if(VM_Version::features() & FT_SINGLE_CORE) {
2503     ldrd(Rt, Rbase);
2504   } else if (VM_Version::features() & (FT_ARMV7 | FT_ARMV6K)) {
2505 #ifdef ASSERT
2506     Label lbl;
2507     tst(Rbase, 7);
2508     b(lbl, EQ);
2509     stop("atomic_ldrd is not doubleword aligned!");
2510     bind(lbl);
2511 #endif // ASSERT
2512 
2513     ldrexd(Rt, Rbase);
2514   } else {
2515     // TODO: Find Java way of logging
2516     static bool warning_printed = false;
2517     if(!warning_printed) {
2518       fprintf(stderr, "Unable to provide atomic doubleword load.\n");
2519       warning_printed = true;
2520     }
2521     ldrd(Rt, Rbase);
2522   }
2523 }
2524 
2525 void MacroAssembler::atomic_strd(Register Rt, Register Rt2, Register Rbase,
2526                                  Register temp, Register temp2) {
2527   assert(Rt->encoding_nocheck() % 2 == 0, "Must be an even register");
2528   assert((Register) (Rt + 1) == Rt2, "Must be contiguous");
2529   assert((Register) (temp + 1) == temp2, "Must be contiguous");
2530   assert_different_registers(temp, Rt, Rbase, temp2);
2531   if(VM_Version::features() & FT_SINGLE_CORE) {
2532     strd(Rt, Rbase);
2533   } else if (VM_Version::features() & (FT_ARMV7 | FT_ARMV6K)) {
2534     // First need to gain exclusive access
2535     Label retry;
2536 
2537 #ifdef ASSERT
2538     tst(Rbase, 7);
2539     b(retry, EQ);
2540     stop("atomic_strd is not doubleword aligned!");
2541 #endif // ASSERT
2542 
2543     bind(retry);
2544     ldrexd(temp, Rbase);
2545     strexd(temp, Rt, Rbase);
2546     cmp(temp, 0);
2547     b(retry, NE);
2548   } else {
2549     // TODO: Find Java way of logging
2550     static bool warning_printed = false;
2551     if(!warning_printed) {
2552       fprintf(stderr, "Unable to provide atomic doubleword store.\n");
2553       warning_printed = true;
2554     }
2555     strd(Rt, Rbase);
2556   }
2557 }
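// Note: the ldrexd before strexd above is required; strexd can only succeed
// while this core holds the exclusive monitor, and only a load-exclusive can
// claim it.  The value loaded into temp is discarded.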
2558 
2559 
2560 #define ENABLE_DEBUGGING 0
2561 // Helloworld is 2,482,397
2562 uint32_t MacroAssembler::bytecodes_until_print = 2400000; //13795328; //6888000L; //6881772L;
2563 
2564 uint32_t MacroAssembler::bytecodes_executed = 0;
2565 
2566 int MacroAssembler::enable_debug = 0;
2567 int MacroAssembler::enable_method_debug = 0;
2568 int MacroAssembler::enable_debugging_static = ENABLE_DEBUGGING;
2569 
2570 #define N_J_BYTECODES 234
2571 const char* j_bytecodes[N_J_BYTECODES] = {"nop", "aconstnull", "iconstm1", "iconst0", "iconst1", "iconst2", "iconst3", "iconst4", "iconst5", "lconst0",
2572 "lconst1", "fconst0", "fconst1", "fconst2", "dconst0", "dconst1", "bipush", "sipush", "ldc", "ldcw", "ldc2w",
2573 "iload", "lload", "fload", "dload", "aload", "iload0", "iload1", "iload2", "iload3", "lload0", "lload1", "lload2",
2574 "lload3", "fload0", "fload1", "fload2", "fload3", "dload0", "dload1", "dload2", "dload3", "aload0", "aload1", "aload2",
2575 "aload3", "iaload", "laload", "faload", "daload", "aaload", "baload", "caload", "saload", "istore", "lstore", "fstore",
2576 "dstore", "astore", "istore0", "istore1", "istore2", "istore3", "lstore0", "lstore1", "lstore2", "lstore3", "fstore0",
2577 "fstore1", "fstore2", "fstore3", "dstore0", "dstore1", "dstore2", "dstore3", "astore0", "astore1", "astore2", "astore3",
2578 "iastore", "lastore", "fastore", "dastore", "aastore", "bastore", "castore", "sastore", "pop", "pop2", "dup", "dupx1",
2579 "dupx2", "dup2", "dup2x1", "dup2x2", "swap", "iadd", "ladd", "fadd", "dadd", "isub", "lsub", "fsub", "dsub", "imul",
2580 "lmul", "fmul", "dmul", "idiv", "ldiv", "fdiv", "ddiv", "irem", "lrem", "frem", "drem", "ineg", "lneg", "fneg", "dneg",
2581 "ishl", "lshl", "ishr", "lshr", "iushr", "lushr", "iand", "land", "ior", "lor", "ixor", "lxor", "iinc", "i2l", "i2f",
2582 "i2d", "l2i", "l2f", "l2d", "f2i", "f2l", "f2d", "d2i", "d2l", "d2f", "i2b", "i2c", "i2s", "lcmp", "fcmpl", "fcmpg",
2583 "dcmpl", "dcmpg", "ifeq", "ifne", "iflt", "ifge", "ifgt", "ifle", "ificmpeq", "ificmpne", "ificmplt", "ificmpge",
2584 "ificmpgt", "ificmple", "ifacmpeq", "ifacmpne", "goto", "jsr", "ret", "tableswitch", "lookupswitch", "ireturn",
2585 "lreturn", "freturn", "dreturn", "areturn", "return", "getstatic", "putstatic", "getfield", "putfield",
2586 "invokevirtual", "invokespecial", "invokestatic", "invokeinterface", "invokedynamic", "new", "newarray",
2587 "anewarray", "arraylength", "athrow", "checkcast", "instanceof", "monitorenter", "monitorexit", "wide",
2588 "multianewarray", "ifnull", "ifnonnull", "gotow", "jsrw", "breakpoint", "fast_agetfield", "fast_bgetfield",
2589 "fast_cgetfield", "fast_dgetfield", "fast_fgetfield", "fast_igetfield", "fast_lgetfield", "fast_sgetfield",
2590 "fast_aputfield", "fast_bputfield", "fast_cputfield", "fast_dputfield", "fast_fputfield", "fast_iputfield",
2591 "fast_lputfield", "fast_sputfield", "fast_aload_0", "fast_iaccess_0", "fast_aaccess_0", "fast_faccess_0",
2592 "fast_iload", "fast_iload2", "fast_icaload", "fast_invokevfinal", "fast_linearswitch", "fast_binaryswitch",
2593 "fast_aldc", "fast_aldc_w", "return_register_finalizer", "invokehandle", "INVALID"};
2594 
2595 int bytecodes_seen[256];
2596 
2597 void MacroAssembler::init_unseen_bytecodes() {
2598   for(int i = 0; i < 256; i++ ) {
2599     bytecodes_seen[i] = 0;
2600   }
2601 }
2602 
2603 void MacroAssembler::bytecode_seen(Register bc_reg, Register scratch) {
2604   if(ENABLE_DEBUGGING) {
2605     mov(scratch, (address)bytecodes_seen);
2606     add(scratch, scratch, bc_reg, lsl(2));
2607     add(bc_reg, bc_reg, 1);
2608     str(bc_reg, Address(scratch));
2609     sub(bc_reg, bc_reg, 1);
2610   }
2611 }
2612 
2613 void MacroAssembler::print_unseen_bytecodes() {
2614   if(ENABLE_DEBUGGING) {
2615     printf("=== Unseen bytecodes ===\n");
2616     for(int i = 0; i < N_J_BYTECODES; i++) {
2617       if(0 == bytecodes_seen[i]) {
2618         printf("\t%s\n", j_bytecodes[i]);
2619       }
2620     }
2621     printf("=== End unseen ===\n");
2622   } else {
    printf("Not tracked; enable debugging to view info\n");
2624   }
2625   fflush(stdout);
2626 }
2627 
2628 int machine_state_regset = 0b0101111111111111;
2629 int machine_state_float_regset = 0b11;
2630 
2631 void MacroAssembler::save_machine_state() {
2632   stmdb(sp, machine_state_regset);
2633   vstmdb_f64(sp, machine_state_float_regset);
2634   enter();
2635 }
2636 
2637 void MacroAssembler::restore_machine_state() {
2638   leave();
2639   vldmia_f64(sp, machine_state_float_regset);
2640   ldmia(sp, machine_state_regset);
2641 }
2642 
2643 void internal_internal_printf(const char *fmt, ...) {
2644   va_list args;
2645   va_start (args, fmt);
2646   vprintf (fmt, args);
2647   fflush(stdout);
2648   va_end(args);
2649 }
2650 
2651 void internal_printf(const char *format, uint32_t a, uint32_t b, uint32_t c) {
2652   char buf[2048];
2653   char fmt[2048];
2654   buf[0] = '\0';
2655   const char *thread_str = "THREAD 0x%08x : ";
2656   int id = pthread_self();
2657   strcpy(fmt, format);
2658 
2659   char *str = strtok(fmt, "\n");
2661   while(str) {
2662     strcpy(buf, thread_str);
2663     strcat(buf, str);
2664     strcat(buf, "\n");
2665     internal_internal_printf((const char*)buf, id, a, b, c);
2666     str = strtok(NULL, "\n");
2667   }
2668 }
2669 
2670 void MacroAssembler::get_bytecode(Register dst, Register bc) {
2671   if(ENABLE_DEBUGGING) {
2672     int nbytecodes = N_J_BYTECODES;
2673     mov(dst, (address)j_bytecodes);
2674     cmp(bc, nbytecodes);
2675 
2676     ldr(dst, Address(dst, bc, lsl(2)), Assembler::LT);
2677     ldr(dst, Address(dst, wordSize * nbytecodes), Assembler::GE);
2678   }
2679 }
2680 
2681 int invocation_depth_count = -1; //TODO remove this with debugging info
2682 
2683 #define MAX_FCALL_DEPTH 4096
2684 struct thread_method_record{
2685   int thread_id;
2686   char names[MAX_FCALL_DEPTH][512];
2687   int invocation_depth_count;
2688 };
2689 int ntmrs = 0;
2690 #define MAX_TMRS 10
2691 thread_method_record tmr_list[MAX_TMRS];
2692 
2693 void push_tmr(Method *meth, int *thread_id, int *invocation_depth_count, char **name) {
2694   int id = pthread_self();
2695   *thread_id = id;
2696   for(int i = 0; i < ntmrs; i++) {
2697     thread_method_record *tmr = &tmr_list[i];
2698     if(id == tmr->thread_id) {
2699       // Add a new frame
2700       if(tmr->invocation_depth_count >= -1 &&
2701         tmr->invocation_depth_count < (MAX_FCALL_DEPTH - 1)) {
2702         *invocation_depth_count = ++(tmr->invocation_depth_count);
2703         *name = tmr->names[tmr->invocation_depth_count];
2704         meth->name_and_sig_as_C_string(tmr->names[tmr->invocation_depth_count], 512);
2705         return;
2706       } else {
2707         fprintf(stderr, "%s : Invalid fcall depth index, %d\n", __FUNCTION__, tmr->invocation_depth_count);
2708         exit(1);
2709       }
2710     }
2711   }
2712   // Add a new thread
2713   if(ntmrs >= MAX_TMRS) {
2714     fprintf(stderr, "Too many tmrs\n");
2715     exit(1);
2716   }
2717   //Create a new tmr
2718   tmr_list[ntmrs].thread_id = id;
2719   tmr_list[ntmrs].invocation_depth_count = 0;
2720   meth->name_and_sig_as_C_string(tmr_list[ntmrs].names[0], 512);
2721   *invocation_depth_count = 0;
2722   *name = tmr_list[ntmrs].names[0];
2723   ntmrs++;
2724 }
2725 
2726 void pop_tmr(int *thread_id, int *invocation_depth_count, char **name) {
2727   int id = pthread_self();
2728   *thread_id = id;
2729   for(int i = 0; i < ntmrs; i++) {
2730     thread_method_record *tmr = &tmr_list[i];
2731     if(id == tmr->thread_id) {
2732       if(tmr->invocation_depth_count >= 0 &&
2733         tmr->invocation_depth_count < MAX_FCALL_DEPTH) {
2734         // Pop frame
2735         *name = tmr->names[tmr->invocation_depth_count];
2736         *invocation_depth_count = (tmr->invocation_depth_count)--;
2737         return;
2738       } else if ( -1 == tmr->invocation_depth_count) {
2739         *name = (char*)"JVM-EXCEPTION-EXIT:(NOT-REALLY-A-FRAME)";
2740         *invocation_depth_count = 0;
2741         return;
2742       } else {
2743         fprintf(stderr, "%s : Invalid fcall depth index, %d\n", __FUNCTION__, tmr->invocation_depth_count);
2744         exit(1);
2745       }
2746     }
2747   }
2748   fprintf(stderr, "Unable to find suitable tmr\n");
2749   exit(1);
2750 }
2751 
2752 void prepare_entry_exit_prefix(char *buf, int id, int invocation_depth_count) {
2753   sprintf(buf, "THREAD 0x%08x : ", id);
2754   for(int i = 0; i < invocation_depth_count; i++) {
2755     strcat(buf, "  ");
2756   }
2757 }
2758 
2759 
2760 void print_entry(Method *meth, int native) {
2761   char *name;
2762   int invocation_depth_count, id;
2763   push_tmr(meth, &id, &invocation_depth_count, &name);
2764 
2765   if(MacroAssembler::enable_method_debug) {
2766     char buf[4096], buf_b[2048];
2767     prepare_entry_exit_prefix(buf, id, invocation_depth_count);
2768     if(native) {
2769       sprintf(buf_b, "CALL NATIVE : %s\n", name);
2770     } else {
2771       sprintf(buf_b, "CALL JAVA   : %s\n", name);
2772     }
2773     strcat(buf, buf_b);
2774     printf("%s", buf);
2775     fflush(stdout);
2776   }
2777 }
2778 
2779 void print_exit(bool normal) {
2780   char *name;
2781   int invocation_depth_count, id;
2782   pop_tmr(&id, &invocation_depth_count, &name);
2783 
2784   if(MacroAssembler::enable_method_debug) {
2785     char buf[4096], buf_b[2048];
2786     prepare_entry_exit_prefix(buf, id, invocation_depth_count);
2787     sprintf(buf_b, normal ? "EXIT        : %s\n" : "EXCPN EXIT  : %s\n", name);
2788     strcat(buf, buf_b);
2789     printf("%s", buf);
2790     fflush(stdout);
2791   }
2792 }
2793 
2794 void MacroAssembler::print_method_entry(Register rmethod, bool native) {
2795   if(ENABLE_DEBUGGING) {
2796     save_machine_state();
2797 
2798     bic(sp, sp, 7); // 8-byte align stack
2799     mov(rscratch2, (address)print_entry);
2800     mov(r0, rmethod);
2801     mov(r1, native);
2802     bl(rscratch2);
2803 
2804     restore_machine_state();
2805   }
2806 }
2807 
2808 void MacroAssembler::print_method_exit(bool normal) {
2809   if(ENABLE_DEBUGGING) {
2810     save_machine_state();
2811 
2812     bic(sp, sp, 7); // 8-byte align stack
2813     mov(rscratch2, (address)print_exit);
2814     mov(r0, normal);
2815     bl(rscratch2);
2816 
2817     restore_machine_state();
2818   }
2819 }
2820 
2821 void MacroAssembler::reg_printf_internal(bool important, const char *fmt, Register ra, Register rb, Register rc) {
2822   if(ENABLE_DEBUGGING) {
2823     Label skip;
2824     save_machine_state();
2825 
2826         mov(rscratch1, ra);
2827         str(rscratch1, Address(pre(sp, -wordSize)));
2828         mov(rscratch1, rb);
2829         str(rscratch1, Address(pre(sp, -wordSize)));
2830         mov(rscratch1, rc);
2831         str(rscratch1, Address(pre(sp, -wordSize)));
2832 
2833         if(!important) {
2834             mov(r0, (address)&enable_debug);
2835             ldr(r0, Address(r0));
2836             cmp(r0, 0);
2837             b(skip, Assembler::EQ);
2838         }
2839 
        int sp_difference = wordSize * (count_bits(machine_state_regset) +
                                        2 * count_bits(machine_state_float_regset) +
                                        2 + 3); // 2 for the frame entry, 3 for the registers saved above
2843 
2844         mov(r0, (address)fmt);
2845         if(ra != sp) ldr(r1, Address(sp, 2 * wordSize));
2846         else         add(r1, sp, sp_difference);
2847 
2848         if(rb != sp) ldr(r2, Address(sp, wordSize));
2849         else         add(r2, sp, sp_difference);
2850 
2851         if(rc != sp) ldr(r3, Address(sp));
2852         else         add(r3, sp, sp_difference);
2853 
2854         bic(sp, sp, 7); // 8-byte align stack
2855 
2856         mov(rscratch2, (address)internal_printf);
2857         bl(rscratch2);
2858 
2859         bind(skip);
2860         restore_machine_state();
2861     }
2862 }

void MacroAssembler::reg_printf(const char *fmt, Register ra, Register rb, Register rc) {
  reg_printf_internal(false, fmt, ra, rb, rc);
}

void MacroAssembler::reg_printf_important(const char *fmt, Register ra, Register rb, Register rc) {
  reg_printf_internal(true, fmt, ra, rb, rc);
}
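
// A minimal usage sketch (the format string and registers are hypothetical):
// the three register arguments become the first three varargs of
// internal_printf, e.g.
//   reg_printf("rmethod = %p, sp = %p, flag = %d\n", rmethod, sp, r2);
// Passing sp works even though it is modified by the state save: the
// sp_difference computation above recovers the caller's original sp.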

// When debugging, set a native breakpoint on bkpnt
void bkpnt() { return; }
void MacroAssembler::create_breakpoint() {
  if(ENABLE_DEBUGGING) {
    save_machine_state();
    bic(sp, sp, 7); // 8-byte align stack

    mov(rscratch2, (address) bkpnt);
    bl(rscratch2);

    restore_machine_state();
  }
}


void MacroAssembler::print_cpool(InstanceKlass *klass) {
  ttyLocker ttyl;
  klass->constants()->print_on(tty);
}

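// The ldrd/strd helpers below return the byte offset, within the emitted
// sequence, of the first instruction that can actually fault; callers use it
// to bias implicit null check bookkeeping when non-faulting setup
// instructions (such as a lea) are emitted first.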
int MacroAssembler::ldrd(Register Rt, Register Rt2, const Address& adr, Register Rtmp, Condition cond) {
  if((0 == Rt->encoding_nocheck() % 2 &&
      (Rt->encoding_nocheck() + 1 == Rt2->encoding_nocheck())) &&
     (uabs(adr.offset()) < (1 << 8))) {
    /* Good to go with an ldrd */
    ldrd(Rt, adr, cond);
    return 0x0;
  } else {
    return double_ld_failed_dispatch(Rt, Rt2, adr, &Assembler::ldm,
                                     &Assembler::ldr, Rtmp, cond);
  }
}

int MacroAssembler::strd(Register Rt, Register Rt2, const Address& adr, Condition cond) {
  if((0 == Rt->encoding_nocheck() % 2 &&
      (Rt->encoding_nocheck() + 1 == Rt2->encoding_nocheck())) &&
     (uabs(adr.offset()) < (1 << 8))) {
    /* Good to go with a strd */
    strd(Rt, adr, cond);
  } else {
    double_ldst_failed_dispatch(Rt, Rt2, adr, &Assembler::stm, &Assembler::str, cond);
  }
  return 0x0;
}

int MacroAssembler::double_ld_failed_dispatch(Register Rt, Register Rt2, const Address& adr,
        void (Assembler::* mul)(unsigned, const Address&, Condition),
        void (Assembler::* sgl)(Register, const Address&, Condition),
        Register Rtmp, Condition cond) {
  if (can_ldst_multiple(RegSet::of(Rt, Rt2).bits(), adr) &&
          (Rt->encoding_nocheck() < Rt2->encoding_nocheck())) {
    /* Do a load or store multiple instruction */
    (this->*mul)(RegSet::of(Rt, Rt2).bits(), adr, cond);
  } else if (!adr.uses(Rt)) {
    double_ldst_failed_dispatch(Rt, Rt2, adr, mul, sgl, cond);
  } else {
    // need to reorder the loads, otherwise the write to Rt destroys adr
    if (adr.get_mode() != Address::reg) {
      // offset-based addressing: adr uses only its base register, which must
      // be Rt here, so Rt2 cannot be part of adr and may be loaded first
      if (adr.get_wb_mode() == Address::pre) {
        (this->*sgl)(Rt2, Address(pre(adr.base(), adr.offset() + wordSize)), cond);
        (this->*sgl)(Rt, Address(pre(adr.base(), -wordSize)), cond);
      } else if (adr.get_wb_mode() == Address::post) {
        (this->*sgl)(Rt2, Address(adr.base(), adr.offset() + wordSize), cond);
        (this->*sgl)(Rt, adr, cond);
      } else if (adr.get_wb_mode() == Address::off) {
        (this->*sgl)(Rt2, Address(adr.base(), adr.offset() + wordSize), cond);
        (this->*sgl)(Rt, adr, cond);
      } else {
        ShouldNotReachHere();
      }
    } else {
      // index-based addressing: both Rt and Rt2 could be used by adr,
      // hence a temp register is necessary
      adr.lea(this, Rtmp);
      double_ldst_failed_dispatch(Rt, Rt2, Address(Rtmp), mul, sgl, cond);
      // adr.lea performs only address arithmetic and cannot trap; the first
      // instruction that can raise an NPE is emitted by
      // double_ldst_failed_dispatch, so shift the reported offset accordingly
      return 0x4;
    }
  }
  return 0x0;
}
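
// Dispatch summary for the two helpers here: prefer a single ldm/stm when the
// register pair is encodable and ascending; otherwise fall back to two single
// transfers, ordered so the address base is consumed before it can be
// overwritten; only index-based forms, where both Rt and Rt2 may appear in
// the address, need the address materialized into a temporary first.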

void MacroAssembler::double_ldst_failed_dispatch(Register Rt, Register Rt2, const Address& adr,
        void (Assembler::* mul)(unsigned, const Address&, Condition),
        void (Assembler::* sgl)(Register, const Address&, Condition),
        Condition cond) {
  if (can_ldst_multiple(RegSet::of(Rt, Rt2).bits(), adr) &&
          (Rt->encoding_nocheck() < Rt2->encoding_nocheck())) {
    /* Do a store multiple instruction */
    (this->*mul)(RegSet::of(Rt, Rt2).bits(), adr, cond);
  } else {
    if (adr.get_mode() != Address::reg) {
      // offset-based addressing
      if (adr.get_wb_mode() == Address::pre) {
        (this->*sgl)(Rt, adr, cond);
        (this->*sgl)(Rt2, Address(adr.base(), wordSize), cond);
      } else if (adr.get_wb_mode() == Address::post) {
        (this->*sgl)(Rt, adr, cond);
        (this->*sgl)(Rt2, Address(adr.base(), wordSize - adr.offset()), cond);
      } else if (adr.get_wb_mode() == Address::off) {
        (this->*sgl)(Rt, adr, cond);
        (this->*sgl)(Rt2, Address(adr.base(), adr.offset() + wordSize), cond);
      } else {
        ShouldNotReachHere();
      }
    } else {
      // index-based addressing
      if (adr.get_wb_mode() == Address::pre) {
        // current implementation does not use Address::pre for indexed access
        ShouldNotReachHere();
      } else if (adr.get_wb_mode() == Address::post) {
        // current implementation does not use Address::post for indexed access;
        // implement a proper post() method and handle it here if ever required
        ShouldNotReachHere();
      } else if (adr.get_wb_mode() == Address::off) {
        (this->*sgl)(Rt, Address(pre(adr.base(), adr.index(), adr.shift(), adr.op())), cond);
        (this->*sgl)(Rt2, Address(adr.base(), wordSize), cond);
        compensate_addr_offset(adr, cond);
      } else {
        ShouldNotReachHere();
      }
    }
  }
}

#ifdef ASSERT
void MacroAssembler::verify_stack_alignment() {
  if (StackAlignmentInBytes > 4) {
    Label x;
    tst(sp, StackAlignmentInBytes-1);
    b(x, EQ);
    stop("stack unaligned");
    bind(x);
  }
}
#endif

/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out] crc  Register containing the crc.
 * @param [in] val      Register containing the byte to fold into the CRC.
 * @param [in] table    Register containing the table of crc constants.
 *
 * uint32_t crc;
 * val = crc_table[(val ^ crc) & 0xFF];
 * crc = val ^ (crc >> 8);
 *
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  eor(val, val, crc);
  andr(val, val, 0xff);
  ldr(val, Address(table, val, lsl(2)));
  eor(crc, val, crc, Assembler::lsr(8));
}
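
// For reference, a plain C++ sketch of the byte step emitted above, assuming
// the conventional reflected CRC-32 table (polynomial 0xEDB88320, as used by
// zlib); the real tables come from StubRoutines::crc_table_addr():
//
//   uint32_t update_byte(uint32_t crc, uint8_t b, const uint32_t* table) {
//     return table[(crc ^ b) & 0xff] ^ (crc >> 8);
//   }
//
//   // table generation, shown for illustration only:
//   for (uint32_t i = 0; i < 256; i++) {
//     uint32_t c = i;
//     for (int k = 0; k < 8; k++)
//       c = (c & 1) ? 0xEDB88320u ^ (c >> 1) : c >> 1;
//     table[i] = c;
//   }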

/**
 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
 *
 * @param [in,out] crc  Register containing the crc.
 * @param [in] v        Register containing the 32-bit value to fold into the CRC.
 * @param [in] table0   Register containing table 0 of crc constants.
 * @param [in] table1   Register containing table 1 of crc constants.
 * @param [in] table2   Register containing table 2 of crc constants.
 * @param [in] table3   Register containing table 3 of crc constants.
 *
 * uint32_t crc;
 *   v = crc ^ v
 *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
 *
 */
void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
        Register tmp2, Register table0, Register table1, Register table2, Register table3) {
  eor(v, crc, v);
  uxtb(tmp, v);
  uxtb(tmp2, v, ror(8));
  ldr(crc, Address(table3, tmp, lsl(2)));
  ldr(tmp2, Address(table2, tmp2, lsl(2)));
  uxtb(tmp, v, ror(16));
  eor(crc, crc, tmp2);
  uxtb(tmp2, v, ror(24));
  ldr(tmp, Address(table1, tmp, lsl(2)));
  ldr(tmp2, Address(table0, tmp2, lsl(2)));
  eor(crc, crc, tmp);
  eor(crc, crc, tmp2);
}
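
// The same slicing-by-4 step in plain C++ (a sketch; the derived tables
// satisfy table_n[i] == (table_{n-1}[i] >> 8) ^ table0[table_{n-1}[i] & 0xff]):
//
//   v ^= crc;
//   crc = table3[v & 0xff] ^ table2[(v >> 8) & 0xff]
//       ^ table1[(v >> 16) & 0xff] ^ table0[(v >> 24) & 0xff];
//
// Four bytes are folded with four independent table lookups, which is why the
// emitted code can overlap the loads and xors.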

/**
 * @param crc    register containing existing CRC (32-bit)
 * @param buf    register pointing to input byte buffer (byte*)
 * @param len    register containing number of bytes
 * @param table0 register that will contain the address of CRC table 0
 *               (table1..table3 likewise for tables 1 to 3)
 * @param tmp    scratch register (as are tmp2 and tmp3)
 */
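// A functional model of what the emitted code computes (a sketch, not the
// instruction sequence; update_byte is the table-driven step sketched above):
//
//   uint32_t crc32_model(uint32_t crc, const uint8_t* buf, size_t len,
//                        const uint32_t* table0) {
//     crc = ~crc;
//     while (len--) crc = update_byte(crc, *buf++, table0);
//     return ~crc;
//   }
//
// The emitted code is equivalent, but uses the CRC32 instructions when
// available, folds 16 bytes per iteration in the NEON loop, and consumes
// 4 bytes per step in the CPU word loop.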
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3) {
  Label L_cpu, L_by8_loop, L_by1, L_by1_loop, L_align_by1_loop, L_align_exit, L_exit;

  inv(crc, crc);
  if (UseCRC32) {
    Label CRC_by4_loop, CRC_by1_loop;

      subs(len, len, 4);
      b(CRC_by4_loop, Assembler::GE);
      adds(len, len, 4);
      b(CRC_by1_loop, Assembler::GT);
      b(L_exit);

    BIND(CRC_by4_loop);
      ldr(tmp, Address(post(buf, 4)));
      subs(len, len, 4);
      crc32w(crc, crc, tmp);
      b(CRC_by4_loop, Assembler::GE);
      adds(len, len, 4);
      b(L_exit, Assembler::LE);
    BIND(CRC_by1_loop);
      ldrb(tmp, Address(post(buf, 1)));
      subs(len, len, 1);
      crc32b(crc, crc, tmp);
      b(CRC_by1_loop, Assembler::GT);
    BIND(L_exit);
      inv(crc, crc);
      return;
  }
    lea(table0, ExternalAddress(StubRoutines::crc_table_addr()));
    add(table1, table0, 1*256*sizeof(juint));
    add(table2, table0, 2*256*sizeof(juint));
    add(table3, table0, 3*256*sizeof(juint));

  BIND(L_align_by1_loop);
    tst(buf, 3);
    b(L_align_exit, Assembler::EQ);
    cmp(len, 0);
    b(L_exit, Assembler::EQ);
    sub(len, len, 1);
    ldrb(tmp, Address(post(buf, 1)));
    update_byte_crc32(crc, tmp, table0);
    b(L_align_by1_loop);

  BIND(L_align_exit);

  if (UseNeon) {
      cmp(len, 32+12); // account for possible need for alignment
      b(L_cpu, Assembler::LT);

    Label L_fold, L_align_by4_loop, L_align_by4_exit;

    BIND(L_align_by4_loop);
      tst(buf, 0xf);
      b(L_align_by4_exit, Assembler::EQ);
      ldr(tmp, Address(post(buf, 4)));
      update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
      sub(len, len, 4);
      b(L_align_by4_loop);

    BIND(L_align_by4_exit);

      add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants

      vld1_64(d0, d1, post(buf, 16), Assembler::ALIGN_128);
      vld1_64(d4, post(tmp, 8), Assembler::ALIGN_64);
      vld1_64(d5, post(tmp, 8), Assembler::ALIGN_64);
      vld1_64(d6, post(tmp, 8), Assembler::ALIGN_64);
      vld1_64(d7, post(tmp, 8), Assembler::ALIGN_64);
      veor_64(d16, d16, d16);
      vmov_32(d16, 0, crc);

      veor_64(d0, d0, d16);
      sub(len, len, 32);

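      // The fold loop consumes 16 input bytes per iteration: q0 is multiplied
      // against the folding constants loaded above (the block following the
      // four tables) using polynomial (carry-less) 8-bit multiplies, and the
      // vuzp/vshll/veor sequences below recombine the partial products before
      // folding the result into the next 16 bytes of input.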
    BIND(L_fold);
      vmullp_8(q8, d0, d5);
      vmullp_8(q9, d0, d7);
      vmullp_8(q10, d0, d4);
      vmullp_8(q11, d0, d6);

      vmullp_8(q12, d1, d5);
      vmullp_8(q13, d1, d7);
      vmullp_8(q14, d1, d4);
      vmullp_8(q15, d1, d6);

      vuzp_128_16(q9, q8);
      veor_128(q8, q8, q9);

      vuzp_128_16(q13, q12);
      veor_128(q12, q12, q13);

      vshll_16u(q9, d16, 8);
      vshll_16u(q8, d17, 8);

      vshll_16u(q13, d24, 8);
      vshll_16u(q12, d25, 8);

      veor_128(q8, q8, q10);
      veor_128(q12, q12, q14);
      veor_128(q9, q9, q11);
      veor_128(q13, q13, q15);

      veor_64(d19, d19, d18);
      veor_64(d18, d27, d26);

      vshll_32u(q13, d18, 16);
      vshll_32u(q9, d19, 16);

      veor_128(q9, q8, q9);
      veor_128(q13, q12, q13);

      veor_64(d31, d26, d27);
      veor_64(d30, d18, d19);

      vshl_128_64(q15, q15, 1);
      vld1_64(d0, d1, post(buf, 16), Assembler::ALIGN_128);
      veor_128(q0, q0, q15);

      subs(len, len, 16);
      b(L_fold, Assembler::GE);

      vmov_32(tmp, d0, 0);
      mov(crc, 0);
      update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
      vmov_32(tmp, d0, 1);
      update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
      vmov_32(tmp, d1, 0);
      update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
      vmov_32(tmp, d1, 1);
      update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);

      add(len, len, 16);
  }

  BIND(L_cpu);
    subs(len, len, 8);
    b(L_by8_loop, Assembler::GE);
    adds(len, len, 8);
    b(L_by1_loop, Assembler::GT);
    b(L_exit);

  BIND(L_by8_loop);
    ldr(tmp, Address(post(buf, 4)));
    update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
    ldr(tmp, Address(post(buf, 4)));
    update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
    subs(len, len, 8);
    b(L_by8_loop, Assembler::GE);
    adds(len, len, 8);
    b(L_exit, Assembler::LE);
  BIND(L_by1_loop);
    subs(len, len, 1);
    ldrb(tmp, Address(post(buf, 1)));
    update_byte_crc32(crc, tmp, table0);
    b(L_by1_loop, Assembler::GT);

  BIND(L_exit);
    inv(crc, crc);
}

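// bfc_impl implements a bit-field clear: it zeroes width bits of Rd starting
// at lsb. Wide fields touching bit 0 or bit 31 are cleared with a shift pair;
// everything else uses a sequence of bic instructions whose masks each span
// at most 8 bits beginning at an even rotation (lsb1 trims the first chunk
// when lsb is odd), so every mask is encodable as an ARM modified immediate.
// For example (hypothetical operands), clearing bits 3..22 with
// bfc_impl(r0, 3, 20, cond) emits:
//   bic r0, r0, #0x7f << 3    // bits 3..9
//   bic r0, r0, #0xff << 10   // bits 10..17
//   bic r0, r0, #0x1f << 18   // bits 18..22
// cond is forwarded to every emitted instruction, assuming the usual trailing
// Condition parameter on the Assembler emitters.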
void MacroAssembler::bfc_impl(Register Rd, int lsb, int width, Condition cond) {
  if (width > 15 && lsb == 0) {
    lsr(Rd, Rd, width, cond);
    lsl(Rd, Rd, width, cond);
  } else if (width > 15 && lsb + width == 32) {
    lsl(Rd, Rd, 32 - lsb, cond);
    lsr(Rd, Rd, 32 - lsb, cond);
  } else {
    const int lsb1 = (lsb & 1);
    int w1 = width <= 8 - lsb1 ? width : 8 - lsb1;
    while (width) {
      bic(Rd, Rd, ((1 << w1) - 1) << lsb, cond);
      width -= w1;
      lsb += w1;
      w1 = width > 8 ? 8 : width;
    }
  }
}