/*
 * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * Copyright (c) 2015, Linaro Ltd. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "interpreter/interpreter.hpp"

#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_aarch32.hpp"
// This ifdef was introduced so that a core build can be built
#ifdef COMPILER2
#include "opto/compile.hpp"
#include "opto/node.hpp"
#endif

#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/sharedRuntime.hpp"

#if INCLUDE_ALL_GCS
#include "gc_implementation/g1/g1CollectedHeap.inline.hpp"
#include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp"
#include "gc_implementation/g1/heapRegion.hpp"
#endif

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// FIXME This is not a nice fix; this constant was in a compiler2 header
#define MAX_stubs_size_div2 (128 / 2)
// FIXME END

// Note the PC corrections in the following functions: every literal
// addressing mode that uses the PC must have its offset adjusted for
// the ARM PC bias (a read of the PC yields the current instruction + 8).
// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.
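//
// For example (a sketch): a B instruction at 0x8000 whose target is 0x9000
// encodes imm24 = (0x9000 - (0x8000 + 8)) / 4 = 0x3FE in bits 23:0.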

int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  // Note the PC corrections
  int instructions = 1;
  long offset = target - (branch + 8); // correct for the PC bias: PC reads as this instruction + 8
  bool add = offset >= 0;
  unsigned insn = *(unsigned*)branch;
  int opc = Instruction_aarch32::extract(insn, 27, 24);

  if(0b1010 == opc || 0b1011 == opc) {
    // Branch or branch with link
    assert(0 == (offset & 3), "not aligned correctly");
    Instruction_aarch32::spatch(branch, 23, 0, offset / 4);
  } else if (0b0011 == opc) {
    // movw/movt or mov, orr, orr, orr
    // patch up address load to registers (absolute address).
    instructions = patch_oop(branch, target) / NativeInstruction::arm_insn_sz;
  } else if (0b010 == (opc >> 1)) {
    // LDR, LDRB, STR, STRB
    Instruction_aarch32::patch(branch, 11, 0, uabs(offset));
    Instruction_aarch32::patch(branch, 23, 23, add);
  } else if (0b000 == (opc >> 1)) {
    // LDRH, LDRSH, LDRSB, LDRD, STRH, STRD
    offset = uabs(offset);
    Instruction_aarch32::patch(branch, 3, 0, offset & 0xf);
    Instruction_aarch32::patch(branch, 11, 8, offset >> 4);
    Instruction_aarch32::patch(branch, 23, 23, add);
  } else if (0b1101 == opc) {
    // VLDR, VSTR - NOTE VSTR(lit) is deprecated
    offset = uabs(offset);
    assert(0 == (offset & 3), "vldr, vstr can't do unaligned access");
    Instruction_aarch32::patch(branch, 7, 0, offset >> 2);
    Instruction_aarch32::patch(branch, 23, 23, add);
  } else if (0b0010 == opc) {
    // ADR
    Instruction_aarch32::patch(branch, 11, 0, encode_imm12(uabs(offset)));
    Instruction_aarch32::patch(branch, 23, 22, add ? 0b10 : 0b01);
  } else {
    ShouldNotReachHere();
  }
  // aarch64 had something for polling page load?
  return instructions * NativeInstruction::arm_insn_sz;
}

int MacroAssembler::patch_oop(address insn_addr, address o) {
  unsigned insn = *(unsigned*)insn_addr;
  int opc = Instruction_aarch32::extract(insn, 27, 21);
  if(0b0011000 == opc) {
    // 32-bit pointer, formed of a movw and a movt
    assert(nativeInstruction_at(insn_addr+4)->is_movt(), "wrong insns in patch");

    uint32_t btm = (uint32_t)o & 0xffff;
    Instruction_aarch32::patch(insn_addr, 19, 16, btm >> 12);
    Instruction_aarch32::patch(insn_addr, 11, 0, btm & 0xfff);
    uint32_t top = (uint32_t)o >> 16;
    Instruction_aarch32::patch(insn_addr + 4, 19, 16, top >> 12);
    Instruction_aarch32::patch(insn_addr + 4, 11, 0, top & 0xfff);
    return 2 * NativeInstruction::arm_insn_sz;
  } else if(0b0011101 == opc) {
    // Otherwise the 32-bit load sequence uses mov, orr, orr, orr
    assert(nativeInstruction_at(insn_addr+4 )->is_orr(), "wrong insns in patch");
    assert(nativeInstruction_at(insn_addr+8 )->is_orr(), "wrong insns in patch");
    assert(nativeInstruction_at(insn_addr+12)->is_orr(), "wrong insns in patch");
    // FIXME this could carry us outside valid memory
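    // Each of the four immediates below is an 8-bit byte plus a 4-bit rotate
    // field (the value is the byte rotated right by 2 * rotate). A sketch of
    // the split, assuming o == (address)0xDEADBEEF:
    //   mov r, #0xEF             (rotate 0b0000: byte <<  0)
    //   orr r, r, #0xBE00        (rotate 0b1100: byte <<  8)
    //   orr r, r, #0xAD0000      (rotate 0b1000: byte << 16)
    //   orr r, r, #0xDE000000    (rotate 0b0100: byte << 24)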

    uint32_t addr = (uint32_t)o;
    Instruction_aarch32::patch(insn_addr + 0,  11, 0, (0b0000 << 8) | ((addr >>  0) & 0xff));
    Instruction_aarch32::patch(insn_addr + 4,  11, 0, (0b1100 << 8) | ((addr >>  8) & 0xff));
    Instruction_aarch32::patch(insn_addr + 8,  11, 0, (0b1000 << 8) | ((addr >> 16) & 0xff));
    Instruction_aarch32::patch(insn_addr + 12, 11, 0, (0b0100 << 8) | ((addr >> 24) & 0xff));
    return 4 * NativeInstruction::arm_insn_sz;
  } else {
    ShouldNotReachHere();
  }
  return 0; // won't reach here
}

address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  int opc = Instruction_aarch32::extract(insn, 27, 24);

  if(0b1010 == opc || 0b1011 == opc) {
    // Branch or branch with link
    offset = Instruction_aarch32::sextract(insn, 23, 0) * 4;
  } else if (0b0011 == opc) {
    unsigned *insn_buf = (unsigned*)insn_addr;
    int opc2 = Instruction_aarch32::extract(insn, 23, 21);
    if(0b000 == opc2) {
      // movw, movt (only on newer ARMs)
      assert(nativeInstruction_at(&insn_buf[1])->is_movt(), "wrong insns in patch");
      uint32_t addr;
      addr  = Instruction_aarch32::extract(insn_buf[1], 19, 16) << 28;
      addr |= Instruction_aarch32::extract(insn_buf[1], 11, 0) << 16;
      addr |= Instruction_aarch32::extract(insn_buf[0], 19, 16) << 12;
      addr |= Instruction_aarch32::extract(insn_buf[0], 11, 0);
      return address(addr);
    } else if(0b101 == opc2) {
      // mov, orr, orr, orr
      assert(nativeInstruction_at(&insn_buf[1])->is_orr(), "wrong insns in patch");
      assert(nativeInstruction_at(&insn_buf[2])->is_orr(), "wrong insns in patch");
      assert(nativeInstruction_at(&insn_buf[3])->is_orr(), "wrong insns in patch");
      uint32_t addr;
      addr  = Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[0], 11, 0));
      addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[1], 11, 0));
      addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[2], 11, 0));
      addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[3], 11, 0));
      return address(addr);
    } else {
      ShouldNotReachHere();
    }
  } else if (0b010 == (opc >> 1)) {
    // LDR, LDRB, STR, STRB
    offset = Instruction_aarch32::extract(insn, 11, 0);
    bool add = Instruction_aarch32::extract(insn, 23, 23);
    offset = add ? offset : -offset;
  } else if (0b000 == (opc >> 1)) {
    // LDRH, LDRSH, LDRSB, LDRD, STRH, STRD
    offset = Instruction_aarch32::extract(insn, 3, 0);
    offset |= Instruction_aarch32::extract(insn, 11, 8) << 4;
    bool add = Instruction_aarch32::extract(insn, 23, 23);
    offset = add ? offset : -offset;
  } else if (0b1101 == opc) {
    // VLDR, VSTR - NOTE VSTR(lit) is deprecated
    offset = Instruction_aarch32::extract(insn, 7, 0) << 2;
    bool add = Instruction_aarch32::extract(insn, 23, 23);
    offset = add ? offset : -offset;
  } else if (0b0010 == opc) {
    // ADR
    offset = decode_imm12(Instruction_aarch32::extract(insn, 11, 0));
    int code = Instruction_aarch32::extract(insn, 23, 22);
    switch(code) {
      case 0b01: offset = -offset; break;
      case 0b10:                   break;
      default: ShouldNotReachHere();
    }
  } else {
    ShouldNotReachHere();
  }
  // Correct the offset for the PC bias: the encoded offset is relative
  // to PC = insn_addr + 8, the inverse of pd_patch_instruction_size above.
  offset += 8;
  return address(((uint32_t)insn_addr + offset));
}


void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  dmb(Assembler::ISH);
}


void MacroAssembler::reset_last_Java_frame(bool clear_fp,
                                           bool clear_pc) {
  mov(rscratch1, 0);
  // we must set sp to zero to clear frame
  str(rscratch1, Address(rthread, JavaThread::last_Java_sp_offset()));
  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(rscratch1, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  if (clear_pc) {
    str(rscratch1, Address(rthread, JavaThread::last_Java_pc_offset()));
  }
}

// Calls to C land
//
// When entering C land, the rfp & sp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = sp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc,
                                         Register scratch) {
  if (last_java_pc != NULL) {
    adr(scratch, last_java_pc);
  } else {
    // FIXME: This is almost never correct.  We should delete all
    // cases of set_last_Java_frame with last_java_pc=NULL and use the
    // correct return address instead.
    adr(scratch, pc());
  }

  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
  }
}

void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  // TODO performance issue: if intended to be patched later,
  // generate mov rX, imm; bl rX far call (to reserve space)
  if (far_branches()) {
    lea(tmp, entry);
    if (cbuf) cbuf->set_insts_mark();
    bl(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  assert(!external_word_Relocation::is_reloc_index((intptr_t)entry.target()), "can't far jump to reloc index");
  if (far_branches()) {
    lea(tmp, entry);
    if (cbuf) cbuf->set_insts_mark();
    b(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert_different_registers(lock_reg, obj_reg, swap_reg);

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  bool need_tmp_reg = false;
  if (tmp_reg == noreg) {
    tmp_reg = rscratch2;
  }
  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg, rscratch1);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
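  // A sketch of the mark word layout assumed here (cf. markOop.hpp):
  //   [ JavaThread* or hash ... | epoch:2 | age:4 | biased_lock:1 | lock:2 ]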
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr     (obj_reg, oopDesc::klass_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, markOopDesc::biased_lock_pattern);
  b(cas_label, Assembler::NE);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
//  andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  bic(tmp_reg, tmp_reg, markOopDesc::age_mask_in_place);
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_inc(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, rscratch1);
    b(done);
    bind(around);
  } else {
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(rscratch1, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(rscratch1, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(rscratch1, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(rscratch1, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(rscratch1, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, rscratch1);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_inc(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                  tmp_reg, rscratch1);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_inc(Address((address)counters->rebiased_lock_entry_count_addr()),
                  tmp_reg, rscratch1);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchgptr(swap_reg, tmp_reg, obj_reg, rscratch1, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_inc(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                  rscratch1);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, markOopDesc::biased_lock_pattern);
  b(done, Assembler::EQ);
}


static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = sp;
  }

  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch2);


  // FIXME - Can we save lr in a more elegant way?
  //str(lr, pre(sp, -wordSize));

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  //ldr(lr, post(sp, wordSize));

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true, true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch2, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch2, ok);

    lea(rscratch2, RuntimeAddress(StubRoutines::forward_exception_entry()));
    // forward_exception uses LR to choose the exception handler, but LR was trashed by
    // the code above; since we get here from interpreted code, a BL is an acceptable
    // way to acquire the correct LR (see StubGenerator::generate_forward_exception)
    bl(rscratch2);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.

void MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  if (cbuf) {
    cbuf->set_insts_mark();
  }

  if (far_branches()) {
    // Make the trampoline such that the destination address is a raw 4-byte
    // value, so it can be patched atomically.
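    // The emitted sequence looks like this (a sketch, assuming
    // NativeCall::instruction_size covers the three words below):
    //   add lr, pc, #(NativeCall::instruction_size - 8) ; lr = sequence start + instruction_size
    //   ldr pc, [pc, #-4]                               ; jump via the word that follows
    //   .word <target>                                  ; raw destination, patched atomically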
    relocate(entry.rspec());
    add(lr, r15_pc, NativeCall::instruction_size - 2 * NativeInstruction::arm_insn_sz);
    ldr(r15_pc, Address(r15_pc, -4));
    emit_int32((uintptr_t) entry.target());
    // possibly pad the call to the NativeCall size to make patching happy
    for (int i = NativeCall::instruction_size; i > 3 * NativeInstruction::arm_insn_sz; i -= NativeInstruction::arm_insn_sz)
      nop();
  } else {
    bl(entry);
  }
}

void MacroAssembler::ic_call(address entry) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc());
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  assert(oop_result != rscratch2, "can't be");
  mov(rscratch2, 0);
  str(rscratch2, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  assert(metadata_result != rscratch2 &&
         java_thread != rscratch2, "can't be");
  mov(rscratch2, 0);
  str(rscratch2, Address(java_thread, JavaThread::vm_result_2_offset()));
}

void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface) {
  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step   = itableOffsetEntry::size() * wordSize;
  int vte_size    = vtableEntry::size() * wordSize;
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldr(scan_temp, Address(recv_klass, InstanceKlass::vtable_length_offset() * wordSize));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  lea(scan_temp, Address(recv_klass, scan_temp, lsl(2)));
  add(scan_temp, scan_temp, vtable_base);
  if (HeapWordsPerLong > 1) {
    // Round up to align_object_offset boundary
    // see code for instanceKlass::start_of_itable!
    round_to(scan_temp, BytesPerLong);
  }

  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
  // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
  lea(recv_klass, itable_index.is_register() ?
          Address(recv_klass, itable_index, lsl(2)) :
          Address(recv_klass, itable_index.as_constant() << 2));
  if (itentry_off)
    add(recv_klass, recv_klass, itentry_off);

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      b(found_method, Assembler::EQ);
    } else {
      b(search, Assembler::NE);
      // (invert the test to fall through to found_method...)
    }

    if (!peel)  break;

    bind(search);

    // Check that the previous entry is non-null.  A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  ldr(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
  ldr(method_result, Address(recv_klass, scan_temp));
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = InstanceKlass::vtable_start_offset() * wordSize;
  //assert(vtableEntry::size() * wordSize == 8,
  //       "adjust the scaling in the code below");
  // FIXME: check what scaling needs changing, as indexes address by one word
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();

  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    ldr(method_result, Address(recv_klass, vtable_offset_in_bytes));
  }
}

void MacroAssembler::check_klass_subtype(Register sub_klass,
                           Register super_klass,
                           Register temp_reg,
                           Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  b(*L_success, Assembler::EQ);

  // Check the supertype display:
  if (must_load_sco) {
    ldr(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr); // load displayed supertype
  cmp(super_klass, rscratch1);

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    b(*L_success, Assembler::EQ);
    cmp(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      b(*L_slow_path, Assembler::EQ);
    } else {
      b(*L_failure, Assembler::NE);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      b(*L_success, Assembler::EQ);
    } else {
      b(*L_slow_path, Assembler::NE);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      b(*L_success, Assembler::EQ);
    } else {
      b(*L_failure, Assembler::NE);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer-sized words at [addr] for an occurrence of value,
// generic; on return the Z flag is set if the value was found
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label loop, fail, found;
  cmp(count, 0);
  b(fail, EQ);

  bind(loop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  b(found, EQ);
  subs(count, count, 1);
  b(loop, NE);

  bind(fail);
  cmp(sp, 0); // sp never zero
  bind(found);
}

// Form an address from base + offset in Rd.  Rd may or may
// not actually be used: you must use the Address that is returned.
// It is up to you to ensure that the shift provided matches the size
// of your data.
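// For example (a sketch): form_address(rscratch1, rthread, 0x12345) should
// emit "add rscratch1, rthread, #0x12000" and return Address(rscratch1, 0x345),
// since 0x345 fits in a 12-bit offset and 0x12000 is a valid rotated immediate.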
Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
  // form_address result should only be used together with ldr/str instructions
  // otherwise please provide exact type instead of IDT_INT or apply safe_for()
  if (Address::offset_ok_for_immed(byte_offset, Address::IDT_INT))
    // It fits; no need for any heroics
    return Address(base, byte_offset);

  // See if we can do this with two 12-bit offsets
  {
    unsigned long masked_offset = byte_offset & ~0xfff;
    if (Address::offset_ok_for_immed(byte_offset - masked_offset, Address::IDT_INT)
        && Assembler::operand_valid_for_add_sub_immediate(masked_offset)) {
      add(Rd, base, masked_offset);
      byte_offset -= masked_offset;
      return Address(Rd, byte_offset);
    }
  }

  // Do it the hard way
  mov(Rd, byte_offset);
  add(Rd, base, Rd);
  return Address(Rd);
}

// scans count 4-byte words at [addr] for an occurrence of value,
// generic
/*void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  b(Lexit, EQ);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}*/

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  // Get super_klass value into r0 (even if it was in r14 or r2).
  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))    pushed_registers += r2;
  if (!IS_A_TEMP(r14))   pushed_registers += r14;

  if (super_klass != r0) {
    if (!IS_A_TEMP(r0))   pushed_registers += r0;
  }

  push(pushed_registers, sp);

#ifndef PRODUCT
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r14, secondary_supers_addr);
  // Load the array length.
  ldr(r2, Address(r14, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r14, r14, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, 0); // Clear Z flag; SP is never zero
  // Scan R2 words at [R14] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r14, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  b(*L_failure, Assembler::NE);

  // Success.  Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  stmdb(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());

  mov(r0, reg);
  mov(rscratch1, (address)b);
  mrs(r1);

  // call indirectly to solve generation ordering problem
  reg_printf("Verify oop entry, sp = %p, rfp = %p\n", sp, rfp);
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  bl(rscratch2);
  reg_printf("Verify oop exit,  sp = %p, rfp = %p\n", sp, rfp);

  msr(r1);
  ldmia(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());

  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stmdb(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());
  mrs(r1);

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    lea(r0, addr);
    ldr(r0, Address(r0, 5 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  bl(rscratch2);

  msr(r1);
  ldmia(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());

  BLOCK_COMMENT("} verify_oop_addr");
}

Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  if (arg_slot.is_constant()) {
    return Address(sp, arg_slot.as_constant() * stackElementSize
                   + offset);
  } else {
    add(rscratch1, sp, arg_slot.as_register(),
        lsl(exact_log2(stackElementSize)));
    return Address(rscratch1, offset);
  }
}

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments,
                                       Label *retaddr) {
  Label E, L;

  // FIXME Do this alignment in a more elegant way
  mov(rscratch2, sp);
  sub(sp, sp, wordSize);
  bic(sp, sp, 2 * wordSize - 1); // Align to eight bytes
  str(rscratch2, Address(sp));

  // FIXME Do we need to preserve rscratch2?
  //str(rscratch2, Address(pre(sp, -wordSize)));

  mov(rscratch2, entry_point);
  reg_printf("\tJust about to call into the VM, rfp = %p\n", rfp);
  bl(rscratch2);
  if (retaddr)
    bind(*retaddr);
  reg_printf("\tReturned from call into the VM, rfp = %p\n", rfp);

  //ldr(rscratch2, Address(post(sp, wordSize)));

  // Undo alignment
  ldr(sp, Address(sp));

  maybe_isb();
}

void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
                                  Register arg_1, Register arg_2) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  pass_arg2(this, arg_2);
  call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert(arg_0 != c_rarg3, "smashed arg");
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}

// Clobbers rscratch1
void MacroAssembler::null_check(Register reg, int offset) {
  if (needs_explicit_null_check(offset)) {
    // provoke OS NULL exception if reg = NULL by
    // accessing M[reg] w/o changing any registers
    // NOTE: this is plenty to provoke a segv
    reg_printf("Generating OS check null with ptr = %p\n", reg);
    assert(reg != rscratch1, "can't be");
    ldr(rscratch1, Address(reg));
  } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS NULL exception if reg = NULL
  }
}

// MacroAssembler protected routines needed to implement
// public methods

void MacroAssembler::mov(Register r, Address dest, Condition cond) {
  code_section()->relocate(pc(), dest.rspec());
  uint32_t imm32 = (uint32_t)dest.target();
  movptr(r, imm32, cond);
}
1334 
1335 // Move a constant pointer into r.  On aarch32 the address space is
1336 // 32 bits in size, so a pointer can be encoded in two mov
1337 // instructions.
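     //
     // A minimal sketch of what the pair typically looks like on ARMv7
     // (an assumption for illustration; pre-v7 cores would need a
     // different, e.g. mov/orr, sequence):
     //   movw r, #(imm32 & 0xffff)      // low halfword
     //   movt r, #(imm32 >> 16)         // high halfword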
1338 void MacroAssembler::movptr(Register r, uintptr_t imm32, Condition cond) {
1339 #ifndef PRODUCT
1340   {
1341     char buffer[64];
1342     snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
1343     block_comment(buffer);
1344   }
1345 #endif
1346   Assembler::mov_immediate32(r, imm32, cond, false);
1347 }
1348 
1349 void MacroAssembler::ret(Register reg) {
1350   assert(reg == lr, "Can do return only to LR");
1351   mov(r15_pc, lr);
1352 }
1353 
1354 void MacroAssembler::atomic_inc(Register counter_addr, Register tmp) {
1355   Label retry_load;
1356   bind(retry_load);
1357   // flush and load exclusive from the memory location
1358   ldrex(tmp, counter_addr);
1359   add(tmp, tmp, 1);
1360   // if we store+flush with no intervening write tmp will be zero
1361   strex(tmp, tmp, counter_addr);
1362   cmp(tmp, 0);
1363   b(retry_load, Assembler::NE);
1364 }
1365 
1366 
1367 // MacroAssembler routines found actually to be needed
1368 
1369 void MacroAssembler::push(Register src)
1370 {
1371   str(src, Address(pre(sp, -1 * wordSize)));
1372 }
1373 
1374 void MacroAssembler::pop(Register dst)
1375 {
1376   ldr(dst, Address(post(sp, 1 * wordSize)));
1377 }
1378 
1379 // Note: load_unsigned_short used to be called load_unsigned_word.
1380 int MacroAssembler::load_unsigned_short(Register dst, Address src) {
1381   int off = offset();
1382   ldrh(dst, src);
1383   return off;
1384 }
1385 
1386 int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
1387   int off = offset();
1388   ldrb(dst, src);
1389   return off;
1390 }
1391 
1392 int MacroAssembler::load_signed_short(Register dst, Address src) {
1393   int off = offset();
1394   ldrsh(dst, src);
1395   return off;
1396 }
1397 
1398 int MacroAssembler::load_signed_byte(Register dst, Address src) {
1399   int off = offset();
1400   ldrsb(dst, src);
1401   return off;
1402 }
1403 
1404 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
1405   switch (size_in_bytes) {
1406   //case  8:  ldr(dst, src); break;
1407   case  4:  ldr(dst, src); break;
1408   case  2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
1409   case  1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
1410   default:  ShouldNotReachHere();
1411   }
1412 }
1413 
1414 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
1415   switch (size_in_bytes) {
1416   //case  8:  str(src, dst); break;
1417   case  4:  str(src, dst); break;
1418   case  2:  strh(src, dst); break;
1419   case  1:  strb(src, dst); break;
1420   default:  ShouldNotReachHere();
1421   }
1422 }
1423 
1424 void MacroAssembler::decrement(Register reg, int value) {
1425   if (value < 0) {
1426     increment(reg, -value);
1427     return;
1428   }
1429   if (value == 0) {
1430     return;
1431   }
1432   if (operand_valid_for_add_sub_immediate(value)) {
1433     sub(reg, reg, value);
1434     return;
1435   }
1436   assert(reg != rscratch2, "invalid register for decrement");
1437   mov(rscratch2, (unsigned int) value);
1438   sub(reg, reg, rscratch2);
1439 }
1440 
1441 void MacroAssembler::decrement(Address dst, int value) {
1442   assert(!dst.uses(rscratch1), "invalid address for decrement");
1443   ldr(rscratch1, dst);
1444   decrement(rscratch1, value);
1445   str(rscratch1, dst);
1446 }
1447 
1448 void MacroAssembler::increment(Register reg, int value) {
1449   if (value < 0) {
1450     decrement(reg, -value);
1451     return;
1452   }
1453   if (value == 0) {
1454     return;
1455   }
1456   if (operand_valid_for_add_sub_immediate(value)) {
1457     add(reg, reg, value);
1458     return;
1459   }
1460   assert(reg != rscratch2, "invalid register for increment");
1461   mov(rscratch2, (unsigned int) value);
1462   add(reg, reg, rscratch2);
1463 }
1464 
1465 void MacroAssembler::increment(Address dst, int value) {
1466   assert(!dst.uses(rscratch1), "invalid address for increment");
1467   ldr(rscratch1, dst);
1468   increment(rscratch1, value);
1469   str(rscratch1, dst);
1470 }
1471 
1472 // Loads and stores everything except the pc and sp
1473 void MacroAssembler::pusha() {
1474   unsigned regset = 0b0101111111111111;
1475   stmdb(sp, regset);
1476 }
1477 void MacroAssembler::popa() {
1478   unsigned regset = 0b0101111111111111;
1479   ldmia(sp, regset);
1480 }
1481 
1482 static void multiple_reg_check(unsigned int bitset, Register stack) {
1483   const unsigned int pcbit = 1 << r15_pc->encoding();
1484   const unsigned int lrbit = 1 << lr->encoding();
1485   const unsigned int spbit = 1 << sp->encoding();
1486   const unsigned int stackbit = 1 << stack->encoding();
1487   assert(!(bitset & spbit), "The SP must not be in the list: ARM "
1488       "deprecates using these instructions with the SP in the list.");
1489   assert(!(bitset & pcbit) || !(bitset & lrbit),
1490       "ARM deprecates using these instructions with both "
1491       "the LR and the PC in the list.");
1492   assert(!(bitset & stackbit), "Instructions with the base register "
1493       "in the list and ! specified are only available before ARMv7, "
1494       "and ARM deprecates the use of such instructions. "
1495       "The value of the base register after such an instruction is UNKNOWN");
1496 }
1497 
1498 // Push lots of registers in the bit set supplied.  Don't push sp.
1499 // Return the number of words pushed
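     // e.g. push(0b0000000000000110, sp) stores r1 and r2 with a single
     // stmdb and returns 2.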
1500 int MacroAssembler::push(unsigned int bitset, Register stack) {
1501   multiple_reg_check(bitset, stack);
1502   unsigned bc = bitset, count = 0, i;
1503   for(i = 0; i <= 15; i++) {
1504     if (1 & bc) count++;
1505     bc >>= 1;
1506   }
1507   // TODO Also why did it only do even quantities before?
1508   stmdb(stack, bitset);
1509   return count;
1510 }
1511 
1512 int MacroAssembler::pop(unsigned int bitset, Register stack) {
1513   multiple_reg_check(bitset, stack);
1514   unsigned bc = bitset, count = 0, i;
1515   for(i = 0; i <= 15; i++) {
1516     if (1 & bc) count++;
1517     bc >>= 1;
1518   }
1519   // TODO Also why did it only do even quantities before?
1520   ldmia(stack, bitset);
1521   return count;
1522 }
1523 
1524 void MacroAssembler::stop(const char* msg) {
1525   pusha();
1526   // Save old sp value
1527   add(rscratch2, sp, 14 * wordSize);
1528   str(rscratch2, Address(pre(sp, -4)));
1529   mov(c_rarg0, (address)msg);
1530   mov(c_rarg1, r15_pc);
1531   sub(c_rarg1, c_rarg1, 8); // Restore to actual value
1532   mov(c_rarg2, sp);
1533   mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug32));
1534   bl(c_rarg3);
1535   hlt(0);
1536 }
1537 
1538 // this simulates the behaviour of the x86 cmpxchg instruction using a
1539 // load linked/store conditional pair. we use the acquire/release
1540 // versions of these instructions so that we flush pending writes as
1541 // per Java semantics.
1542 
1543 // n.b the x86 version assumes the old value to be compared against is
1544 // in rax and updates rax with the value located in memory if the
1545 // cmpxchg fails. we supply a register for the old value explicitly
1546 
1547 // the aarch32 load linked/store conditional instructions do not
1548 // accept an offset. so, unlike x86, we must provide a plain register
1549 // to identify the memory word to be compared/exchanged rather than a
1550 // register+offset Address.
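     //
     // A C-level sketch of the loop emitted below (illustrative only, not
     // what is literally generated):
     //
     //   for (;;) {
     //     tmp = load_exclusive(addr);              // ldrex
     //     if (tmp != oldv) break;                  // compare failed
     //     if (store_exclusive(addr, newv) == 0)    // strex, 0 == success
     //       goto succeed;
     //   }
     //   oldv = tmp;                                // report the value seen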
1551 
1552 void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
1553                                 Label &succeed, Label *fail) {
1554   // oldv holds comparison value
1555   // newv holds value to write in exchange
1556   // addr identifies memory word to compare against/update
1557   // tmp returns 0/1 for success/failure
1558   Label retry_load, nope;
1559 
1560   bind(retry_load);
1561   // flush and load exclusive from the memory location
1562   // and fail if it is not what we expect
1563   ldrex(tmp, addr);
1564   cmp(tmp, oldv);
1565   b(nope, Assembler::NE);
1566   // if we store+flush with no intervening write tmp will be zero
1567   strex(tmp, newv, addr);
1568   cmp(tmp, 0);
1569   b(succeed, Assembler::EQ);
1570   // retry so we only ever return after a load fails to compare
1571   // ensures we don't return a stale value after a failed write.
1572   b(retry_load);
1573   // if the memory word differs we return it in oldv and signal a fail
1574   bind(nope);
1575   membar(AnyAny);
1576   mov(oldv, tmp);
1577   if (fail)
1578     b(*fail);
1579 }
1580 
1581 void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
1582                                 Label &succeed, Label *fail) {
1583   // oldv holds comparison value
1584   // newv holds value to write in exchange
1585   // addr identifies memory word to compare against/update
1586   // tmp returns 0/1 for success/failure
1587   Label retry_load, nope;
1588 
1589   bind(retry_load);
1590   // flush and load exclusive from the memory location
1591   // and fail if it is not what we expect
1592   ldrex(tmp, addr);
1593   cmp(tmp, oldv);
1594   b(nope, Assembler::NE);
1595   // if we store+flush with no intervening write tmp will be zero
1596   strex(tmp, newv, addr);
1597   cmp(tmp, 0);
1598   b(succeed, Assembler::EQ);
1599   // retry so we only ever return after a load fails to compare
1600   // ensures we don't return a stale value after a failed write.
1601   b(retry_load);
1602   // if the memory word differs we return it in oldv and signal a fail
1603   bind(nope);
1604   membar(AnyAny);
1605   mov(oldv, tmp);
1606   if (fail)
1607     b(*fail);
1608 }
1609 
1610 void MacroAssembler::incr_allocated_bytes(Register thread,
1611                                           Register var_size_in_bytes,
1612                                           int con_size_in_bytes,
1613                                           Register t1) {
1614   if (!thread->is_valid()) {
1615     thread = rthread;
1616   }
1617   assert(t1->is_valid(), "need temp reg");
1618 
1619   ldr(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
1620   if (var_size_in_bytes->is_valid()) {
1621     add(t1, t1, var_size_in_bytes);
1622   } else {
1623     add(t1, t1, con_size_in_bytes);
1624   }
1625   str(t1, Address(thread, in_bytes(JavaThread::allocated_bytes_offset())));
1626 }
1627 
1628 #ifndef PRODUCT
1629 extern "C" void findpc(intptr_t x);
1630 #endif
1631 
1632 void MacroAssembler::debug32(char* msg, int32_t pc, int32_t regs[])
1633 {
1634   print_unseen_bytecodes();
1635   // In order to get locks to work, we need to fake an in_VM state
1636   if (ShowMessageBoxOnError) {
1637     JavaThread* thread = JavaThread::current();
1638     JavaThreadState saved_state = thread->thread_state();
1639     thread->set_thread_state(_thread_in_vm);
1640 #ifndef PRODUCT
1641     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
1642       ttyLocker ttyl;
1643       BytecodeCounter::print();
1644     }
1645 #endif
1646     if (os::message_box(msg, "Execution stopped, print registers?")) {
1647       ttyLocker ttyl;
1648       tty->print_cr(" pc = 0x%08x", pc);
1649 #ifndef PRODUCT
1650       tty->cr();
1651       findpc(pc);
1652       tty->cr();
1653 #endif
1654       tty->print_cr("THIS IS WRONG!");
1655       tty->print_cr(" r0 = 0x%08x", regs[0]);
1656       tty->print_cr(" r1 = 0x%08x", regs[1]);
1657       tty->print_cr(" r2 = 0x%08x", regs[2]);
1658       tty->print_cr(" r3 = 0x%08x", regs[3]);
1659       tty->print_cr(" r4 = 0x%08x", regs[4]);
1660       tty->print_cr(" r5 = 0x%08x", regs[5]);
1661       tty->print_cr(" r6 = 0x%08x", regs[6]);
1662       tty->print_cr(" r7 = 0x%08x", regs[7]);
1663       tty->print_cr(" r8 = 0x%08x", regs[8]);
1664       tty->print_cr(" r9 = 0x%08x", regs[9]);
1665       tty->print_cr("r10 = 0x%08x", regs[10]);
1666       tty->print_cr("r11 = 0x%08x", regs[11]);
1667       tty->print_cr("r12 = 0x%08x", regs[12]);
1668       tty->print_cr("r13 = 0x%08x", regs[13]);
1669       tty->print_cr("r14 = 0x%08x", regs[14]);
1670       tty->print_cr("r15 = 0x%08x", regs[15]);
1671       BREAKPOINT;
1672     }
1673     ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
1674   } else {
1675     {
1676       ttyLocker ttyl;
1677       ::tty->print_cr("=============== DEBUG MESSAGE: %s ================", msg);
1678       ::tty->print_cr(" r0 [   arg0    ] = 0x%08x", regs[1]);
1679       ::tty->print_cr(" r1 [   arg1    ] = 0x%08x", regs[2]);
1680       ::tty->print_cr(" r2 [   arg2    ] = 0x%08x", regs[3]);
1681       ::tty->print_cr(" r3 [   arg3    ] = 0x%08x", regs[4]);
1682       ::tty->print_cr(" r4 [ rdispatch ] = 0x%08x", regs[5]);
1683       ::tty->print_cr(" r5 [   rbcp    ] = 0x%08x", regs[6]);
1684       ::tty->print_cr(" r6 [  rlocals  ] = 0x%08x", regs[7]);
1685       ::tty->print_cr(" r7 [  rcpool   ] = 0x%08x", regs[8]);
1686       ::tty->print_cr(" r8 [  rthread  ] = 0x%08x", regs[9]);
1687       ::tty->print_cr(" r9 [ rscratch1 ] = 0x%08x", regs[10]);
1688       ::tty->print_cr("r10 [  rmethod  ] = 0x%08x", regs[11]);
1689       ::tty->print_cr("r11 [    rfp    ] = 0x%08x", regs[12]);
1690       ::tty->print_cr("r12 [ rscratch2 ] = 0x%08x", regs[13]);
1691       ::tty->print_cr("r13 [    sp     ] = 0x%08x", regs[0]);
1692       ::tty->print_cr("r14 [    lr     ] = 0x%08x", regs[14]);
1693       ::tty->print_cr("r15 [    pc     ] = 0x%08x", pc);
1694     }
1695     assert(false, err_msg("DEBUG MESSAGE: %s", msg));
1696   }
1697 }
1698 
1699 void MacroAssembler::push_CPU_state() {
1700   // ensure sp is decremented by a multiple of StackAlignmentInBytes
1701   sub(sp, sp, 4);
1702   // If you change this, also update RegisterSaver::save_live_registers and its map
1703   push(0x1fff, sp); // integer registers except lr & sp & (aarch32 pc)
1704 
1705   int nfloat = 16;
1706   vstmdb_f64(sp, (1 << nfloat) - 1);
1707 }
1708 
1709 void MacroAssembler::pop_CPU_state() {
1710   int nfloat = 16;
1711   vldmia_f64(sp, (1 << nfloat) - 1);
1712   pop(0x1fff, sp); // integer registers except lr & sp & (aarch32 pc)
1713   add(sp, sp, 4);
1714 }
1715 
1716 // Rounds reg up to a multiple of modulus (which must be a power of two)
1717 void MacroAssembler::round_to(Register reg, int modulus) {
1718   // from x86
1719   add(reg, reg, modulus - 1);
1720   bic(reg, reg, modulus - 1); // and( reg, -modulus)
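       // e.g. reg == 13, modulus == 8: 13 + 7 == 20; 20 & ~7 == 16.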
1721 }
1722 
1723 SkipIfEqual::SkipIfEqual(
1724     MacroAssembler* masm, const bool* flag_addr, bool value) {
1725   _masm = masm;
1726   _masm->mov(rscratch1, ExternalAddress((address)flag_addr));
1727   _masm->ldrb(rscratch1, rscratch1);
1728   _masm->cmp(rscratch1, 0);
1729   _masm->b(_label, value ? Assembler::NE : Assembler::EQ);
1730 }
1731 
1732 SkipIfEqual::~SkipIfEqual() {
1733   _masm->bind(_label);
1734 }
1735 
1736 void MacroAssembler::cmpptr(Register src1, Address src2) {
1737   mov(rscratch1, src2);
1738   ldr(rscratch1, Address(rscratch1));
1739   cmp(src1, rscratch1);
1740 }
1741 
1742 void MacroAssembler::store_check(Register obj) {
1743   // Does a store check for the oop in register obj. The content of
1744   // register obj is destroyed afterwards.
1745 
1746   BarrierSet* bs = Universe::heap()->barrier_set();
1747   assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");
1748   CardTableModRefBS* ct = (CardTableModRefBS*)bs;
1749   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
1750 
1751   // The calculation for byte_map_base is as follows:
1752   // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
1753   // So this essentially converts an address to a displacement and
1754   // it will never need to be relocated.
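       //
       // Net effect (illustrative): byte_map_base[obj >> card_shift] = 0,
       // i.e. the card covering obj is marked dirty (on this port
       // dirty_card_val() is 0; see the low-byte assert below).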
1755 
1756   // FIXME: It's not likely that disp will fit into an offset so we
1757   // don't bother to check, but it could save an instruction.
1758   intptr_t disp = (intptr_t) ct->byte_map_base;
1759   mov(rscratch1, disp);
1760   assert((disp & 0xff) == 0, "fix store char 0 below");
1761   strb(rscratch1, Address(rscratch1, obj, lsr((int) CardTableModRefBS::card_shift)));
1762 }
1763 
1764 void MacroAssembler::store_check(Register obj, Address dst) {
1765   store_check(obj);
1766 }
1767 
1768 // Split the store check operation so that other instructions can be scheduled in between
1769 void MacroAssembler::store_check_part_1(Register obj) {
1770   ShouldNotCallThis();
1771 }
1772 
1773 void MacroAssembler::store_check_part_2(Register obj) {
1774   ShouldNotCallThis();
1775 }
1776 
1777 void MacroAssembler::load_klass(Register dst, Register src) {
1778   ldr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
1779 }
1780 
1781 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) {
1782   ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes()));
1783   cmp(trial_klass, tmp);
1784 }
1785 
1786 void MacroAssembler::load_prototype_header(Register dst, Register src) {
1787   load_klass(dst, src);
1788   ldr(dst, Address(dst, Klass::prototype_header_offset()));
1789 }
1790 
1791 void MacroAssembler::store_klass(Register dst, Register src) {
1792   str(src, Address(dst, oopDesc::klass_offset_in_bytes()));
1793 }
1794 
1795 void MacroAssembler::store_klass_gap(Register dst, Register src) { }
1796 
1797 void MacroAssembler::load_heap_oop(Register dst, Address src)
1798 {
1799   ldr(dst, src);
1800 }
1801 
1802 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src)
1803 {
1804   ldr(dst, src);
1805 }
1806 
1807 void MacroAssembler::store_heap_oop(Address dst, Register src) {
1808   str(src, dst);
1809 }
1810 
1811 // Used for storing NULLs.
1812 void MacroAssembler::store_heap_oop_null(Address dst) {
1813   mov(rscratch1, 0);
1814   str(rscratch1, dst);
1815 }
1816 
1849 #if INCLUDE_ALL_GCS
1850 void MacroAssembler::g1_write_barrier_pre(Register obj,
1851                                           Register pre_val,
1852                                           Register thread,
1853                                           Register tmp,
1854                                           bool tosca_live,
1855                                           bool expand_call) {
1856   // If expand_call is true then we expand the call_VM_leaf macro
1857   // directly to skip generating the check by
1858   // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.
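       //
       // Pseudo-code of the SATB pre-barrier emitted below (illustrative):
       //
       //   if (marking_active) {
       //     pre_val = *obj;                  // unless supplied by the caller
       //     if (pre_val != NULL) {
       //       if (index != 0) {              // room in thread-local buffer?
       //         index -= wordSize;
       //         buffer[index] = pre_val;
       //       } else {
       //         runtime_call(g1_wb_pre, pre_val, thread);
       //       }
       //     }
       //   }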
1859 
1860   assert(thread == rthread, "must be");
1861 
1862   Label done;
1863   Label runtime;
1864 
1865   assert(pre_val != noreg, "check this code");
1866 
1867   if (obj != noreg)
1868     assert_different_registers(obj, pre_val, tmp);
1869 
1870   Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
1871                                        PtrQueue::byte_offset_of_active()));
1872   Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
1873                                        PtrQueue::byte_offset_of_index()));
1874   Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
1875                                        PtrQueue::byte_offset_of_buf()));
1876 
1877 
1878   // Is marking active?
1879   if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
1880     ldr(tmp, in_progress);
1881   } else {
1882     assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
1883     ldrb(tmp, in_progress);
1884   }
1885   cmp(tmp, 0);
1886   b(done, Assembler::EQ);
1887 
1888   // Do we need to load the previous value?
1889   if (obj != noreg) {
1890     load_heap_oop(pre_val, Address(obj, 0));
1891   }
1892 
1893   // Is the previous value null?
1894   cbz(pre_val, done);
1895 
1896   // Can we store original value in the thread's buffer?
1897   // Is index == 0?
1898   // (The index field is typed as size_t.)
1899 
1900   ldr(tmp, index);                      // tmp := *index_adr
1901   cbz(tmp, runtime);                    // tmp == 0?
1902                                         // If yes, goto runtime
1903 
1904   sub(tmp, tmp, wordSize);              // tmp := tmp - wordSize
1905   str(tmp, index);                      // *index_adr := tmp
1906   ldr(rscratch1, buffer);
1907   add(tmp, tmp, rscratch1);             // tmp := tmp + *buffer_adr
1908 
1909   // Record the previous value
1910   str(pre_val, Address(tmp, 0));
1911   b(done);
1912 
1913   bind(runtime);
1914   // save the live input values
1915   push(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
1916 
1917   // Calling the runtime using the regular call_VM_leaf mechanism generates
1918   // code (generated by InterpreterMacroAssembler::call_VM_leaf_base)
1919   // that checks that the *(rfp+frame::interpreter_frame_last_sp) == NULL.
1920   //
1921   // If we are generating the pre-barrier without a frame (e.g. in the
1922   // intrinsified Reference.get() routine) then rfp might be pointing to
1923   // the caller frame and so this check will most likely fail at runtime.
1924   //
1925   // Expanding the call directly bypasses the generation of the check.
1926   // So when we do not have a full interpreter frame on the stack
1927   // expand_call should be passed true.
1928 
1929   if (expand_call) {
1930     assert(pre_val != c_rarg1, "smashed arg");
1931     pass_arg1(this, thread);
1932     pass_arg0(this, pre_val);
1933     MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
1934   } else {
1935     call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
1936   }
1937 
1938   pop(r0->bit(tosca_live) | obj->bit(obj != noreg) | pre_val->bit(true), sp);
1939 
1940   bind(done);
1941 }
1942 
1943 void MacroAssembler::g1_write_barrier_post(Register store_addr,
1944                                            Register new_val,
1945                                            Register thread,
1946                                            Register tmp,
1947                                            Register tmp2) {
1948   assert(thread == rthread, "must be");
1949 
1950   Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
1951                                        PtrQueue::byte_offset_of_index()));
1952   Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
1953                                        PtrQueue::byte_offset_of_buf()));
1954 
1955   BarrierSet* bs = Universe::heap()->barrier_set();
1956   CardTableModRefBS* ct = (CardTableModRefBS*)bs;
1957   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
1958 
1959   Label done;
1960   Label runtime;
1961 
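       // Pseudo-code of the post-barrier emitted below (illustrative):
       //
       //   if (((store_addr ^ new_val) >> LogOfHRGrainBytes) == 0) return;
       //   if (new_val == NULL) return;
       //   card = &byte_map_base[store_addr >> card_shift];
       //   if (*card == g1_young_card_val()) return;
       //   StoreLoad;
       //   if (*card == dirty_card_val()) return;   // dirty_card_val() == 0
       //   *card = dirty_card_val();
       //   enqueue(card), or call g1_wb_post when the queue is full;
       //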
1962   // Does store cross heap regions?
1963 
1964   eor(tmp, store_addr, new_val);
1965   lsr(tmp, tmp, HeapRegion::LogOfHRGrainBytes);
1966   cbz(tmp, done);
1967 
1968   // crosses regions, storing NULL?
1969 
1970   cbz(new_val, done);
1971 
1972   // storing region crossing non-NULL, is card already dirty?
1973 
1974 
1975   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
1976   const Register card_addr = tmp;
1977 
1978   lsr(card_addr, store_addr, CardTableModRefBS::card_shift);
1979 
1980   //ExternalAddress cardtable((address) ct->byte_map_base);
1981   mov(tmp2, (unsigned)ct->byte_map_base);
1982 
1983   // get the address of the card
1984   add(card_addr, card_addr, tmp2);
1985   ldrb(tmp2, Address(card_addr));
1986   cmp(tmp2, (int)G1SATBCardTableModRefBS::g1_young_card_val());
1987   b(done, Assembler::EQ);
1988 
1989   assert((int)CardTableModRefBS::dirty_card_val() == 0, "must be 0");
1990 
1991   membar(Assembler::StoreLoad);
1992 
1993   ldrb(tmp2, Address(card_addr));
1994   cmp(tmp2, 0);
1995   b(done, Assembler::EQ);
1996 
1997   // storing a region crossing, non-NULL oop, card is clean.
1998   // dirty card and log.
1999   mov(rscratch1, 0);
2000   strb(rscratch1, Address(card_addr));
2001 
2002   ldr(rscratch1, queue_index);
2003   cbz(rscratch1, runtime);
2004   sub(rscratch1, rscratch1, wordSize);
2005   str(rscratch1, queue_index);
2006 
2007   ldr(tmp2, buffer);
2008   str(card_addr, Address(tmp2, rscratch1));
2009   b(done);
2010 
2011   bind(runtime);
2012   // save the live input values
2013   push(store_addr->bit(true) | new_val->bit(true), sp);
2014   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
2015   pop(store_addr->bit(true) | new_val->bit(true), sp);
2016 
2017   bind(done);
2018 }
2019 
2020 #endif // INCLUDE_ALL_GCS
2021 
2022 Address MacroAssembler::allocate_metadata_address(Metadata* obj) {
2023   assert(oop_recorder() != NULL, "this assembler needs a Recorder");
2024   int index = oop_recorder()->allocate_metadata_index(obj);
2025   RelocationHolder rspec = metadata_Relocation::spec(index);
2026   return Address((address)obj, rspec);
2027 }
2028 
2029 // Move an oop into a register.  immediate is true if we want
2030 // immediate instructions, i.e. we are not going to patch this
2031 // instruction while the code is being executed by another thread.  In
2032 // that case we can use move immediates rather than the constant pool.
2033 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) {
2034   int oop_index;
2035   if (obj == NULL) {
2036     oop_index = oop_recorder()->allocate_oop_index(obj);
2037   } else {
2038     oop_index = oop_recorder()->find_index(obj);
2039     assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop");
2040   }
2041   RelocationHolder rspec = oop_Relocation::spec(oop_index);
2042   if (! immediate) {
2043     address dummy = address(uintptr_t(pc()) & -wordSize); // A nearby aligned address
2044     ldr_constant(dst, Address(dummy, rspec));
2045   } else
2046     mov(dst, Address((address)obj, rspec));
2047 }
2048 
2049 // Move a metadata address into a register.
2050 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
2051   int oop_index;
2052   if (obj == NULL) {
2053     oop_index = oop_recorder()->allocate_metadata_index(obj);
2054   } else {
2055     oop_index = oop_recorder()->find_index(obj);
2056   }
2057   RelocationHolder rspec = metadata_Relocation::spec(oop_index);
2058   mov(dst, Address((address)obj, rspec));
2059 }
2060 
2061 Address MacroAssembler::constant_oop_address(jobject obj) {
2062   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
2063   assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop");
2064   int oop_index = oop_recorder()->find_index(obj);
2065   return Address((address)obj, oop_Relocation::spec(oop_index));
2066 }
2067 
2068 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
2069 void MacroAssembler::tlab_allocate(Register obj,
2070                                    Register var_size_in_bytes,
2071                                    int con_size_in_bytes,
2072                                    Register t1,
2073                                    Register t2,
2074                                    Label& slow_case) {
2075   assert_different_registers(obj, t2);
2076   assert_different_registers(obj, var_size_in_bytes);
2077   Register end = t2;
2078 
2079   // verify_tlab();
2080 
2081   ldr(obj, Address(rthread, JavaThread::tlab_top_offset()));
2082   if (var_size_in_bytes == noreg) {
2083     lea(end, Address(obj, con_size_in_bytes));
2084   } else {
2085     lea(end, Address(obj, var_size_in_bytes));
2086   }
2087   ldr(rscratch1, Address(rthread, JavaThread::tlab_end_offset()));
2088   cmp(end, rscratch1);
2089   b(slow_case, Assembler::HI);
2090 
2091   // update the tlab top pointer
2092   str(end, Address(rthread, JavaThread::tlab_top_offset()));
2093 
2094   // recover var_size_in_bytes if necessary
2095   if (var_size_in_bytes == end) {
2096     sub(var_size_in_bytes, var_size_in_bytes, obj);
2097   }
2098   // verify_tlab();
2099 }
2100 
2101 // Preserves r6 and r3.
2102 Register MacroAssembler::tlab_refill(Label& retry,
2103                                      Label& try_eden,
2104                                      Label& slow_case) {
2105   Register top = r0;
2106   Register t1  = r2;
2107   Register t2  = r4;
2108   assert_different_registers(top, rthread, t1, t2, /* preserve: */ r6, r3);
2109   Label do_refill, discard_tlab;
2110 
2111   if (!Universe::heap()->supports_inline_contig_alloc()) {
2112     // No allocation in the shared eden.
2113     b(slow_case);
2114   }
2115 
2116   ldr(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
2117   ldr(t1,  Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
2118 
2119   // calculate amount of free space
2120   sub(t1, t1, top);
2121   lsr(t1, t1, LogHeapWordSize);
2122 
2123   // Retain tlab and allocate object in shared space if
2124   // the amount free in the tlab is too large to discard.
2125 
2126   ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
2127   cmp(t1, rscratch1);
2128   b(discard_tlab, Assembler::LE);
2129 
2130   // Retain
2131   // ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
2132   mov(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment());
2133   add(rscratch1, rscratch1, t2);
2134   str(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_refill_waste_limit_offset())));
2135 
2136   if (TLABStats) {
2137     // increment number of slow_allocations
2138     addmw(Address(rthread, in_bytes(JavaThread::tlab_slow_allocations_offset())),
2139          1, rscratch1);
2140   }
2141   b(try_eden);
2142 
2143   bind(discard_tlab);
2144   if (TLABStats) {
2145     // increment number of refills
2146     addmw(Address(rthread, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1,
2147          rscratch1);
2148     // accumulate wastage -- t1 is amount free in tlab
2149     addmw(Address(rthread, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1,
2150          rscratch1);
2151   }
2152 
2153   // if tlab is currently allocated (top or end != null) then
2154   // fill [top, end + alignment_reserve) with array object
2155   cbz(top, do_refill);
2156 
2157   // set up the mark word
2158   mov(rscratch1, (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
2159   str(rscratch1, Address(top, oopDesc::mark_offset_in_bytes()));
2160   // set the length to the remaining space
2161   sub(t1, t1, typeArrayOopDesc::header_size(T_INT));
2162   add(t1, t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve());
2163   lsl(t1, t1, log2_intptr(HeapWordSize/sizeof(jint)));
2164   str(t1, Address(top, arrayOopDesc::length_offset_in_bytes()));
2165   // set klass to intArrayKlass
2166   // dubious reloc why not an oop reloc?
2167   mov(rscratch1, ExternalAddress((address)Universe::intArrayKlassObj_addr()));
2168   ldr(t1, Address(rscratch1));
2169   // store klass last: concurrent GCs assume the length is valid as soon
2170   // as the klass field is not null.
2171   store_klass(top, t1);
2172 
2173   mov(t1, top);
2174   ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
2175   sub(t1, t1, rscratch1);
2176   incr_allocated_bytes(rthread, t1, 0, rscratch1);
2177 
2178   // refill the tlab with an eden allocation
2179   bind(do_refill);
2180   ldr(t1, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
2181   lsl(t1, t1, LogHeapWordSize);
2182   // allocate new tlab, address returned in top
2183   eden_allocate(top, t1, 0, t2, slow_case);
2184 
2185   // Check that t1 was preserved in eden_allocate.
2186 #ifdef ASSERT
2187   if (UseTLAB) {
2188     Label ok;
2189     Register tsize = r4;
2190     assert_different_registers(tsize, rthread, t1);
2191     str(tsize, Address(pre(sp, -16)));
2192     ldr(tsize, Address(rthread, in_bytes(JavaThread::tlab_size_offset())));
2193     lsl(tsize, tsize, LogHeapWordSize);
2194     cmp(t1, tsize);
2195     b(ok, Assembler::EQ);
2196     STOP("assert(t1 != tlab size)");
2197     should_not_reach_here();
2198 
2199     bind(ok);
2200     ldr(tsize, Address(post(sp, 16)));
2201   }
2202 #endif
2203   str(top, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
2204   str(top, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
2205   add(top, top, t1);
2206   sub(top, top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
2207   str(top, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
2208   verify_tlab();
2209   b(retry);
2210 
2211   return rthread; // for use by caller
2212 }
2213 
2214 // Defines obj, preserves var_size_in_bytes
2215 void MacroAssembler::eden_allocate(Register obj,
2216                                    Register var_size_in_bytes,
2217                                    int con_size_in_bytes,
2218                                    Register t1,
2219                                    Label& slow_case) {
2220   assert_different_registers(obj, var_size_in_bytes, t1);
2221   if (!Universe::heap()->supports_inline_contig_alloc()) {
2222     b(slow_case);
2223   } else {
2224     Register end = t1;
2225     Register heap_end = rscratch2;
2226     Label retry;
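         // LL/SC bump-pointer allocation sketch (illustrative):
         //
         //   do {
         //     obj = load_exclusive(heap_top);
         //     end = obj + size;
         //     if (end < obj || end > heap_end) goto slow_case;
         //   } while (store_exclusive(heap_top, end) != 0);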
2227     bind(retry);
2228 
2229     mov(rscratch1, ExternalAddress((address) Universe::heap()->end_addr()));
2230     ldr(heap_end, Address(rscratch1));
2231 
2232     ExternalAddress heap_top((address) Universe::heap()->top_addr());
2233     mov(rscratch1, heap_top);
2234     ldrex(obj, rscratch1);
2235 
2236     // Adjust it by the size of our new object
2237     if (var_size_in_bytes == noreg) {
2238       lea(end, Address(obj, con_size_in_bytes));
2239     } else {
2240       lea(end, Address(obj, var_size_in_bytes));
2241     }
2242 
2243     // if end < obj then we wrapped around high memory
2244     cmp(end, obj);
2245     b(slow_case, Assembler::LO);
2246 
2247     cmp(end, heap_end);
2248     b(slow_case, Assembler::HI);
2249 
2250     // If heap_top hasn't been changed by some other thread, update it.
2251     mov(rscratch2, rscratch1);
2252     strex(rscratch1, end, rscratch2);
2253     cmp(rscratch1, 0);
2254     b(retry, Assembler::NE);
2255   }
2256 }
2257 
2258 void MacroAssembler::verify_tlab() {
2259 #ifdef ASSERT
2260   if (UseTLAB && VerifyOops) {
2261     Label next, ok;
2262 
2263     strd(rscratch2, rscratch1, Address(pre(sp, -16)));
2264 
2265     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
2266     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
2267     cmp(rscratch2, rscratch1);
2268     b(next, Assembler::HS);
2269     STOP("assert(top >= start)");
2270     should_not_reach_here();
2271 
2272     bind(next);
2273     ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
2274     ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
2275     cmp(rscratch2, rscratch1);
2276     b(ok, Assembler::HS);
2277     STOP("assert(top <= end)");
2278     should_not_reach_here();
2279 
2280     bind(ok);
2281     ldrd(rscratch2, rscratch1, Address(post(sp, 16)));
2282   }
2283 #endif
2284 }
2285 
2286 // Writes to successive stack pages until the given offset is reached, to
2287 // check for stack overflow plus shadow pages.  This clobbers tmp.
2288 void MacroAssembler::bang_stack_size(Register size, Register tmp) {
2289   assert_different_registers(tmp, size, rscratch1);
2290   mov(tmp, sp);
2291   // Bang stack for total size given plus shadow page size.
2292   // Bang one page at a time because large size can bang beyond yellow and
2293   // red zones.
2294   Label loop;
2295   mov(rscratch1, os::vm_page_size());
2296   bind(loop);
2297   lea(tmp, Address(tmp, -os::vm_page_size()));
2298   subs(size, size, rscratch1);
2299   str(size, Address(tmp));
2300   b(loop, Assembler::GT);
2301 
2302   // Bang down shadow pages too.
2303   // At this point, (tmp-0) is the last address touched, so don't
2304   // touch it again.  (It was touched as (tmp-pagesize) but then tmp
2305   // was post-decremented.)  Skip this address by starting at i=1, and
2306   // touch a few more pages below.  N.B.  It is important to touch all
2307   // the way down to and including i=StackShadowPages.
2308   for (int i = 0; i < StackShadowPages - 1; i++) {
2309     // this could be any sized move, but since it can serve as a debugging
2310     // crumb the bigger the better.
2311     lea(tmp, Address(tmp, -os::vm_page_size()));
2312     str(size, Address(tmp));
2313   }
2314 }
2315 
2316 
2317 address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
2318   mov(r, Address(page, rtype));
2319   InstructionMark im(this);
2320   code_section()->relocate(inst_mark(), rtype);
2321   ldr(r, Address(r));
2322   return inst_mark();
2323 }
2324 
2325 address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
2326   InstructionMark im(this);
2327   code_section()->relocate(inst_mark(), rtype);
2328   // It's ok to load to reg from reg + off (without write-back)
2329   ldr(r, Address(r, 0));
2330   return inst_mark();
2331 }
2332 
2333 // Helper functions for 64-bit multiplication, division and remainder
2334 // does <Rd+1:Rd> = <Rn+1:Rn> * <Rm+1:Rm>
2335 void MacroAssembler::mult_long(Register Rd, Register Rn, Register Rm) {
2336   Register Rdh = (Register)(Rd->encoding_nocheck() + 1);
2337   Register Rnh = (Register)(Rn->encoding_nocheck() + 1);
2338   Register Rmh = (Register)(Rm->encoding_nocheck() + 1);
2339 
2340   mult_long(Rd, Rdh, Rn, Rnh, Rm, Rmh);
2341 }
2342 
2343 // does <Rdh:Rd> = <Rnh:Rn> * <Rmh:Rm>
2344 void MacroAssembler::mult_long(Register Rd, Register Rdh, Register Rn, Register Rnh, Register Rm, Register Rmh) {
2345   assert_different_registers(Rn, Rnh);
2346   assert_different_registers(Rm, Rmh);
2347   assert_different_registers(Rd, Rdh); // umull restriction
2348   const Register t = rscratch1;
2349 
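       // The 64x64->64 product decomposes into 32-bit pieces (the cross
       // terms only affect the high word; their own high halves overflow
       // away):
       //   (Rnh:Rn) * (Rmh:Rm) mod 2^64 = Rn*Rm + ((Rn*Rmh + Rnh*Rm) << 32)
       // umull computes the full 64-bit Rn*Rm; t accumulates the cross terms.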
2350   mul(t, Rm, Rnh);
2351   mla(t, Rn, Rmh, t);
2352   umull(Rd, Rdh, Rm, Rn);
2353   add(Rdh, t, Rdh);
2354 }
2355 
2356 
2357 int64_t internal_ldiv(int64_t a, int64_t b) {
2358   return a / b;
2359 }
2360 
2361 int64_t internal_lmod(int64_t a, int64_t b) {
2362   return a % b;
2363 }
2364 
2365 void MacroAssembler::divide32(Register res, Register num, Register den, bool want_mod) {
2366     Register cnt = rscratch1;
2367     Register mod = rscratch2;
2368     Register sign = r14;
2369     assert_different_registers(num, den, rscratch1, rscratch2, r14);
2370 
2371     // FIXME This works by first converting any negative values to positive ones, however
2372     // it is not possible to express |INT_MIN|. Need to fix this
2373 
2374     // Convert to positive values
2375     mov(sign, 0);
2376 
2377     cmp(num, 0);
2378     mov(sign, 1, MI);
2379     rsb(num, num, 0, MI);
2380 
2381     cmp(den, 0);
2382     if(!want_mod) eor(sign, sign, 1, MI);
2383     rsb(den, den, 0, MI);
2384 
2385     // Algorithm from
2386     // http://www.chiark.greenend.org.uk/~theom/riscos/docs/ultimate/a252div.txt
2387     // Graeme Williams
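         //
         // Rough shape of the computation (illustrative; the emitted code
         // fuses these steps using a negated divisor and the carry flag):
         //
         //   for (i = 0; i < 32; i++) {
         //     mod = (mod << 1) | top_bit(num);   // next dividend bit
         //     bit = (mod >= den);
         //     if (bit) mod -= den;
         //     num = (num << 1) | bit;            // quotient gathers in num
         //   }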
2388     mov(cnt, 28);
2389     mov(mod, num, lsr(4));
2390     cmp(den, mod, lsr(12));
2391     sub(cnt, cnt, 16, Assembler::LE);
2392     mov(mod, mod, lsr(16), Assembler::LE);
2393     cmp(den, mod, lsr(4));
2394     sub(cnt, cnt, 8, Assembler::LE);
2395     mov(mod, mod, lsr(8), Assembler::LE);
2396     cmp(den, mod);
2397     sub(cnt, cnt, 4, Assembler::LE);
2398     mov(mod, mod, lsr(4), Assembler::LE);
2399     mov(num, num, lsl(cnt));
2400     rsb(den, den, 0);
2401 
2402     adds(num, num, num);
2403     // Now skip over cnt copies of the 3-instruction loop.
2404     add(cnt, cnt, cnt, lsl(1));
2405     add(r15_pc, r15_pc, cnt, lsl(2));
2406     mov(r0, r0);
2407 
2408     for(int i = 0; i < 32; i++) {
2409         adcs(mod, den, mod, lsl(1));
2410         sub(mod, mod, den, Assembler::LO);
2411         adcs(num, num, num);
2412     }
2413 
2414     cmp(sign, 0);
2415     rsb(res, want_mod? mod : num, 0, NE);
2416     mov(res, want_mod? mod : num, EQ);
2417 }
2418 
2419 
2420 // <Rd+1:Rd> = <Rn+1:Rn> / <Rm+1:Rm>
2421 // <Rd+1:Rd> = <Rn+1:Rn> % <Rm+1:Rm>
2422 // <Rd> = <Rn> / <Rm>
2423 // <Rd> = <Rn> % <Rm>
2424 void MacroAssembler::divide(Register Rd, Register Rn, Register Rm, int width, bool want_remainder) {
2425   // Dispatch to the best available implementation
2426   Register Rdh = (Register)(Rd->encoding_nocheck() + 1);
2427   Register Rnh = (Register)(Rn->encoding_nocheck() + 1);
2428   Register Rmh = (Register)(Rm->encoding_nocheck() + 1);
2429 
2430   assert(32 == width || 64 == width, "Invalid width");
2431   bool is64b = 64 == width;
2432 
2433   if(is64b) {
2434     assert_different_registers(Rn, Rnh, Rm, Rmh, rscratch1, rscratch2);
2435   }
2436 
2437   if(!is64b && VM_Version::features() & FT_HW_DIVIDE) {
2438     // Emit a hardware divide instruction sequence.
2439     if(want_remainder) {
2440       sdiv(rscratch1, Rn, Rm);
2441       mls(Rd, rscratch1, Rm, Rn);
2442     } else {
2443       sdiv(Rd, Rn, Rm);
2444     }
2445   } else if(!is64b) {
2446     // Fall back to assembly software routine
2447     divide32(Rd, Rn, Rm, want_remainder);
2448   } else {
2449     // Fall back to C software routine for
2450     // 64 bit divide/mod
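         // Per the AAPCS, the first 64-bit argument is passed in r0:r1 and
         // the second in r2:r3; the moves below stage the operands without
         // clobbering either pair.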
2451     if(Rn != r0) {
2452       mov(rscratch1, Rm);
2453       mov(rscratch2, Rmh);
2454 
2455       mov(r0, Rn);
2456       mov(r1, Rnh);
2457 
2458       mov(r2, rscratch1);
2459       mov(r3, rscratch2);
2460     } else if(Rm != r2) {
2461       mov(r2, Rm);
2462       mov(r3, Rmh);
2463     }
2464     address function;
2465     if(want_remainder) function = (address)internal_lmod;
2466     else               function = (address)internal_ldiv;
2467 
2468     mov(rscratch1, function);
2469     bl(rscratch1);
2470     if(Rd != r0) {
2471       mov(Rd, r0);
2472       if(is64b) mov(Rdh, r1);
2473     }
2474   }
2475 }
2476 
2477 void MacroAssembler::extract_bits(Register dest, Register source, int lsb, int width) {
2478   assert(lsb >= 0 && lsb + width <= 32 && width != 0, "Invalid lsb/width");
2479   // Dispatch to the best sequence
2480   if(0 == (lsb & 7) && (width == 8 || width == 16 || width == 32)) {
2481     // Can use extend X
2482     switch(width){
2483       case 8:  uxtb(dest, source, ror(lsb)); break;
2484       case 16: uxth(dest, source, ror(lsb)); break;
2485       default:                               break;
2486    }
2487   } else if(VM_Version::features() & (FT_ARMV7 | FT_ARMV6T2)) {
2488     ubfx(dest, source, lsb, width);
2489   } else {
2490     // Do two shifts
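         // e.g. lsb == 3, width == 5: lsl by 32 - 8 == 24 moves bits [7:3]
         // up to [31:27]; lsr by 32 - 5 == 27 brings them down to [4:0],
         // zero-filled.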
2491     lsl(dest, source, 32 - (width + lsb));
2492     lsr(dest, dest, 32 - width);
2493   }
2494 }
2495 
2496 
2497 void MacroAssembler::atomic_ldrd(Register Rt, Register Rt2, Register Rbase) {
2498   assert(Rt->encoding_nocheck() % 2 == 0, "Must be an even register");
2499   assert((Register) (Rt + 1) == Rt2, "Must be contiguous");
2500   if(VM_Version::features() & FT_SINGLE_CORE) {
2501     ldrd(Rt, Rbase);
2502   } else if (VM_Version::features() & (FT_ARMV7 | FT_ARMV6K)) {
2503 #ifdef ASSERT
2504     Label lbl;
2505     tst(Rbase, 7);
2506     b(lbl, EQ);
2507     stop("atomic_ldrd is not doubleword aligned!");
2508     bind(lbl);
2509 #endif // ASSERT
2510 
2511     ldrexd(Rt, Rbase);
2512   } else {
2513     // TODO: Find Java way of logging
2514     static bool warning_printed = false;
2515     if(!warning_printed) {
2516       fprintf(stderr, "Unable to provide atomic doubleword load.\n");
2517       warning_printed = true;
2518     }
2519     ldrd(Rt, Rbase);
2520   }
2521 }
2522 
2523 void MacroAssembler::atomic_strd(Register Rt, Register Rt2, Register Rbase,
2524                                  Register temp, Register temp2) {
2525   assert(Rt->encoding_nocheck() % 2 == 0, "Must be an even register");
2526   assert((Register) (Rt + 1) == Rt2, "Must be contiguous");
2527   assert((Register) (temp + 1) == temp2, "Must be contiguous");
2528   assert_different_registers(temp, Rt, Rbase, temp2);
2529   if(VM_Version::features() & FT_SINGLE_CORE) {
2530     strd(Rt, Rbase);
2531   } else if (VM_Version::features() & (FT_ARMV7 | FT_ARMV6K)) {
2532     // First need to gain exclusive access
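         // strexd can only succeed while this core holds the exclusive
         // monitor, so a ldrexd (its result discarded) is issued first to
         // claim it; the loop retries if exclusivity was lost in between.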
2533     Label retry;
2534 
2535 #ifdef ASSERT
2536     tst(Rbase, 7);
2537     b(retry, EQ);
2538     stop("atomic_strd is not doubleword aligned!");
2539 #endif // ASSERT
2540 
2541     bind(retry);
2542     ldrexd(temp, Rbase);
2543     strexd(temp, Rt, Rbase);
2544     cmp(temp, 0);
2545     b(retry, NE);
2546   } else {
2547     // TODO: Find Java way of logging
2548     static bool warning_printed = false;
2549     if(!warning_printed) {
2550       fprintf(stderr, "Unable to provide atomic doubleword store.\n");
2551       warning_printed = true;
2552     }
2553     strd(Rt, Rbase);
2554   }
2555 }
2556 
2557 
2558 #define ENABLE_DEBUGGING 0
2559 // A HelloWorld run executes 2,482,397 bytecodes
2560 uint32_t MacroAssembler::bytecodes_until_print = 2400000; //13795328; //6888000L; //6881772L;
2561 
2562 uint32_t MacroAssembler::bytecodes_executed = 0;
2563 
2564 int MacroAssembler::enable_debug = 0;
2565 int MacroAssembler::enable_method_debug = 0;
2566 int MacroAssembler::enable_debugging_static = ENABLE_DEBUGGING;
2567 
2568 #define N_J_BYTECODES 234
2569 const char* j_bytecodes[N_J_BYTECODES] = {"nop", "aconstnull", "iconstm1", "iconst0", "iconst1", "iconst2", "iconst3", "iconst4", "iconst5", "lconst0",
2570 "lconst1", "fconst0", "fconst1", "fconst2", "dconst0", "dconst1", "bipush", "sipush", "ldc", "ldcw", "ldc2w",
2571 "iload", "lload", "fload", "dload", "aload", "iload0", "iload1", "iload2", "iload3", "lload0", "lload1", "lload2",
2572 "lload3", "fload0", "fload1", "fload2", "fload3", "dload0", "dload1", "dload2", "dload3", "aload0", "aload1", "aload2",
2573 "aload3", "iaload", "laload", "faload", "daload", "aaload", "baload", "caload", "saload", "istore", "lstore", "fstore",
2574 "dstore", "astore", "istore0", "istore1", "istore2", "istore3", "lstore0", "lstore1", "lstore2", "lstore3", "fstore0",
2575 "fstore1", "fstore2", "fstore3", "dstore0", "dstore1", "dstore2", "dstore3", "astore0", "astore1", "astore2", "astore3",
2576 "iastore", "lastore", "fastore", "dastore", "aastore", "bastore", "castore", "sastore", "pop", "pop2", "dup", "dupx1",
2577 "dupx2", "dup2", "dup2x1", "dup2x2", "swap", "iadd", "ladd", "fadd", "dadd", "isub", "lsub", "fsub", "dsub", "imul",
2578 "lmul", "fmul", "dmul", "idiv", "ldiv", "fdiv", "ddiv", "irem", "lrem", "frem", "drem", "ineg", "lneg", "fneg", "dneg",
2579 "ishl", "lshl", "ishr", "lshr", "iushr", "lushr", "iand", "land", "ior", "lor", "ixor", "lxor", "iinc", "i2l", "i2f",
2580 "i2d", "l2i", "l2f", "l2d", "f2i", "f2l", "f2d", "d2i", "d2l", "d2f", "i2b", "i2c", "i2s", "lcmp", "fcmpl", "fcmpg",
2581 "dcmpl", "dcmpg", "ifeq", "ifne", "iflt", "ifge", "ifgt", "ifle", "ificmpeq", "ificmpne", "ificmplt", "ificmpge",
2582 "ificmpgt", "ificmple", "ifacmpeq", "ifacmpne", "goto", "jsr", "ret", "tableswitch", "lookupswitch", "ireturn",
2583 "lreturn", "freturn", "dreturn", "areturn", "return", "getstatic", "putstatic", "getfield", "putfield",
2584 "invokevirtual", "invokespecial", "invokestatic", "invokeinterface", "invokedynamic", "new", "newarray",
2585 "anewarray", "arraylength", "athrow", "checkcast", "instanceof", "monitorenter", "monitorexit", "wide",
2586 "multianewarray", "ifnull", "ifnonnull", "gotow", "jsrw", "breakpoint", "fast_agetfield", "fast_bgetfield",
2587 "fast_cgetfield", "fast_dgetfield", "fast_fgetfield", "fast_igetfield", "fast_lgetfield", "fast_sgetfield",
2588 "fast_aputfield", "fast_bputfield", "fast_cputfield", "fast_dputfield", "fast_fputfield", "fast_iputfield",
2589 "fast_lputfield", "fast_sputfield", "fast_aload_0", "fast_iaccess_0", "fast_aaccess_0", "fast_faccess_0",
2590 "fast_iload", "fast_iload2", "fast_icaload", "fast_invokevfinal", "fast_linearswitch", "fast_binaryswitch",
2591 "fast_aldc", "fast_aldc_w", "return_register_finalizer", "invokehandle", "INVALID"};
2592 
2593 int bytecodes_seen[256];
2594 
2595 void MacroAssembler::init_unseen_bytecodes() {
2596   for(int i = 0; i < 256; i++ ) {
2597     bytecodes_seen[i] = 0;
2598   }
2599 }
2600 
2601 void MacroAssembler::bytecode_seen(Register bc_reg, Register scratch) {
2602   if(ENABLE_DEBUGGING) {
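         // Mark bytecodes_seen[bc] with the nonzero value bc + 1;
         // print_unseen_bytecodes only tests entries for zero, so any
         // nonzero marker suffices, and bc_reg is restored afterwards.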
2603     mov(scratch, (address)bytecodes_seen);
2604     add(scratch, scratch, bc_reg, lsl(2));
2605     add(bc_reg, bc_reg, 1);
2606     str(bc_reg, Address(scratch));
2607     sub(bc_reg, bc_reg, 1);
2608   }
2609 }
2610 
2611 void MacroAssembler::print_unseen_bytecodes() {
2612   if(ENABLE_DEBUGGING) {
2613     printf("=== Unseen bytecodes ===\n");
2614     for(int i = 0; i < N_J_BYTECODES; i++) {
2615       if(0 == bytecodes_seen[i]) {
2616         printf("\t%s\n", j_bytecodes[i]);
2617       }
2618     }
2619     printf("=== End unseen ===\n");
2620   } else {
2621     printf("Not kept track, enable debugging to view info\n");
2622   }
2623   fflush(stdout);
2624 }
2625 
2626 int machine_state_regset = 0b0101111111111111;
2627 int machine_state_float_regset = 0b11;
2628 
2629 void MacroAssembler::save_machine_state() {
2630   stmdb(sp, machine_state_regset);
2631   vstmdb_f64(sp, machine_state_float_regset);
2632   enter();
2633 }
2634 
2635 void MacroAssembler::restore_machine_state() {
2636   leave();
2637   vldmia_f64(sp, machine_state_float_regset);
2638   ldmia(sp, machine_state_regset);
2639 }
2640 
2641 void internal_internal_printf(const char *fmt, ...) {
2642   va_list args;
2643   va_start (args, fmt);
2644   vprintf (fmt, args);
2645   fflush(stdout);
2646   va_end(args);
2647 }
2648 
2649 void internal_printf(const char *format, uint32_t a, uint32_t b, uint32_t c) {
2650   char buf[2048];
2651   char fmt[2048];
2652   buf[0] = '\0';
2653   const char *thread_str = "THREAD 0x%08x : ";
2654   int id = pthread_self();
2655   strcpy(fmt, format);
2656 
2657   char *str = strtok(fmt, "\n");
2658   int nreplace = 0;
2659   while(str) {
2660     strcpy(buf, thread_str);
2661     strcat(buf, str);
2662     strcat(buf, "\n");
2663     internal_internal_printf((const char*)buf, id, a, b, c);
2664     str = strtok(NULL, "\n");
2665   }
2666 }
2667 
2668 void MacroAssembler::get_bytecode(Register dst, Register bc) {
2669   if(ENABLE_DEBUGGING) {
2670     int nbytecodes = N_J_BYTECODES;
2671     mov(dst, (address)j_bytecodes);
2672     cmp(bc, nbytecodes);
2673 
2674     ldr(dst, Address(dst, bc, lsl(2)), Assembler::LT);
2675     ldr(dst, Address(dst, wordSize * (nbytecodes - 1)), Assembler::GE); // clamp to "INVALID"
2676   }
2677 }
2678 
2679 int invocation_depth_count = -1; //TODO remove this with debugging info
2680 
2681 #define MAX_FCALL_DEPTH 4096
2682 struct thread_method_record{
2683   int thread_id;
2684   char names[MAX_FCALL_DEPTH][512];
2685   int invocation_depth_count;
2686 };
2687 int ntmrs = 0;
2688 #define MAX_TMRS 10
2689 thread_method_record tmr_list[MAX_TMRS];
2690 
2691 void push_tmr(Method *meth, int *thread_id, int *invocation_depth_count, char **name) {
2692   int id = pthread_self();
2693   *thread_id = id;
2694   for(int i = 0; i < ntmrs; i++) {
2695     thread_method_record *tmr = &tmr_list[i];
2696     if(id == tmr->thread_id) {
2697       // Add a new frame
2698       if(tmr->invocation_depth_count >= -1 &&
2699         tmr->invocation_depth_count < (MAX_FCALL_DEPTH - 1)) {
2700         *invocation_depth_count = ++(tmr->invocation_depth_count);
2701         *name = tmr->names[tmr->invocation_depth_count];
2702         meth->name_and_sig_as_C_string(tmr->names[tmr->invocation_depth_count], 512);
2703         return;
2704       } else {
2705         fprintf(stderr, "%s : Invalid fcall depth index, %d\n", __FUNCTION__, tmr->invocation_depth_count);
2706         exit(1);
2707       }
2708     }
2709   }
2710   // Add a new thread
2711   if(ntmrs >= MAX_TMRS) {
2712     fprintf(stderr, "Too many tmrs\n");
2713     exit(1);
2714   }
2715   //Create a new tmr
2716   tmr_list[ntmrs].thread_id = id;
2717   tmr_list[ntmrs].invocation_depth_count = 0;
2718   meth->name_and_sig_as_C_string(tmr_list[ntmrs].names[0], 512);
2719   *invocation_depth_count = 0;
2720   *name = tmr_list[ntmrs].names[0];
2721   ntmrs++;
2722 }
2723 
2724 void pop_tmr(int *thread_id, int *invocation_depth_count, char **name) {
2725   int id = pthread_self();
2726   *thread_id = id;
2727   for(int i = 0; i < ntmrs; i++) {
2728     thread_method_record *tmr = &tmr_list[i];
2729     if(id == tmr->thread_id) {
2730       if(tmr->invocation_depth_count >= 0 &&
2731         tmr->invocation_depth_count < MAX_FCALL_DEPTH) {
2732         // Pop frame
2733         *name = tmr->names[tmr->invocation_depth_count];
2734         *invocation_depth_count = (tmr->invocation_depth_count)--;
2735         return;
2736       } else if ( -1 == tmr->invocation_depth_count) {
2737         *name = (char*)"JVM-EXCEPTION-EXIT:(NOT-REALLY-A-FRAME)";
2738         *invocation_depth_count = 0;
2739         return;
2740       } else {
2741         fprintf(stderr, "%s : Invalid fcall depth index, %d\n", __FUNCTION__, tmr->invocation_depth_count);
2742         exit(1);
2743       }
2744     }
2745   }
2746   fprintf(stderr, "Unable to find suitable tmr\n");
2747   exit(1);
2748 }
2749 
2750 void prepare_entry_exit_prefix(char *buf, int id, int invocation_depth_count) {
2751   sprintf(buf, "THREAD 0x%08x : ", id);
2752   for(int i = 0; i < invocation_depth_count; i++) {
2753     strcat(buf, "  ");
2754   }
2755 }
2756 
2757 
2758 void print_entry(Method *meth, int native) {
2759   char *name;
2760   int invocation_depth_count, id;
2761   push_tmr(meth, &id, &invocation_depth_count, &name);
2762 
2763   if(MacroAssembler::enable_method_debug) {
2764     char buf[4096], buf_b[2048];
2765     prepare_entry_exit_prefix(buf, id, invocation_depth_count);
2766     if(native) {
2767       sprintf(buf_b, "CALL NATIVE : %s\n", name);
2768     } else {
2769       sprintf(buf_b, "CALL JAVA   : %s\n", name);
2770     }
2771     strcat(buf, buf_b);
2772     printf("%s", buf);
2773     fflush(stdout);
2774   }
2775 }
2776 
2777 void print_exit(bool normal) {
2778   char *name;
2779   int invocation_depth_count, id;
2780   pop_tmr(&id, &invocation_depth_count, &name);
2781 
2782   if(MacroAssembler::enable_method_debug) {
2783     char buf[4096], buf_b[2048];
2784     prepare_entry_exit_prefix(buf, id, invocation_depth_count);
2785     sprintf(buf_b, normal ? "EXIT        : %s\n" : "EXCPN EXIT  : %s\n", name);
2786     strcat(buf, buf_b);
2787     printf("%s", buf);
2788     fflush(stdout);
2789   }
2790 }
2791 
2792 void MacroAssembler::print_method_entry(Register rmethod, bool native) {
2793   if(ENABLE_DEBUGGING) {
2794     save_machine_state();
2795 
2796     bic(sp, sp, 7); // 8-byte align stack
2797     mov(rscratch2, (address)print_entry);
2798     mov(r0, rmethod);
2799     mov(r1, native);
2800     bl(rscratch2);
2801 
2802     restore_machine_state();
2803   }
2804 }
2805 
2806 void MacroAssembler::print_method_exit(bool normal) {
2807   if(ENABLE_DEBUGGING) {
2808     save_machine_state();
2809 
2810     bic(sp, sp, 7); // 8-byte align stack
2811     mov(rscratch2, (address)print_exit);
2812     mov(r0, normal);
2813     bl(rscratch2);
2814 
2815     restore_machine_state();
2816   }
2817 }
2818 
2819 void MacroAssembler::reg_printf_internal(bool important, const char *fmt, Register ra, Register rb, Register rc) {
2820   if(ENABLE_DEBUGGING) {
2821     Label skip;
2822     save_machine_state();
2823 
2824     mov(rscratch1, ra);
2825     str(rscratch1, Address(pre(sp, -wordSize)));
2826     mov(rscratch1, rb);
2827     str(rscratch1, Address(pre(sp, -wordSize)));
2828     mov(rscratch1, rc);
2829     str(rscratch1, Address(pre(sp, -wordSize)));
2830 
2831     if(!important) {
2832       mov(r0, (address)&enable_debug);
2833       ldr(r0, Address(r0));
2834       cmp(r0, 0);
2835       b(skip, Assembler::EQ);
2836     }
2837 
2838     int sp_difference = wordSize * (count_bits(machine_state_regset) +
2839                                     2 * count_bits(machine_state_float_regset) +
2840                                     2 + 3); // Frame entry and saved registers
2841 
2842     mov(r0, (address)fmt);
2843     if(ra != sp) ldr(r1, Address(sp, 2 * wordSize));
2844     else         add(r1, sp, sp_difference);
2845 
2846     if(rb != sp) ldr(r2, Address(sp, wordSize));
2847     else         add(r2, sp, sp_difference);
2848 
2849     if(rc != sp) ldr(r3, Address(sp));
2850     else         add(r3, sp, sp_difference);
2851 
2852     bic(sp, sp, 7); // 8-byte align stack
2853 
2854     mov(rscratch2, (address)internal_printf);
2855     bl(rscratch2);
2856 
2857     bind(skip);
2858     restore_machine_state();
2859   }
2860 }
2861 
2862 void MacroAssembler::reg_printf(const char *fmt, Register ra, Register rb, Register rc) {
2863   reg_printf_internal(false, fmt, ra, rb, rc);
2864 }
2865 
2866 void MacroAssembler::reg_printf_important(const char *fmt, Register ra, Register rb, Register rc) {
2867   reg_printf_internal(true, fmt, ra, rb, rc);
2868 }
2869 
// When debugging, set a breakpoint on bkpnt().
void bkpnt() { return; }
2872 void MacroAssembler::create_breakpoint() {
2873     if(ENABLE_DEBUGGING) {
2874         save_machine_state();
2875         bic(sp, sp, 7); // 8-byte align stack
2876 
2877         mov(rscratch2, (address) bkpnt);
2878         bl(rscratch2);
2879 
2880         restore_machine_state();
2881     }
2882 }
2883 
2884 
2885 void MacroAssembler::print_cpool(InstanceKlass *klass) {
2886     ttyLocker ttyl;
2887     klass->constants()->print_on(tty);
2888 }
2889 
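// Load two words into Rt/Rt2 from adr. A single ldrd is only emitted when
// the registers form the even/odd pair the encoding requires and the offset
// magnitude fits in 8 bits; otherwise the access is split up (see
// double_ld_failed_dispatch, which may use Rtmp to materialize a
// register-indexed address that involves Rt). The return value is the byte
// offset, within the emitted sequence, of the first instruction that can
// trap, so callers can adjust the PC recorded for implicit null checks.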
2890 int MacroAssembler::ldrd(Register Rt, Register Rt2, const Address& adr, Register Rtmp, Condition cond) {
2891     if((0 == Rt->encoding_nocheck() % 2 &&
2892          (Rt->encoding_nocheck() + 1 == Rt2->encoding_nocheck())) &&
2893       (uabs(adr.offset()) < (1 << 8))) {
      /* Good to go with an ldrd */
2895       ldrd(Rt, adr, cond);
2896       return 0x0;
2897     } else {
2898       return double_ld_failed_dispatch(Rt, Rt2, adr, &Assembler::ldm,
2899                                 &Assembler::ldr, Rtmp, cond);
2900     }
2901 }
2902 
2903 int MacroAssembler::strd(Register Rt, Register Rt2, const Address& adr, Condition cond) {
2904     if((0 == Rt->encoding_nocheck() % 2 &&
2905          (Rt->encoding_nocheck() + 1 == Rt2->encoding_nocheck())) &&
2906       (uabs(adr.offset()) < (1 << 8))) {
      /* Good to go with an strd */
2908       strd(Rt, adr, cond);
2909     } else {
2910       double_ldst_failed_dispatch(Rt, Rt2, adr, &Assembler::stm, &Assembler::str, cond);
2911     }
2912     return 0x0;
2913 }
2914 
2915 int MacroAssembler::double_ld_failed_dispatch(Register Rt, Register Rt2, const Address& adr,
2916         void (Assembler::* mul)(unsigned, const Address&, Condition),
2917         void (Assembler::* sgl)(Register, const Address&, Condition),
2918         Register Rtmp, Condition cond) {
2919   if (can_ldst_multiple(RegSet::of(Rt, Rt2).bits(), adr) &&
2920           (Rt->encoding_nocheck() < Rt2->encoding_nocheck())) {
2921     /* Do a load or store multiple instruction */
2922     (this->*mul)(RegSet::of(Rt, Rt2).bits(), adr, cond);
2923   } else if (!adr.uses(Rt)) {
2924     double_ldst_failed_dispatch(Rt, Rt2, adr, mul, sgl, cond);
2925   } else {
    // need to reshuffle the operations, otherwise the load into Rt destroys adr
    if (adr.get_mode() != Address::reg) {
      // offset-based addressing: only the base register is used by adr, and
      // since adr uses Rt, Rt2 cannot be part of adr
2929       if (adr.get_wb_mode() == Address::pre) {
2930         (this->*sgl)(Rt2, Address(pre(adr.base(), adr.offset() + wordSize)), cond);
2931         (this->*sgl)(Rt, Address(pre(adr.base(), -wordSize)), cond);
2932       } else if (adr.get_wb_mode() == Address::post) {
2933         (this->*sgl)(Rt2, Address(adr.base(), adr.offset() + wordSize), cond);
2934         (this->*sgl)(Rt, adr, cond);
2935       } else if (adr.get_wb_mode() == Address::off) {
2936         (this->*sgl)(Rt2, Address(adr.base(), adr.offset() + wordSize), cond);
2937         (this->*sgl)(Rt, adr, cond);
2938       } else {
2939         ShouldNotReachHere();
2940       }
2941     } else {
2942       // index-based addressing. both Rt and Rt2 could be used by adr
2943       // hence temp register is necessary
2944       adr.lea(this, Rtmp);
2945       double_ldst_failed_dispatch(Rt, Rt2, Address(Rtmp), mul, sgl, cond);
      // adr.lea performs only address arithmetic and cannot trap; the first
      // instruction that can raise an NPE is emitted inside
      // double_ldst_failed_dispatch, so shift the reported offset accordingly
2949       return 0x4;
2950     }
2951   }
2952   return 0x0;
2953 }
2954 
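// Fallback used when a single ldrd/strd cannot be emitted: do one ldm/stm
// when the address and register pair allow it, otherwise two single-word
// accesses, with the second access (and, for writeback forms, a final
// adjustment) addressed so that the pair ends up transferred exactly as the
// original address implied.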
2955 void MacroAssembler::double_ldst_failed_dispatch(Register Rt, Register Rt2, const Address& adr,
2956         void (Assembler::* mul)(unsigned, const Address&, Condition),
2957         void (Assembler::* sgl)(Register, const Address&, Condition),
2958         Condition cond) {
2959   if (can_ldst_multiple(RegSet::of(Rt, Rt2).bits(), adr) &&
2960           (Rt->encoding_nocheck() < Rt2->encoding_nocheck())) {
    /* Do a load/store multiple instruction */
2962     (this->*mul)(RegSet::of(Rt, Rt2).bits(), adr, cond);
2963   } else {
2964     if (adr.get_mode() != Address::reg) {
2965       // offset-based addressing
2966       if (adr.get_wb_mode() == Address::pre) {
2967         (this->*sgl)(Rt, adr, cond);
2968         (this->*sgl)(Rt2, Address(adr.base(), wordSize), cond);
2969       } else if (adr.get_wb_mode() == Address::post) {
2970         (this->*sgl)(Rt, adr, cond);
2971         (this->*sgl)(Rt2, Address(adr.base(), wordSize - adr.offset()), cond);
2972       } else if (adr.get_wb_mode() == Address::off) {
2973         (this->*sgl)(Rt, adr, cond);
2974         (this->*sgl)(Rt2, Address(adr.base(), adr.offset() + wordSize), cond);
2975       } else {
2976         ShouldNotReachHere();
2977       }
2978     } else {
2979       // index-based addressing
2980       if (adr.get_wb_mode() == Address::pre) {
2981         // current implementation does not use Address::pre for indexed access
2982         ShouldNotReachHere();
2983       } else if (adr.get_wb_mode() == Address::post) {
        // current implementation does not use Address::post for indexed access
2985         // enable the code below and implement proper post() method if it is required
2986         ShouldNotReachHere();
2987       } else if (adr.get_wb_mode() == Address::off) {
2988         (this->*sgl)(Rt, Address(pre(adr.base(), adr.index(), adr.shift(), adr.op())), cond);
2989         (this->*sgl)(Rt2, Address(adr.base(), wordSize), cond);
2990         compensate_addr_offset(adr, cond);
2991       } else {
2992         ShouldNotReachHere();
2993       }
2994     }
2995   }
2996 }
2997 
2998 #ifdef ASSERT
2999 void MacroAssembler::verify_stack_alignment() {
3000   if (StackAlignmentInBytes > 4) {
3001     Label x;
3002     tst(sp, StackAlignmentInBytes-1);
3003     b(x, EQ);
3004     stop("stack unaligned");
3005     bind(x);
3006   }
3007 }
3008 #endif
3009 
3010 /**
3011  * Emits code to update CRC-32 with a byte value according to constants in table
3012  *
3013  * @param [in,out]crc   Register containing the crc.
3014  * @param [in]val       Register containing the byte to fold into the CRC.
3015  * @param [in]table     Register containing the table of crc constants.
3016  *
3017  * uint32_t crc;
3018  * val = crc_table[(val ^ crc) & 0xFF];
3019  * crc = val ^ (crc >> 8);
3020  *
3021  */
3022 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
3023   eor(val, val, crc);
3024   andr(val, val, 0xff);
3025   ldr(val, Address(table, val, lsl(2)));
3026   eor(crc, val, crc, Assembler::lsr(8));
3027 }
3028 
3029 /**
3030  * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
3031  *
3032  * @param [in,out]crc   Register containing the crc.
 * @param [in]v         Register containing the 32-bit value to fold into the CRC.
3034  * @param [in]table0    Register containing table 0 of crc constants.
3035  * @param [in]table1    Register containing table 1 of crc constants.
3036  * @param [in]table2    Register containing table 2 of crc constants.
3037  * @param [in]table3    Register containing table 3 of crc constants.
3038  *
3039  * uint32_t crc;
3040  *   v = crc ^ v
3041  *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
3042  *
3043  */
3044 void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
3045         Register tmp2, Register table0, Register table1, Register table2, Register table3) {
3046   eor(v, crc, v);
3047   uxtb(tmp, v);
3048   uxtb(tmp2, v, ror(8));
3049   ldr(crc, Address(table3, tmp, lsl(2)));
3050   ldr(tmp2, Address(table2, tmp2, lsl(2)));
3051   uxtb(tmp, v, ror(16));
3052   eor(crc, crc, tmp2);
3053   uxtb(tmp2, v, ror(24));
3054   ldr(tmp, Address(table1, tmp, lsl(2)));
3055   ldr(tmp2, Address(table0, tmp2, lsl(2)));
3056   eor(crc, crc, tmp);
3057   eor(crc, crc, tmp2);
3058 }
3059 
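// The software path below is the standard "slicing-by-4" table-driven
// CRC-32: bytes are consumed one at a time until buf is word aligned, then
// a word at a time through tables 0-3 (two words per trip round
// L_by8_loop), then a byte-at-a-time tail. A rough C equivalent of one
// word step (matching update_word_crc32 above):
//
//   uint32_t v = crc ^ *(uint32_t*)buf;  buf += 4;
//   crc = table3[v & 0xff] ^ table2[(v >> 8) & 0xff]
//       ^ table1[(v >> 16) & 0xff] ^ table0[v >> 24];
//
// The CRC is bit-inverted on entry and exit (the usual CRC-32 pre- and
// post-conditioning).
//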
/**
 * @param crc    register containing the existing CRC (32-bit)
 * @param buf    register pointing to the input byte buffer (byte*)
 * @param len    register containing the number of bytes
 * @param table0 register that will contain the address of CRC table 0
 * @param table1 register that will contain the address of CRC table 1
 * @param table2 register that will contain the address of CRC table 2
 * @param table3 register that will contain the address of CRC table 3
 * @param tmp    scratch register
 * @param tmp2   scratch register
 * @param tmp3   scratch register
 */
3067 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
3068         Register table0, Register table1, Register table2, Register table3,
3069         Register tmp, Register tmp2, Register tmp3) {
3070   Label L_cpu, L_by8_loop, L_by1, L_by1_loop, L_align_by1_loop, L_align_exit, L_exit;
3071 
3072   inv(crc, crc);
3073   if (UseCRC32) {
3074     Label CRC_by4_loop, CRC_by1_loop;
3075 
3076       subs(len, len, 4);
3077       b(CRC_by4_loop, Assembler::GE);
3078       adds(len, len, 4);
3079       b(CRC_by1_loop, Assembler::GT);
3080       b(L_exit);
3081 
3082     BIND(CRC_by4_loop);
3083       ldr(tmp, Address(post(buf, 4)));
3084       subs(len, len, 4);
3085       crc32w(crc, crc, tmp);
3086       b(CRC_by4_loop, Assembler::GE);
3087       adds(len, len, 4);
3088       b(L_exit, Assembler::LE);
3089     BIND(CRC_by1_loop);
3090       ldrb(tmp, Address(post(buf, 1)));
3091       subs(len, len, 1);
3092       crc32b(crc, crc, tmp);
3093       b(CRC_by1_loop, Assembler::GT);
3094     BIND(L_exit);
3095       inv(crc, crc);
3096       return;
3097   }
3098     lea(table0, ExternalAddress(StubRoutines::crc_table_addr()));
3099     add(table1, table0, 1*256*sizeof(juint));
3100     add(table2, table0, 2*256*sizeof(juint));
3101     add(table3, table0, 3*256*sizeof(juint));
3102 
3103   BIND(L_align_by1_loop);
3104     tst(buf, 3);
3105     b(L_align_exit, Assembler::EQ);
3106     cmp(len, 0);
3107     b(L_exit, Assembler::EQ);
3108     sub(len, len, 1);
3109     ldrb(tmp, Address(post(buf, 1)));
3110     update_byte_crc32(crc, tmp, table0);
3111     b(L_align_by1_loop);
3112 
3113   BIND(L_align_exit);
3114 
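  // The Neon path needs at least 32 bytes of data plus up to 12 more to
  // bring buf (already word aligned here) to 16-byte alignment; anything
  // shorter falls through to the plain table-driven code at L_cpu.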
3115   if (UseNeon) {
3116       cmp(len, 32+12); // account for possible need for alignment
3117       b(L_cpu, Assembler::LT);
3118 
3119     Label L_fold, L_align_by4_loop, L_align_by4_exit;
3120 
3121     BIND(L_align_by4_loop);
3122       tst(buf, 0xf);
3123       b(L_align_by4_exit, Assembler::EQ);
3124       ldr(tmp, Address(post(buf, 4)));
3125       update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
3126       sub(len, len, 4);
3127       b(L_align_by4_loop);
3128 
3129     BIND(L_align_by4_exit);
3130 
3131       add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants
3132 
3133       vld1_64(d0, d1, post(buf, 16), Assembler::ALIGN_128);
3134       vld1_64(d4, post(tmp, 8), Assembler::ALIGN_64);
3135       vld1_64(d5, post(tmp, 8), Assembler::ALIGN_64);
3136       vld1_64(d6, post(tmp, 8), Assembler::ALIGN_64);
3137       vld1_64(d7, post(tmp, 8), Assembler::ALIGN_64);
3138       veor_64(d16, d16, d16);
3139       vmov_32(d16, 0, crc);
3140 
3141       veor_64(d0, d0, d16);
3142       sub(len, len, 32);
3143 
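    // Fold loop: carry-less multiply the 128-bit accumulator (q0) by the
    // folding constants in d4-d7 and xor in the next 16 bytes of input.
    // AArch32 Neon only has an 8x8-bit polynomial multiply (vmull.p8), so
    // the wide product is assembled from partial products by the
    // vuzp/vshll/veor sequence below.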
3144     BIND(L_fold);
3145       vmullp_8(q8, d0, d5);
3146       vmullp_8(q9, d0, d7);
3147       vmullp_8(q10, d0, d4);
3148       vmullp_8(q11, d0, d6);
3149 
3150       vmullp_8(q12, d1, d5);
3151       vmullp_8(q13, d1, d7);
3152       vmullp_8(q14, d1, d4);
3153       vmullp_8(q15, d1, d6);
3154 
3155       vuzp_128_16(q9, q8);
3156       veor_128(q8, q8, q9);
3157 
3158       vuzp_128_16(q13, q12);
3159       veor_128(q12, q12, q13);
3160 
3161       vshll_16u(q9, d16, 8);
3162       vshll_16u(q8, d17, 8);
3163 
3164       vshll_16u(q13, d24, 8);
3165       vshll_16u(q12, d25, 8);
3166 
3167       veor_128(q8, q8, q10);
3168       veor_128(q12, q12, q14);
3169       veor_128(q9, q9, q11);
3170       veor_128(q13, q13, q15);
3171 
3172       veor_64(d19, d19, d18);
3173       veor_64(d18, d27, d26);
3174 
3175       vshll_32u(q13, d18, 16);
3176       vshll_32u(q9, d19, 16);
3177 
3178       veor_128(q9, q8, q9);
3179       veor_128(q13, q12, q13);
3180 
3181       veor_64(d31, d26, d27);
3182       veor_64(d30, d18, d19);
3183 
3184       vshl_128_64(q15, q15, 1);
3185       vld1_64(d0, d1, post(buf, 16), Assembler::ALIGN_128);
3186       veor_128(q0, q0, q15);
3187 
3188       subs(len, len, 16);
3189       b(L_fold, Assembler::GE);
3190 
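      // Reduction: feed the folded 128-bit value in q0 through the
      // table-driven word update four times to get back a 32-bit CRC.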
3191       vmov_32(tmp, d0, 0);
3192       mov(crc, 0);
3193       update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
3194       vmov_32(tmp, d0, 1);
3195       update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
3196       vmov_32(tmp, d1, 0);
3197       update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
3198       vmov_32(tmp, d1, 1);
3199       update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
3200 
3201       add(len, len, 16);
3202   }
3203 
3204   BIND(L_cpu);
3205     subs(len, len, 8);
3206     b(L_by8_loop, Assembler::GE);
3207     adds(len, len, 8);
3208     b(L_by1_loop, Assembler::GT);
3209     b(L_exit);
3210 
3211   BIND(L_by8_loop);
3212     ldr(tmp, Address(post(buf, 4)));
3213     update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
3214     ldr(tmp, Address(post(buf, 4)));
3215     update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
3216     subs(len, len, 8);
3217     b(L_by8_loop, Assembler::GE);
3218     adds(len, len, 8);
3219     b(L_exit, Assembler::LE);
3220   BIND(L_by1_loop);
3221     subs(len, len, 1);
3222     ldrb(tmp, Address(post(buf, 1)));
3223     update_byte_crc32(crc, tmp, table0);
3224     b(L_by1_loop, Assembler::GT);
3225 
3226   BIND(L_exit);
3227     inv(crc, crc);
3228 }
3229 
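// Clears 'width' bits of Rd starting at bit 'lsb' without using the bfc
// instruction. Roughly equivalent C (a sketch, assuming 0 < width and
// lsb + width <= 32):
//
//   Rd &= ~(uint32_t)(((1ull << width) - 1) << lsb);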
void MacroAssembler::bfc_impl(Register Rd, int lsb, int width, Condition cond) {
  // FIXME: cond is accepted but not applied to the emitted instructions.
  if (width > 15 && lsb == 0) {
    // Clear the low 'width' bits by shifting them out and back.
    lsr(Rd, Rd, width);
    lsl(Rd, Rd, width);
  } else if (width > 15 && lsb + width == 32) {
    // Clear the high 'width' bits by shifting them out and back.
    lsl(Rd, Rd, 32 - lsb);
    lsr(Rd, Rd, 32 - lsb);
  } else {
    // Clear the field with a sequence of bic instructions. An ARM modified
    // immediate is an 8-bit value rotated right by an even amount, so one
    // bic can clear at most 8 bits, and the first chunk is trimmed to 7
    // bits when lsb is odd so that its mask stays encodable; every later
    // chunk then starts at an even bit position.
    const int lsb1 = (lsb & 1);
    int w1 = width <= 8 - lsb1 ? width : 8 - lsb1;
    while (width) {
      bic(Rd, Rd, ((1 << w1) - 1) << lsb);
      width -= w1;
      lsb += w1;
      w1 = width > 8 ? 8 : width;
    }
  }
}