/*
 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * Copyright (c) 2015, Linaro Ltd. All rights reserved.
 * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#ifndef CPU_AARCH32_VM_MACROASSEMBLER_AARCH32_HPP
#define CPU_AARCH32_VM_MACROASSEMBLER_AARCH32_HPP

#include "asm/assembler.hpp"
#include "nativeInst_aarch32.hpp"

// MacroAssembler extends Assembler by frequently used macros.
//
// Instructions for which a 'better' code sequence exists depending
// on arguments should also go in here.

class MacroAssembler: public Assembler {
  friend class LIR_Assembler;
  friend class G1BarrierSetAssembler;

  using Assembler::mov;

 protected:

  // Support for VM calls
  //
  // This is the base routine called by the different versions of call_VM_leaf. The interpreter
  // may customize this version by overriding it for its purposes (e.g., to save/restore
  // additional registers when doing a VM call).
  virtual void call_VM_leaf_base(
    address entry_point,               // the entry point
    int     number_of_arguments,       // the number of arguments to pop after the call
    Label *retaddr = NULL
  );

  virtual void call_VM_leaf_base(
    address entry_point,               // the entry point
    int     number_of_arguments,       // the number of arguments to pop after the call
    Label &retaddr) {
    call_VM_leaf_base(entry_point, number_of_arguments, &retaddr);
  }

  // This is the base routine called by the different versions of call_VM. The interpreter
  // may customize this version by overriding it for its purposes (e.g., to save/restore
  // additional registers when doing a VM call).
  //
  // If no java_thread register is specified (noreg) then rthread will be used instead. call_VM_base
  // returns the register which contains the thread upon return. If a thread register has been
  // specified, the return value will correspond to that register. If no last_java_sp is specified
  // (noreg) then rsp will be used instead.
  virtual void call_VM_base(           // returns the register containing the thread upon return
    Register oop_result,               // where an oop-result ends up if any; use noreg otherwise
    Register java_thread,              // the thread if computed before     ; use noreg otherwise
    Register last_java_sp,             // to set up last_Java_frame in stubs; use noreg otherwise
    address  entry_point,              // the entry point
    int      number_of_arguments,      // the number of arguments (w/o thread) to pop after the call
    bool     check_exceptions          // whether to check for pending exceptions after return
  );

 public:
  void init_unseen_bytecodes();
  MacroAssembler(CodeBuffer* code) : Assembler(code) { init_unseen_bytecodes(); }

  // These routines should emit JVMTI PopFrame and ForceEarlyReturn handling code.
  // The implementation is only non-empty for the InterpreterMacroAssembler,
  // as only the interpreter handles PopFrame and ForceEarlyReturn requests.
  virtual void check_and_handle_popframe(Register java_thread);
  virtual void check_and_handle_earlyret(Register java_thread);

  void safepoint_poll(Label& slow_path);
  void safepoint_poll_acquire(Label& slow_path);

  void call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions = true);

  // Biased locking support
  // obj_reg must be loaded up with the appropriate values.
  // swap_reg is killed.
  // tmp_reg and tmp_reg2 must be supplied.
  // Optional slow case is for implementations (interpreter and C1) which branch to
  // slow case directly. Leaves condition codes set for C2's Fast_Lock node.
  // Returns offset of first potentially-faulting instruction for null
  // check info (currently consumed only by C1). If
  // swap_reg_contains_mark is true then returns -1 as it is assumed
  // the calling code has already passed any potential faults.
  int biased_locking_enter(Register obj_reg,
                           Register swap_reg, Register tmp_reg, Register tmp_reg2,
                           bool swap_reg_contains_mark,
                           Label& done, Label* slow_case = NULL,
                           BiasedLockingCounters* counters = NULL);
  void biased_locking_exit(Register obj_reg, Register temp_reg, Label& done);


  // Helper functions for statistics gathering.
  // Unconditional atomic increment.
  void atomic_inc(Register counter_addr, Register tmp);
  void atomic_inc(Address counter_addr, Register tmp1, Register tmp2) {
    lea(tmp1, counter_addr);
    atomic_inc(tmp1, tmp2);
  }
  // Load Effective Address
  void lea(Register r, const Address &a) {
    InstructionMark im(this);
    code_section()->relocate(inst_mark(), a.rspec());
    a.lea(this, r);
  }

  virtual void _call_Unimplemented(address call_site) {
    mov(rscratch2, call_site);
    stop("HALT");
  }

#define call_Unimplemented() _call_Unimplemented((address)__PRETTY_FUNCTION__)

// Macro assembly operations needed for AArch32

private:

  int push(unsigned int bitset, Register stack);
  int pop(unsigned int bitset, Register stack);

public:

  void mov(Register dst, Address a, Condition cond = C_DFLT);

  void push(RegSet regs, Register stack) { if (regs.bits()) push(regs.bits(), stack); }
  void pop(RegSet regs, Register stack) { if (regs.bits()) pop(regs.bits(), stack); }
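
  // Illustrative use (not from the original source): save a set of registers
  // around a clobbering sequence with the RegSet-based push/pop above, e.g.
  //   push(RegSet::of(r0, r1, r2), sp);
  //   ... code that clobbers r0-r2 ...
  //   pop(RegSet::of(r0, r1, r2), sp);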

  // Now mov instructions for loading absolute addresses and 32-bit immediates.

  inline void mov(Register dst, address addr, Condition cond = C_DFLT) {
    // TODO: Do Address values end up as address and then pass through this method after
    // being marked for relocation elsewhere? If not (as I suspect), then this can
    // be relaxed to mov_immediate to potentially produce shorter code sequences.
    mov_immediate32(dst, (uint32_t)addr, cond, false);
  }

  inline void mov(Register dst, long l, Condition cond = C_DFLT) {
    mov(dst, (uint32_t)l, cond);
  }
  inline void mov(Register dst, unsigned long l, Condition cond = C_DFLT) {
    mov(dst, (uint32_t)l, cond);
  }
  inline void mov(Register dst, int i, Condition cond = C_DFLT) {
    mov(dst, (uint32_t)i, cond);
  }
#ifdef COMPILER2
  inline void mov(Register dst, jlong i, Condition cond = C_DFLT) {
    assert(!(i >> 32), "must be 32-bit"); // really a 32-bit value contained in a jlong, not sign extended!
    mov(dst, (uint32_t)i, cond);
  }
  inline void mov(Register dst, julong i, Condition cond = C_DFLT) {
    assert(!(i >> 32), "must be 32-bit");
    mov(dst, (uint32_t)i, cond);
  }
#endif
  inline void mov(Register dst, uint32_t i, Condition cond = C_DFLT) {
    mov_immediate(dst, i, cond, false);
  }

  inline void mov(Register dst, Register src, Condition cond = C_DFLT) {
    Assembler::mov(dst, src, cond);
  }
  inline void mov(Register dst, Register src, shift_op shift,
                  Condition cond = C_DFLT) {
    Assembler::mov(dst, src, shift, cond);
  }
  // TODO: add sflag compatibility
  void movptr(Register r, uintptr_t imm32, Condition cond = C_DFLT);

  // To reduce the chance of mistakes, these overload the mvn(Register, Register) variant.
  using Assembler::mvn;
  using Assembler::mvns;
  inline void mvn(Register dst, uint32_t i, Condition cond = C_DFLT) {
    mov_immediate(dst, ~i, cond, false);
  }
  inline void mvns(Register dst, uint32_t i, Condition cond = C_DFLT) {
    mov_immediate(dst, ~i, cond, true);
  }

  void ret(Register reg);

  // These are AArch64 instructions that can easily be emulated.
  // Note that they do not have quite the same semantics as the
  // AArch64 versions, as these update the condition flags.
  void cbz(Register r, Label& l) {
    cmp(r, 0);
    b(l, EQ);
  }
  void cbnz(Register r, Label& l) {
    cmp(r, 0);
    b(l, NE);
  }
  void tbz(Register r, unsigned bit, Label& l) {
    tst(r, 1 << bit);
    b(l, EQ);
  }
  void tbnz(Register r, unsigned bit, Label& l) {
    tst(r, 1 << bit);
    b(l, NE);
  }
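
  // Illustrative expansion (a sketch, not from the original source): each idiom
  // above maps onto two AArch32 instructions, e.g.
  //   cbz(r0, L_is_null);     // emits: cmp r0, #0; beq L_is_null
  //   tbnz(r1, 0, L_is_odd);  // emits: tst r1, #1; bne L_is_odd
  // Unlike on AArch64, the cmp/tst clobbers the condition flags.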

  // Add register to memory word
  void addmw(Address a, Register incr, Register scratch) {
    ldr(scratch, a);
    add(scratch, scratch, incr);
    str(scratch, a);
  }

  // Add constant to memory word
  void addmw(Address a, int imm, Register scratch) {
    ldr(scratch, a);
    if (imm > 0)
      add(scratch, scratch, (unsigned)imm);
    else
      sub(scratch, scratch, (unsigned)-imm);
    str(scratch, a);
  }
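
  // Illustrative use (not from the original source; counter_offset is hypothetical):
  //   addmw(Address(rthread, counter_offset), 1, rscratch1);
  // Note the ldr/add/str sequence is not atomic; use atomic_inc for counters
  // shared between threads.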

// XXX stubs

  // macro instructions for accessing and updating floating point
  // status register
  //
  // FPSR : op1 == 011
  //        CRn == 0100
  //        CRm == 0100
  //        op2 == 001

  inline void get_fpsr(Register reg = as_Register(0xf)) {
    vmrs(reg);
  }

  inline void set_fpsr(Register reg) {
    vmsr(reg);
  }

  inline void clear_fpsr() {
    mov(rscratch1, 0);
    set_fpsr(rscratch1);
  }

  // Support for NULL-checks
  //
  // Generates code that causes a NULL OS exception if the content of reg is NULL.
  // If the accessed location is M[reg + offset] and the offset is known, provide the
  // offset. No explicit code generation is needed if the offset is within a certain
  // range (0 <= offset <= page_size).

  virtual void null_check(Register reg, int offset = -1);
  static bool needs_explicit_null_check(intptr_t offset);
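
  // Illustrative use (a sketch, not from the original source): before loading
  // from M[robj + 8], where robj is some hypothetical object register,
  //   null_check(robj, 8);
  // should emit no explicit test when the offset lies within the protected page
  // range, relying on the OS trap instead; needs_explicit_null_check(offset)
  // decides when a real test is required.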

  static address target_addr_for_insn(address insn_addr, unsigned insn);
  static address target_addr_for_insn(address insn_addr) {
    unsigned insn = *(unsigned*)insn_addr;
    return target_addr_for_insn(insn_addr, insn);
  }

  // Required platform-specific helpers for Label::patch_instructions.
  // They _shadow_ the declarations in AbstractAssembler, which are undefined.
  static int pd_patch_instruction_size(address branch, address target);
  static void pd_patch_instruction(address branch, address target) {
    pd_patch_instruction_size(branch, target);
  }

#ifndef PRODUCT
  static void pd_print_patched_instruction(address branch);
#endif

  static int patch_oop(address insn_addr, address o);

  // The following 4 methods return the offset of the appropriate move instruction.

  // Support for fast byte/short loading with zero extension (depending on particular CPU)
  int load_unsigned_byte(Register dst, Address src);
  int load_unsigned_short(Register dst, Address src);

  // Support for fast byte/short loading with sign extension (depending on particular CPU)
  int load_signed_byte(Register dst, Address src);
  int load_signed_short(Register dst, Address src);

  // Support for sign-extension (hi:lo = extend_sign(lo))
  void extend_sign(Register hi, Register lo);

  // Load and store values by size and signed-ness
  void load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2 = noreg);
  void store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2 = noreg);

  // Support for inc/dec with optimal instruction selection depending on value.
  // increment()/decrement() calls with an address destination will need to use
  // rscratch1 to load the value to be incremented. increment()/decrement()
  // calls which add or subtract a constant value greater than 2^12 will need
  // to use rscratch2 to hold the constant. So, a register increment()/
  // decrement() may trash rscratch2, and an address increment()/decrement()
  // may trash rscratch1 and rscratch2.
  void decrement(Register reg, int value = 1);
  void decrement(Address dst, int value = 1);
  void increment(Register reg, int value = 1);
  void increment(Address dst, int value = 1);

  // Alignment
  void align(int modulus);

  // Stack frame creation/removal
  //
  // VM and interpreter code may have different stack layouts. The default enter/leave
  // layout is selected by the FrameAPCS option. One can make enter/leave use
  // VMFrameAPCS instead.
  void enter(bool as_apcs = FrameAPCS) {
    if (as_apcs) {
      mov(rscratch2, sp);
      stmdb(sp, RegSet::of(rfp, rscratch2, lr, r15_pc).bits());
      sub(rfp, rscratch2, 4);
    } else {
      stmdb(sp, RegSet::of(rfp, lr).bits());
      add(rfp, sp, wordSize);
    }
  }

  void leave(bool as_apcs = FrameAPCS) {
    if (as_apcs) {
      ldmea(rfp, RegSet::of(rfp, sp, lr).bits(), false/*wb*/);
    } else {
      sub(sp, rfp, wordSize);
      ldmia(sp, RegSet::of(rfp, lr).bits());
    }
  }
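
  // For reference (our reading of the code above, not an authoritative layout):
  // enter(true) saves {rfp, old sp, lr, pc} with a single stmdb and then points
  // rfp at old_sp - 4, giving an APCS-style frame; leave(true) unwinds it by
  // reloading rfp, sp and lr from that frame via ldmea. The non-APCS variant
  // saves only {rfp, lr} and sets rfp to sp + wordSize afterwards.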

  // Support for getting the JavaThread pointer (i.e., a reference to thread-local information)
  // The pointer will be loaded into the thread register.
  void get_thread(Register thread);

  enum ret_type { ret_type_void, ret_type_integral, ret_type_float, ret_type_double };
  // Support for VM calls
  //
  // It is imperative that all calls into the VM are handled via the call_VM macros.
  // They make sure that the stack linkage is set up correctly. call_VM's correspond
  // to ENTRY/ENTRY_X entry points while call_VM_leaf's correspond to LEAF entry points.


  void call_VM(Register oop_result,
               address entry_point,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               address entry_point,
               Register arg_1,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               address entry_point,
               Register arg_1, Register arg_2,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               address entry_point,
               Register arg_1, Register arg_2, Register arg_3,
               bool check_exceptions = true);

  // Overloadings with last_Java_sp
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               int number_of_arguments = 0,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               Register arg_1,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               Register arg_1, Register arg_2,
               bool check_exceptions = true);
  void call_VM(Register oop_result,
               Register last_java_sp,
               address entry_point,
               Register arg_1, Register arg_2, Register arg_3,
               bool check_exceptions = true);

  void get_vm_result  (Register oop_result, Register thread);
  void get_vm_result_2(Register metadata_result, Register thread);

  // These always tightly bind to MacroAssembler::call_VM_base,
  // bypassing the virtual implementation.
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, int number_of_arguments = 0, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, bool check_exceptions = true);
  void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4, bool check_exceptions = true);

  void call_VM_leaf(address entry_point,
                    int number_of_arguments = 0);
  void call_VM_leaf(address entry_point,
                    Register arg_1);
  void call_VM_leaf(address entry_point,
                    Register arg_1, Register arg_2);
  void call_VM_leaf(address entry_point,
                    Register arg_1, Register arg_2, Register arg_3);

  // These always tightly bind to MacroAssembler::call_VM_leaf_base,
  // bypassing the virtual implementation.
  void super_call_VM_leaf(address entry_point);
  void super_call_VM_leaf(address entry_point, Register arg_1);
  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2);
  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3);
  void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4);

  // last Java Frame (fills frame anchor)
  void set_last_Java_frame(Register last_java_sp,
                           Register last_java_fp,
                           address last_java_pc,
                           Register scratch);

  void set_last_Java_frame(Register last_java_sp,
                           Register last_java_fp,
                           Label &last_java_pc,
                           Register scratch);

  void set_last_Java_frame(Register last_java_sp,
                           Register last_java_fp,
                           Register last_java_pc,
                           Register scratch);

  void reset_last_Java_frame(Register thread);

  // thread in the default location (rthread)
  void reset_last_Java_frame(bool clear_fp);

  // Stores
  void store_check(Register obj);                // store check for obj - register is destroyed afterwards
  void store_check(Register obj, Address dst);   // same as above, dst is exact store location (reg. is destroyed)

  void resolve_jobject(Register value, Register thread, Register tmp);

  // C 'boolean' to Java boolean: x == 0 ? 0 : 1
  void c2bool(Register x);

  // oop manipulations
  void load_klass(Register dst, Register src);
  void store_klass(Register dst, Register src);
  void cmp_klass(Register oop, Register trial_klass, Register tmp);

  void resolve_oop_handle(Register result, Register tmp);
  void load_mirror(Register dst, Register method, Register tmp);

  void access_load_word_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
                           Register tmp1, Register tmp_thread);

  void access_store_word_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
                            Register tmp1, Register tmp_thread);

  void access_load_tos_at(BasicType type, DecoratorSet decorators, Address src,
                          Register tmp1, Register tmp_thread);

  void access_store_tos_at(BasicType type, DecoratorSet decorators, Address dst,
                           Register tmp1, Register tmp_thread);

  void load_heap_oop(Register dst, Address src, Register tmp1 = noreg,
                     Register thread_tmp = noreg, DecoratorSet decorators = 0);

  void load_heap_oop_not_null(Register dst, Address src, Register tmp1 = noreg,
                              Register thread_tmp = noreg, DecoratorSet decorators = 0);
  void store_heap_oop(Address dst, Register src, Register tmp1 = noreg,
                      Register tmp_thread = noreg, DecoratorSet decorators = 0);

  // Used for storing NULL. All other oop constants should be
  // stored using routines that take a jobject.
  void store_heap_oop_null(Address dst, Register tmp);

  void load_prototype_header(Register dst, Register src);

  void store_klass_gap(Register dst, Register src);

  // This dummy is to prevent a call to store_heap_oop from
  // converting a zero (like NULL) into a Register by giving
  // the compiler two choices it can't resolve.

  void store_heap_oop(Address dst, void* dummy);

  // Push and pop everything that might be clobbered by a native
  // runtime call except rscratch1 and rscratch2.  (They are always
  // scratch, so we don't have to protect them.)  Only f0-f15 are
  // saved; f32-f63 are not saved even if present.
  void push_call_clobbered_registers();
  void pop_call_clobbered_registers();

  void push_CPU_state();
  void pop_CPU_state();

  // Round up to a power of two
  void round_to(Register reg, int modulus);

  // allocation
  void eden_allocate(
    Register obj,                      // result: pointer to object after successful allocation
    Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
    int      con_size_in_bytes,        // object size in bytes if   known at compile time
    Register t1,                       // temp register
    Label&   slow_case                 // continuation point if fast allocation fails
  );
  void tlab_allocate(
    Register obj,                      // result: pointer to object after successful allocation
    Register var_size_in_bytes,        // object size in bytes if unknown at compile time; invalid otherwise
    int      con_size_in_bytes,        // object size in bytes if   known at compile time
    Register t1,                       // temp register
    Register t2,                       // temp register
    Label&   slow_case                 // continuation point if fast allocation fails
  );

  void zero_memory(Register addr, Register len, Register t1);
  void verify_tlab();

  // interface method calling
  void lookup_interface_method(Register recv_klass,
                               Register intf_klass,
                               RegisterOrConstant itable_index,
                               Register method_result,
                               Register scan_temp,
                               Label& no_such_interface,
                               bool return_method = true);

  // virtual method calling
  // n.b. x86 allows RegisterOrConstant for vtable_index
  void lookup_virtual_method(Register recv_klass,
                             RegisterOrConstant vtable_index,
                             Register method_result);

  // Test sub_klass against super_klass, with fast and slow paths.

  // The fast path produces a tri-state answer: yes / no / maybe-slow.
  // One of the three labels can be NULL, meaning take the fall-through.
  // If super_check_offset is -1, the value is loaded up from super_klass.
  // No registers are killed, except temp_reg.
  void check_klass_subtype_fast_path(Register sub_klass,
                                     Register super_klass,
                                     Register temp_reg,
                                     Label* L_success,
                                     Label* L_failure,
                                     Label* L_slow_path,
                                     RegisterOrConstant super_check_offset = RegisterOrConstant(-1));

  // The rest of the type check; must be wired to a corresponding fast path.
  // It does not repeat the fast path logic, so don't use it standalone.
  // The temp_reg and temp2_reg can be noreg, if no temps are available.
  // Updates the sub's secondary super cache as necessary.
  // If set_cond_codes, condition codes will be Z on success, NZ on failure.
  void check_klass_subtype_slow_path(Register sub_klass,
                                     Register super_klass,
                                     Register temp_reg,
                                     Register temp2_reg,
                                     Label* L_success,
                                     Label* L_failure,
                                     bool set_cond_codes = false);

  // Simplified, combined version, good for typical uses.
  // Falls through on failure.
  void check_klass_subtype(Register sub_klass,
                           Register super_klass,
                           Register temp_reg,
                           Label& L_success);

  Address argument_address(RegisterOrConstant arg_slot, int extra_slot_offset = 0);


  // Debugging

  // only if +VerifyOops
  void verify_oop(Register reg, const char* s = "broken oop");
  void verify_oop_addr(Address addr, const char* s = "broken oop addr");

  // TODO: verify method and klass metadata (compare against vptr?)
  void _verify_method_ptr(Register reg, const char* msg, const char* file, int line) {}
  void _verify_klass_ptr(Register reg, const char* msg, const char* file, int line) {}

#define verify_method_ptr(reg) _verify_method_ptr(reg, "broken method " #reg, __FILE__, __LINE__)
#define verify_klass_ptr(reg) _verify_klass_ptr(reg, "broken klass " #reg, __FILE__, __LINE__)

  // only if +VerifyFPU
  void verify_FPU(int stack_depth, const char* s = "illegal FPU state");

  // prints msg, dumps registers and stops execution
  void stop(const char* msg);

  // prints msg and continues
  void warn(const char* msg);

  static void debug32(char* msg, int32_t pc, int32_t regs[]);

  void untested()                                { stop("untested"); }

  void unimplemented(const char* what = "");

#define should_not_reach_here() should_not_reach_here_line(__FILE__, __LINE__)
  void should_not_reach_here_line(const char *file, int line) {
#ifdef ASSERT
    mov(rscratch1, line);
    reg_printf_important(file);
    reg_printf_important(": %d", rscratch1);
#endif
    stop("should_not_reach_here");
  }

  // Stack overflow checking
  void bang_stack_with_offset(int offset) {
    // stack grows down, caller passes positive offset
    assert(offset > 0, "must bang with negative offset");
    // bang with random value from r0
    if (operand_valid_for_add_sub_immediate(offset)) {
      sub(rscratch2, sp, offset);
      strb(r0, Address(rscratch2));
    } else {
      mov(rscratch2, offset);
      strb(r0, Address(sp, rscratch2, Assembler::lsl(), Address::SUB));
    }
  }
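
  // Illustrative use (not from the original source): probe one page below the
  // current stack pointer during frame setup,
  //   bang_stack_with_offset(os::vm_page_size());
  // The positive offset is subtracted from sp (the stack grows down), which is
  // what the "negative offset" wording in the assert refers to.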

  // Writes to stack successive pages until offset reached to check for
  // stack overflow + shadow pages. Also clobbers tmp.
  void bang_stack_size(Register size, Register tmp);

  // Check for reserved stack access in method being exited (for JIT)
  void reserved_stack_check();

  virtual RegisterOrConstant delayed_value_impl(intptr_t* delayed_value_addr,
                                                Register tmp,
                                                int offset);

  // Support for serializing memory accesses between threads
  void serialize_memory(Register thread, Register tmp);

  // Arithmetics

  void addptr(Address dst, int32_t src) {
    lea(rscratch2, dst);
    ldr(rscratch1, Address(rscratch2));
    add(rscratch1, rscratch1, src);
    str(rscratch1, Address(rscratch2));
  }

  void cmpptr(Register src1, Address src2);
  void cmpoop(Register obj1, Register obj2);

  void cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
                          Label &succeed, Label *fail);
  void cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
                  Label &succeed, Label *fail);

  void cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
                Label &succeed, Label *fail);

  void atomic_add(Register prev, RegisterOrConstant incr, Register addr);
  void atomic_addw(Register prev, RegisterOrConstant incr, Register addr);

  void atomic_xchg(Register prev, Register newv, Register addr);
  void atomic_xchgw(Register prev, Register newv, Register addr);

  void orptr(Address adr, RegisterOrConstant src) {
    ldr(rscratch1, adr);
    if (src.is_register())
      orr(rscratch1, rscratch1, src.as_register());
    else
      orr(rscratch1, rscratch1, src.as_constant());
    str(rscratch1, adr);
  }

  // Calls

  void trampoline_call(Address entry, CodeBuffer *cbuf = NULL);

  static bool far_branches() {
    return ReservedCodeCacheSize > branch_range;
  }

  // Jumps that can reach anywhere in the code cache.
  // Trashes tmp.
  void far_call(Address entry, CodeBuffer *cbuf = NULL);
  void far_jump(Address entry, CodeBuffer *cbuf = NULL, Register tmp = rscratch1);

  static int far_branch_size() {
    if (far_branches()) {
      if (VM_Version::features() & (FT_ARMV7 | FT_ARMV6T2)) {
        return 3 * NativeInstruction::arm_insn_sz;  // movw, movt, br
      } else {
        return 5 * NativeInstruction::arm_insn_sz;  // mov, 3 orr, br
      }
    } else {
      return NativeInstruction::arm_insn_sz; // br
    }
  }
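
  // For reference (our reading of the counts above): without movw/movt
  // (pre-ARMv6T2), a 32-bit target must be built 8 bits at a time, e.g.
  //   mov rX, #byte0; orr rX, rX, #byte1 << 8; orr ...; orr ...
  // followed by the branch, giving the 5-instruction worst case.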

  // Emit the CompiledIC call idiom
  void ic_call(address entry, jint method_index = 0);

  // Data
  void mov_metadata(Register dst, Metadata* obj);
  Address allocate_metadata_address(Metadata* obj);
  Address constant_oop_address(jobject obj);

  void movoop(Register dst, jobject obj, bool immediate = false);

  void far_load(Register dst, address addr);
  void far_load_oop(Register dst, int oop_index);
  void far_load_metadata(Register dst, int metadata_index);
  void far_load_const(Register dst, address const);


  // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
  void kernel_crc32(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3, int is_crc32c);
  // AES code for com.sun.crypto.provider.AESCrypt::encryptBlock() intrinsic.
  void kernel_aescrypt_encryptBlock(Register from, Register to, Register key, Register keylen,
        Register table1,
        Register t0, Register t1, Register t2, Register t3,
        Register t4, Register t5, Register t6, Register t7);
  void kernel_aescrypt_decryptBlock(Register from, Register to, Register key, Register keylen,
        Register table1,
        Register t0, Register t1, Register t2, Register t3,
        Register t4, Register t5, Register t6, Register t7);
  void kernel_aescrypt_round(Register table_te, Register key,
        Register t0, Register t1, Register t2, Register t3,
        Register a, Register tmp1, Register tmp2);
  void kernel_aescrypt_firstRound(Register in, Register key,
        Register t0, Register t1, Register t2, Register t3,
        Register t4, Register t5, Register t6, Register t7);
  void kernel_aescrypt_lastRound(
        Register table_te, Register key, Register to,
        Register t0, Register t1, Register t2, Register t3,
        Register t4, Register t5, Register t6, Register t7);
  void kernel_aescrypt_lastRound_cbc(
        Register table_te,
        Register t0, Register t1, Register t2, Register t3,
        Register t4, Register t5, Register t6);

  void kernel_aescrypt_encrypt(Register from, Register to, Register key, Register rvec,
        Register len, Register keylen, Register table1,
        Register t0, Register t1, Register t2, Register t3,
        Register t4, Register t5, Register t6);
  void kernel_aescrypt_decrypt(Register from, Register to, Register key, Register rvec,
        Register len, Register keylen, Register table1,
        Register t0, Register t1, Register t2, Register t3,
        Register t4, Register t5, Register t6);

  void sha_round1(Register st_b, Register st_c, Register st_d,
        Register tmp, Register st_f, int sh);

  void sha_round2(Register st_b, Register st_c, Register st_d,
        Register tmp, Register st_f, int sh);

  void sha_round3(Register st_b, Register st_c, Register st_d,
        Register tmp, Register st_f, int sh);

  void sha_w0(FloatRegister w16, FloatRegister w14,
        FloatRegister w8, FloatRegister w4, FloatRegister w2,
        FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4,
        FloatRegister st_k, FloatRegister st_kw, bool update);

  void sha_w(FloatRegister w16, FloatRegister w14,
        FloatRegister w12, FloatRegister w10, FloatRegister w8,
        FloatRegister w6, FloatRegister w4, FloatRegister w2,
        FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4,
        FloatRegister st_k, FloatRegister st_kw, Register counter, Register rtmp,
        bool update = true);

  void kernel_sha_implCompress(Register from, Register state,
        Register counter, Register table_k,
        Register st_a, Register st_b,
        Register st_c, Register st_d, Register st_e,
        Register tmp, Register counter2, Register st_new_a, Register st_w);

  void sha256_implCompress_iter(
      Register ra, Register rb, Register rc, Register rd,
      Register re, Register rf, Register rg, Register rh,
      FloatRegister Dkw1, FloatRegister Dkw2,
      Register step,
      Register tmp,
      Register ra2, Register re2);
  void sha256_implCompress_iter0(
      Register Da, Register Db, Register Dc, Register Dd,
      Register De, Register Df, Register Dg, Register Dh,
      FloatRegister Dkw, int index,
      Register Dtmp,
      Register Dnew_a, Register Dnew_e);
  void sha256_w0(
      FloatRegister w_m16, FloatRegister w_m15, FloatRegister w_m14,
      FloatRegister w_m7, FloatRegister w_m6,
      FloatRegister w_m2,
      FloatRegister Qtmp_S0, FloatRegister Qtmp_S1,
      FloatRegister Qtmp1);
  void sha256_w(FloatRegister w16, FloatRegister w14,
        FloatRegister w12, FloatRegister w10, FloatRegister w8,
        FloatRegister w6, FloatRegister w4, FloatRegister w2,
        FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3,
        FloatRegister st_kw, Register counter, Register rtmp);

  void kernel_sha256_implCompress(Register from, Register state,
        Register counter, Register table_k,
        Register ra, Register rb, Register rc, Register rd, Register re,
        Register rf, Register rg, Register rh,
        Register ra2, Register re2);

  void kernel_sha512_implCompress(Register from, Register state,
        Register counter, Register table_k);

  void sha512_sigma(FloatRegister x,
        FloatRegister Qtmp, FloatRegister Dsigma, int sh1, int sh2, int sh3);
  void sha512_delta(FloatRegister x,
        FloatRegister Qtmp, FloatRegister Ddelta, int sh1, int sh2, int sh3);
  void sha512_ch(FloatRegister x, FloatRegister y, FloatRegister z,
        FloatRegister Dtmp, FloatRegister Dch);
  void sha512_maj(FloatRegister x, FloatRegister y, FloatRegister z,
        FloatRegister Dtmp, FloatRegister Dmaj);

  // Stack push and pop individual 64 bit registers
  void push(Register src);
  void pop(Register dst);

  // push all registers onto the stack
  void pusha();
  void popa();

  void repne_scan(Register addr, Register value, Register count,
                  Register scratch);
  void repne_scanw(Register addr, Register value, Register count,
                   Register scratch);

  // Form an address from base + offset in Rd. Rd may or may not actually be
  // used: you must use the Address that is returned. It is up to you to ensure
  // that the shift provided matches the size of your data.
  Address form_address(Register Rd, Register base, long byte_offset, int shift);

 public:

  void ldr_constant(Register dest, const Address &const_addr) {
    if (NearCpool) {
      ldr(dest, const_addr);
    } else {
      mov(dest, InternalAddress(const_addr.target()));
      ldr(dest, dest);
    }
  }
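
  // A note on the two paths above (our reading): with NearCpool the constant is
  // within the pc-relative range of a single ldr; otherwise the literal's address
  // is materialized into dest first and then dereferenced, costing an extra mov.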

  address read_polling_page(Register r, address page, relocInfo::relocType rtype);
  address read_polling_page(Register r, relocInfo::relocType rtype);
  void get_polling_page(Register dest, address page, relocInfo::relocType rtype);

  // BigInteger intrinsics
  void multiply_to_len(Register x, Register xlen, Register y, Register ylen,
                        Register z, Register zlen,
                        Register tmp1, Register tmp2, Register tmp3, Register tmp4,
                        Register tmp5, Register tmp6);
  void mul_add(Register out, Register in, Register offset, Register len, Register k,
                        Register tmp1, Register tmp2, Register tmp3);

  // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
  void update_byte_crc32(Register crc, Register val, Register table);
  void update_word_crc32(Register crc, Register v, Register tmp, Register tmp2,
        Register table0, Register table1, Register table2, Register table3);
//  void update_byte_crc32c(Register crc, Register val, Register table);
  void update_word_crc32c(Register crc, Register v, Register tmp, Register tmp2,
        Register table0, Register table1, Register table2, Register table3);

  // Auto dispatch for barriers isb, dmb & dsb.
  void isb() {
    if (VM_Version::features() & FT_ARMV7) {
      Assembler::isb();
    } else {
      cp15isb();
    }
  }

  void dsb(enum barrier option) {
    if (VM_Version::features() & FT_ARMV7) {
      Assembler::dsb(option);
    } else {
      cp15dsb();
    }
  }

  void dmb(enum barrier option) {
    if (VM_Version::features() & FT_ARMV7) {
      Assembler::dmb(option);
    } else {
      cp15dmb();
    }
  }

  void membar(Membar_mask_bits order_constraint) {
    dmb(Assembler::barrier(order_constraint));
  }

  // ISB may be needed because of a safepoint
  void maybe_isb() { MacroAssembler::isb(); }

  // Helper functions for 64-bit multiplication, division and remainder
  // does <Rd+1:Rd> = <Rn+1:Rn> * <Rm+1:Rm>
  void mult_long(Register Rd, Register Rn, Register Rm);
  // does <Rdh:Rd> = <Rnh:Rn> * <Rmh:Rm>
  void mult_long(Register Rd, Register Rdh, Register Rn, Register Rnh, Register Rm, Register Rmh);

 private:
  void divide32(Register res, Register num, Register den, bool want_mod);
 public:
  // <Rd+1:Rd> = <Rn+1:Rn> / <Rm+1:Rm>
  // <Rd+1:Rd> = <Rn+1:Rn> % <Rm+1:Rm>
  // <Rd> = <Rn> / <Rm>
  // <Rd> = <Rn> % <Rm>
  void divide(Register Rd, Register Rn, Register Rm, int width, bool want_remainder);

  void extract_bits(Register dest, Register source, int lsb, int width);

  // These functions require that the src/dst register is an even register
  // and will emit LDREXD/STREXD if there are multiple cores and the processor
  // supports it. If there's only one core then LDRD/STRD will be emitted instead.
  // If the processor has multiple cores and doesn't support LDREXD/STREXD then
  // LDRD/STRD will be emitted and a warning message printed.
  void atomic_ldrd(Register Rt, Register RtII, Register Rbase);
  void atomic_strd(Register Rt, Register RtII, Register Rbase,
                   Register temp, Register tempII);
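
  // For reference, a sketch of the usual exclusive-access idiom (assuming the
  // LDREXD/STREXD path; not necessarily the exact sequence emitted):
  //   retry: ldrexd temp, tempII, [Rbase]    // gain exclusive access
  //          strexd temp, Rt, RtII, [Rbase]  // attempt the 64-bit store
  //          cmp    temp, #0
  //          bne    retry                    // lost exclusivity, try again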

 private:
  // Generic fallback ldrd generator; may need to use a temporary register
  // when register collisions are found.
  //
  // Since double_ld_failed_dispatch can introduce address manipulation instructions,
  // it returns the offset of the first load/store instruction that will be used
  // while constructing the implicit null check table.
  int double_ld_failed_dispatch(Register Rt, Register Rt2, const Address& adr,
                            void (Assembler::* mul)(unsigned, const Address&, Condition),
                            void (Assembler::* sgl)(Register, const Address&, Condition),
                            Register Rtmp, Condition cond);
  // ldrd/strd generator. Can handle all strd cases and those ldrd where there
  // are no register collisions.
  void double_ldst_failed_dispatch(Register Rt, Register Rt2, const Address& adr,
                            void (Assembler::* mul)(unsigned, const Address&, Condition),
                            void (Assembler::* sgl)(Register, const Address&, Condition),
                            Condition cond);
public:
  // Override ldrd/strd to work around the cases where Rt + 1 != Rt2, or any
  // other condition that prevents use of a single ldrd/strd instruction;
  // a pair of ldr/str instructions is used instead.
  //
  // Since the ldrd/strd macro can introduce address manipulation instructions,
  // it returns the offset of the first load/store instruction that will be used
  // while constructing the implicit null check table.
  using Assembler::ldrd;
  int ldrd(Register Rt, Register Rt2, const Address& adr, Register Rtmp = rscratch1, Condition cond = C_DFLT);
  using Assembler::strd;
  int strd(Register Rt, Register Rt2, const Address& adr, Condition cond = C_DFLT);
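
  // Illustrative use (not from the original source): a destination pair that is
  // not consecutive, e.g.
  //   ldrd(r0, r2, Address(r5));
  // cannot use a single ldrd instruction and is emitted as two ldr instructions;
  // the returned offset marks the first real memory access for the implicit
  // null check table.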

private:
  void bfc_impl(Register rd, int lsb, int width, Condition cond);
public:
  void bfc(Register Rd, int lsb, int width, Condition cond = C_DFLT) {
    if (VM_Version::features() & (FT_ARMV6T2 | FT_ARMV7))
      Assembler::bfc(Rd, lsb, width, cond);
    else
      bfc_impl(Rd, lsb, width, cond);
  }
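
  // For reference (our reading): bfc clears <width> bits of Rd starting at bit
  // <lsb>. On cores without the ARMv6T2 bit-field instructions, bfc_impl has to
  // synthesize the same effect, e.g. (hypothetically, when the mask is encodable)
  //   bic Rd, Rd, #(((1 << width) - 1) << lsb)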

  void align_stack() {
    if (StackAlignmentInBytes > 4)
      bic(sp, sp, StackAlignmentInBytes-1);
  }

#ifdef ASSERT
  void verify_stack_alignment();
#endif

  // Debug helpers
  void save_machine_state();
  void restore_machine_state();

  static uint32_t bytecodes_until_print;
  static uint32_t bytecodes_executed;
  static int enable_debug;
  static int enable_method_debug;
  static int enable_debugging_static;


  void bytecode_seen(Register bc_reg, Register scratch);
  static void print_unseen_bytecodes();
  void reg_printf_internal(bool important, const char *fmt, Register a = r0, Register b = r0, Register c = r0);
  void reg_printf_important(const char *fmt, Register a = r0, Register b = r0, Register c = r0);
  void reg_printf(const char *fmt, Register a = r0, Register b = r0, Register c = r0);
  void print_method_entry(Register rmethod, bool native);
  void print_method_exit(bool normal = true);
  void get_bytecode(Register bc, Register dst);
  static void print_cpool(InstanceKlass *klass);

  void create_breakpoint();

#ifdef COMPILER2
  static bool _reachable_from_cache(address target);
  bool reachable_from_cache(address target);
  static bool _cache_fully_reachable();
  bool cache_fully_reachable();

  void call(address target, RelocationHolder rspec, Condition cond = Assembler::AL);

  void call(address target,
            relocInfo::relocType rtype = relocInfo::runtime_call_type,
            Condition cond = Assembler::AL) {
    call(target, Relocation::spec_simple(rtype), cond);
  }

  void jump(address target,
            relocInfo::relocType rtype = relocInfo::runtime_call_type,
            Register scratch = noreg,
            Condition cond = Assembler::AL);

  void jump(address dest, relocInfo::relocType rtype = relocInfo::runtime_call_type,
            Condition cond = Assembler::AL) {
    jump(dest, rtype, rscratch2, cond);
  }

  void mov_address(Register rd, address addr, RelocationHolder const& rspec) {
    assert(rspec.type() != relocInfo::runtime_call_type, "do not use mov_address for runtime calls");
    assert(rspec.type() != relocInfo::static_call_type, "do not use mov_address for relocatable calls");
    if (rspec.type() == relocInfo::none) {
      // absolute address, relocation not needed
      mov(rd, (uint32_t)addr);
      return;
    }
    if (VM_Version::features() & FT_ARMV6T2) {
      relocate(rspec);
      int c = (int)addr;
      movw_i(rd, c & 0xffff);
      if ((unsigned int)c >> 16) {
        movt_i(rd, (unsigned int)c >> 16);
      }
      return;
    }
    // No movw/movt available: load the address from an inline literal instead.
    Label skip_literal;
    Label literal;
    ldr(rd, literal);
    b(skip_literal);
    bind(literal);
    emit_address(addr);
    bind(skip_literal);
  }

  void arm_stack_overflow_check(int frame_size_in_bytes, Register tmp);
  void arm_stack_overflow_check(Register Rsize, Register tmp);

  void mov_relative_address(Register rd, address addr, Condition cond = Assembler::AL) {
    int offset = addr - pc() - 8;
    assert((offset & 3) == 0, "bad alignment");
    if (offset >= 0) {
      assert(is_valid_for_imm12(offset), "addr too far");
      add(rd, r15_pc, offset, cond);
    } else {
      assert(is_valid_for_imm12(-offset), "addr too far");
      sub(rd, r15_pc, -offset, cond);
    }
  }
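
  // Note on the "- 8" above: reading r15/pc in ARM state yields the address of
  // the current instruction plus 8, so the add/sub compensates to leave the
  // intended absolute address in rd.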

  void floating_cmp(Register dst);

  void fast_lock(Register Roop, Register Rbox, Register Rmark, Register Rscratch, Register Rscratch2);
  void fast_unlock(Register Roop, Register Rbox, Register Rscratch, Register Rscratch2);
#endif
};


#ifdef ASSERT
inline bool AbstractAssembler::pd_check_instruction_mark() { return false; }
#endif

/**
 * class SkipIfEqual:
 *
 * Instantiating this class will result in assembly code being output that will
 * jump around any code emitted between the creation of the instance and its
 * automatic destruction at the end of a scope block, depending on the value of
 * the flag passed to the constructor, which will be checked at run-time.
 */
class SkipIfEqual {
 private:
  MacroAssembler* _masm;
  Label _label;

 public:
  SkipIfEqual(MacroAssembler*, const bool* flag_addr, bool value);
  ~SkipIfEqual();
};

struct tableswitch {
  Register _reg;
  int _insn_index;
  jint _first_key;
  jint _last_key;
  Label _after;
  Label _branches;
};

#endif // CPU_AARCH32_VM_MACROASSEMBLER_AARCH32_HPP