--- old/src/cpu/aarch32/vm/assembler_aarch32.cpp 2016-08-26 13:07:32.000000000 +0300 +++ new/src/cpu/aarch32/vm/assembler_aarch32.cpp 2016-08-26 13:07:32.000000000 +0300 @@ -1375,7 +1375,7 @@ if (rtype == relocInfo::none) __ mov(r, target()); else - __ movptr(r, (u_int32_t)target()); + __ movptr(r, (uint32_t)target()); break; } default: @@ -1693,10 +1693,10 @@ } //This should really be in the macroassembler -void Assembler::mov_immediate32(Register dst, u_int32_t imm32, Condition cond, bool s) +void Assembler::mov_immediate32(Register dst, uint32_t imm32, Condition cond, bool s) { - // Need to move a full 32 bit immediate, for example if we're loading an address that - // might change later and therefore need to be updated. + // Need to move a full 32 bit immediate, for example if we're loading an address that + // might change later and therefore need to be updated. if (VM_Version::features() & (FT_ARMV7 | FT_ARMV6T2)) { //Use a movw and a movt Assembler::movw_i(dst, (unsigned)(imm32 & 0xffff), cond); @@ -1735,7 +1735,7 @@ //Try plan B - a mov first - need to have destination that is not an arg assert(Rd != Rn, "Can't use imm and can't do a mov. I'm in a jam."); - mov_immediate(Rd, (u_int32_t)uabs(imm), cond, s); + mov_immediate(Rd, (uint32_t)uabs(imm), cond, s); //Now do the non immediate version - copied from the immediate encodings { starti; --- old/src/cpu/aarch32/vm/assembler_aarch32.hpp 2016-08-26 13:07:33.000000000 +0300 +++ new/src/cpu/aarch32/vm/assembler_aarch32.hpp 2016-08-26 13:07:33.000000000 +0300 @@ -874,10 +874,10 @@ protected: // Mov data to destination register in the shortest number of instructions // possible. - void mov_immediate(Register dst, u_int32_t imm32, Condition cond, bool s); + void mov_immediate(Register dst, uint32_t imm32, Condition cond, bool s); // Mov data to destination register but always emit enough instructions that would // permit any 32-bit constant to be loaded. (Allow for rewriting later). - void mov_immediate32(Register dst, u_int32_t imm32, Condition cond, bool s); + void mov_immediate32(Register dst, uint32_t imm32, Condition cond, bool s); void add_sub_imm(int decode, Register Rd, Register Rn, int imm, Condition cond, bool s); @@ -1210,7 +1210,7 @@ NAME(Rt, Address(r15_pc, offset), cond); \ } else if(isload){ /* Plan B */ \ /* TODO check we don't have to relocate this*/ \ - mov_immediate(Rt, (u_int32_t)dest, cond, false); \ + mov_immediate(Rt, (uint32_t)dest, cond, false); \ NAME(Rt, Address(Rt, 0), cond); \ } else { /* There is no plan C */ \ ShouldNotReachHere(); \ @@ -1596,7 +1596,7 @@ sync_instr(0b0100, option); } void dmb(enum barrier option) { - sync_instr(0b0100, option); + sync_instr(0b0101, option); } void bkpt(); void isb() { --- old/src/cpu/aarch32/vm/globals_aarch32.hpp 2016-08-26 13:07:34.000000000 +0300 +++ new/src/cpu/aarch32/vm/globals_aarch32.hpp 2016-08-26 13:07:34.000000000 +0300 @@ -67,8 +67,6 @@ define_pd_global(bool, RewriteBytecodes, true); define_pd_global(bool, RewriteFrequentPairs, true); -define_pd_global(bool, UseMembar, true); - define_pd_global(bool, PreserveFramePointer, false); // GC Ergo Flags @@ -81,9 +79,13 @@ define_pd_global(intx, InlineSmallCode, 1000); //#endif +// Define it as a constant instead of providing it as an option; inlining the +// constant significantly improves performance. The option is disabled for AARCH32 in globals.hpp too.
+#define UseMembar true + #define ARCH_FLAGS(develop, product, diagnostic, experimental, notproduct) \ - \ - product(bool, NearCpool, true, \ + \ + product(bool, NearCpool, true, \ "constant pool is close to instructions") \ \ product(bool, UseBarriersForVolatile, false, \ --- old/src/cpu/aarch32/vm/icBuffer_aarch32.cpp 2016-08-26 13:07:34.000000000 +0300 +++ new/src/cpu/aarch32/vm/icBuffer_aarch32.cpp 2016-08-26 13:07:34.000000000 +0300 @@ -35,7 +35,7 @@ #include "oops/oop.inline.hpp" int InlineCacheBuffer::ic_stub_code_size() { - return 5 * NativeInstruction::arm_insn_sz; + return (MacroAssembler::far_branches() ? 5 : 3) * NativeInstruction::arm_insn_sz; } #define __ masm-> --- old/src/cpu/aarch32/vm/macroAssembler_aarch32.cpp 2016-08-26 13:07:35.000000000 +0300 +++ new/src/cpu/aarch32/vm/macroAssembler_aarch32.cpp 2016-08-26 13:07:35.000000000 +0300 @@ -159,7 +159,7 @@ if(0b000 == opc2) { // movw, movt (only on newer ARMs) assert(nativeInstruction_at(&insn_buf[1])->is_movt(), "wrong insns in patch"); - u_int32_t addr; + uint32_t addr; addr = Instruction_aarch32::extract(insn_buf[1], 19, 16) << 28; addr |= Instruction_aarch32::extract(insn_buf[1], 11, 0) << 16; addr |= Instruction_aarch32::extract(insn_buf[0], 19, 16) << 12; @@ -170,7 +170,7 @@ assert(nativeInstruction_at(&insn_buf[1])->is_orr(), "wrong insns in patch"); assert(nativeInstruction_at(&insn_buf[2])->is_orr(), "wrong insns in patch"); assert(nativeInstruction_at(&insn_buf[3])->is_orr(), "wrong insns in patch"); - u_int32_t addr; + uint32_t addr; addr = Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[0], 11, 0)); addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[1], 11, 0)); addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[2], 11, 0)); @@ -209,12 +209,12 @@ } //Correct offset for PC offset -= 8; - return address(((u_int32_t)insn_addr + offset)); + return address(((uint32_t)insn_addr + offset)); } void MacroAssembler::serialize_memory(Register thread, Register tmp) { - dsb(Assembler::SY); + dmb(Assembler::ISH); } @@ -304,7 +304,7 @@ "destination of far call not found in code cache"); // TODO performance issue: if intented to patch later, // generate mov rX, imm; bl rX far call (to reserve space) - if (entry.rspec().type() != relocInfo::none || far_branches()) { + if (far_branches()) { lea(tmp, entry); if (cbuf) cbuf->set_insts_mark(); bl(tmp); @@ -318,9 +318,7 @@ assert(CodeCache::find_blob(entry.target()) != NULL, "destination of far call not found in code cache"); assert(!external_word_Relocation::is_reloc_index((intptr_t)entry.target()), "can't far jump to reloc index)"); - // TODO performance issue: if intented to patch later, - // generate mov rX, imm; bl rX far call (to reserve space) - if (entry.rspec().type() != relocInfo::none || far_branches()) { + if (far_branches()) { lea(tmp, entry); if (cbuf) cbuf->set_insts_mark(); b(tmp); @@ -591,7 +589,10 @@ ldr(rscratch2, Address(java_thread, in_bytes(Thread::pending_exception_offset()))); Label ok; cbz(rscratch2, ok); + lea(rscratch2, RuntimeAddress(StubRoutines::forward_exception_entry())); + // forward_exception uses LR to choose exception handler but LR is trashed by previous code + // since we used to get here from interpreted code BL is acceptable way to acquire correct LR (see StubGenerator::generate_forward_exception) bl(rscratch2); bind(ok); } @@ -615,23 +616,23 @@ || entry.rspec().type() == relocInfo::static_call_type || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type"); - //FIXME This 
block - bool compile_in_scratch_emit_size = false; - #ifdef COMPILER2 - compile_in_scratch_emit_size = Compile::current()->in_scratch_emit_size(); - #endif - - if (cbuf) cbuf->set_insts_mark(); - relocate(entry.rspec()); - - // Have make trampline such way: destination address should be raw 4 byte value, - // so it's patching could be done atomically. - add(lr, r15_pc, NativeCall::instruction_size - 2 * NativeInstruction::arm_insn_sz); - ldr(r15_pc, Address(r15_pc, 4)); // Address does correction for offset from pc base - emit_int32((uintptr_t) entry.target()); - // possibly pad the call to the NativeCall size to make patching happy - for (int i = NativeCall::instruction_size; i > 3 * NativeInstruction::arm_insn_sz; i -= NativeInstruction::arm_insn_sz) - nop(); + if (cbuf) { + cbuf->set_insts_mark(); + } + + if (far_branches()) { + // Have make trampline such way: destination address should be raw 4 byte value, + // so it's patching could be done atomically. + relocate(entry.rspec()); + add(lr, r15_pc, NativeCall::instruction_size - 2 * NativeInstruction::arm_insn_sz); + ldr(r15_pc, Address(r15_pc, 4)); + emit_int32((uintptr_t) entry.target()); + // possibly pad the call to the NativeCall size to make patching happy + for (int i = NativeCall::instruction_size; i > 3 * NativeInstruction::arm_insn_sz; i -= NativeInstruction::arm_insn_sz) + nop(); + } else { + bl(entry); + } } void MacroAssembler::ic_call(address entry) { @@ -1741,23 +1742,7 @@ void MacroAssembler::store_check(Register obj) { // Does a store check for the oop in register obj. The content of // register obj is destroyed afterwards. - store_check_part_1(obj); - store_check_part_2(obj); -} -void MacroAssembler::store_check(Register obj, Address dst) { - store_check(obj); -} - - -// split the store check operation so that other instructions can be scheduled inbetween -void MacroAssembler::store_check_part_1(Register obj) { - BarrierSet* bs = Universe::heap()->barrier_set(); - assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind"); - lsr(obj, obj, CardTableModRefBS::card_shift); -} - -void MacroAssembler::store_check_part_2(Register obj) { BarrierSet* bs = Universe::heap()->barrier_set(); assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind"); CardTableModRefBS* ct = (CardTableModRefBS*)bs; @@ -1772,8 +1757,21 @@ // don't bother to check, but it could save an instruction. intptr_t disp = (intptr_t) ct->byte_map_base; mov(rscratch1, disp); - mov(rscratch2, 0); - strb(rscratch2, Address(obj, rscratch1)); + assert((disp & 0xff) == 0, "fix store char 0 below"); + strb(rscratch1, Address(rscratch1, obj, lsr((int) CardTableModRefBS::card_shift))); +} + +void MacroAssembler::store_check(Register obj, Address dst) { + store_check(obj); +} + +// split the store check operation so that other instructions can be scheduled inbetween +void MacroAssembler::store_check_part_1(Register obj) { + ShouldNotCallThis(); +} + +void MacroAssembler::store_check_part_2(Register obj) { + ShouldNotCallThis(); } void MacroAssembler::load_klass(Register dst, Register src) { --- old/src/cpu/aarch32/vm/macroAssembler_aarch32.hpp 2016-08-26 13:07:36.000000000 +0300 +++ new/src/cpu/aarch32/vm/macroAssembler_aarch32.hpp 2016-08-26 13:07:35.000000000 +0300 @@ -159,19 +159,19 @@ // TODO: Do Address end up as address and then passing through this method, after // being marked for relocation elsewhere? If not (as I suspect) then this can // be relaxed to mov_immediate to potentially produce shorter code sequences. 
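For context on the TODO above, a minimal standalone sketch (assumed helper name, not HotSpot code): mov_immediate can use a single mov when the constant fits ARM's rotated-immediate encoding, whereas mov_immediate32 always emits the full movw/movt pair so the site stays patchable.
#include <stdint.h>
// True iff v is an 8-bit value rotated right by an even amount, i.e. encodable
// as a single ARM data-processing immediate and therefore loadable with one mov.
static bool fits_arm_rotated_imm(uint32_t v) {
  if ((v & ~0xffu) == 0) return true;                   // rotation of 0
  for (unsigned rot = 2; rot < 32; rot += 2) {
    uint32_t undone = (v << rot) | (v >> (32 - rot));   // undo a rotate-right by rot
    if ((undone & ~0xffu) == 0) return true;
  }
  return false;
}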
- mov_immediate32(dst, (u_int32_t)addr, cond, false); + mov_immediate32(dst, (uint32_t)addr, cond, false); } inline void mov(Register dst, long l, Condition cond = C_DFLT) { - mov(dst, (u_int32_t)l, cond); + mov(dst, (uint32_t)l, cond); } inline void mov(Register dst, unsigned long l, Condition cond = C_DFLT) { - mov(dst, (u_int32_t)l, cond); + mov(dst, (uint32_t)l, cond); } inline void mov(Register dst, int i, Condition cond = C_DFLT) { - mov(dst, (u_int32_t)i, cond); + mov(dst, (uint32_t)i, cond); } - inline void mov(Register dst, u_int32_t i, Condition cond = C_DFLT) { + inline void mov(Register dst, uint32_t i, Condition cond = C_DFLT) { mov_immediate(dst, i, cond, false); } @@ -590,9 +590,14 @@ void bang_stack_with_offset(int offset) { // stack grows down, caller passes positive offset assert(offset > 0, "must bang with negative offset"); - mov(rscratch2, -offset); - // bang with random number from r0 - str(r0, Address(sp, rscratch2)); + // bang with random value from r0 + if (operand_valid_for_add_sub_immediate(offset)) { + sub(rscratch2, sp, offset); + strb(r0, Address(rscratch2)); + } else { + mov(rscratch2, offset); + strb(r0, Address(sp, rscratch2, Assembler::lsl(), Address::SUB)); + } } // Writes to stack successive pages until offset reached to check for @@ -653,7 +658,11 @@ static int far_branch_size() { // TODO performance issue: always generate real far jumps - return 3 * 4; // movw, movt, br + if (far_branches()) { + return 3 * 4; // movw, movt, br + } else { + return 4; + } } // Emit the CompiledIC call idiom --- old/src/cpu/aarch32/vm/nativeInst_aarch32.cpp 2016-08-26 13:07:36.000000000 +0300 +++ new/src/cpu/aarch32/vm/nativeInst_aarch32.cpp 2016-08-26 13:07:36.000000000 +0300 @@ -92,10 +92,8 @@ // and see valid destination value) if (NativeImmCall::is_at(addr())) { - assert(false, "could be patched mt_safe way, but should not be requested to. " - "Known mt_safe requests have arbitrary destination offset. 
" - "Use trampoline_call for this."); - ShouldNotCallThis(); + NativeImmCall::from(addr())->set_destination(dest); + ICache::invalidate_word(addr()); } else if (NativeTrampolineCall::is_at(addr())) { NativeTrampolineCall::from(addr())->set_destination_mt_safe(dest); } else { --- old/src/cpu/aarch32/vm/relocInfo_aarch32.cpp 2016-08-26 13:07:37.000000000 +0300 +++ new/src/cpu/aarch32/vm/relocInfo_aarch32.cpp 2016-08-26 13:07:37.000000000 +0300 @@ -32,15 +32,16 @@ #include "runtime/safepoint.hpp" void Relocation::pd_set_data_value(address x, intptr_t o, bool verify_only) { - if (verify_only) - return; - - int bytes = 0; + int bytes; NativeInstruction *ni = NativeInstruction::from(addr()); if (ni->is_mov_const_reg()) { NativeMovConstReg *nm = NativeMovConstReg::from(addr()); - nm->set_data((uintptr_t) x); + if (verify_only) { + assert(nm->data() == (intptr_t) x, "instructions must match"); + return; + } + nm->set_data((intptr_t) x); bytes = nm->next_instruction_address() - nm->addr(); } else { ShouldNotReachHere(); @@ -59,24 +60,34 @@ NativeInstruction *ni = NativeInstruction::from(addr()); - if (ni->is_call()) { + // Checking from shortest encoding size to longets, + // to avoid access beyond CodeCache boundary + if (NativeImmCall::is_at(addr())) { + return NativeImmCall::from(addr())->destination() + adj; + } else if (NativeImmJump::is_at(addr())) { + return NativeImmJump::from(addr())->destination() + adj; + } else if (NativeCall::is_at(addr())) { return NativeCall::from(addr())->destination(); - } else if (ni->is_jump()) { + } else if (NativeJump::is_at(addr())) { return NativeJump::from(addr())->jump_destination(); } ShouldNotReachHere(); - - return NULL; } void Relocation::pd_set_call_destination(address x) { assert(addr() != x, "call instruction in an infinite loop"); // FIXME what's wrong to _generate_ loop? 
NativeInstruction *ni = NativeInstruction::from(addr()); - if (ni->is_call()) { + // Checking from shortest encoding size to longets, + // to avoid access beyond CodeCache boundary + if (NativeImmCall::is_at(addr())) { + NativeImmCall::from(addr())->set_destination(x); + } else if (NativeImmJump::is_at(addr())) { + NativeImmJump::from(addr())->set_destination(x); + } else if (NativeCall::is_at(addr())) { NativeCall::from(addr())->set_destination(x); - } else if (ni->is_jump()) { + } else if (NativeJump::is_at(addr())) { NativeJump::from(addr())->set_jump_destination(x); } else { ShouldNotReachHere(); --- old/src/cpu/aarch32/vm/sharedRuntime_aarch32.cpp 2016-08-26 13:07:37.000000000 +0300 +++ new/src/cpu/aarch32/vm/sharedRuntime_aarch32.cpp 2016-08-26 13:07:37.000000000 +0300 @@ -1046,11 +1046,13 @@ } } -static void save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { +static int save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { RegSet x; + int saved_slots = 0; for ( int i = first_arg ; i < arg_count ; i++ ) { if (args[i].first()->is_Register()) { x = x + args[i].first()->as_Register(); + ++saved_slots; } else if (args[i].first()->is_FloatRegister()) { FloatRegister fr = args[i].first()->as_FloatRegister(); @@ -1058,13 +1060,16 @@ assert(args[i].is_single_phys_reg(), "doubles should be 2 consequents float regs"); __ decrement(sp, 2 * wordSize); __ vstr_f64(fr, Address(sp)); + saved_slots += 2; } else { __ decrement(sp, wordSize); __ vstr_f32(fr, Address(sp)); + ++saved_slots; } } } __ push(x, sp); + return saved_slots; } static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { @@ -1765,7 +1770,7 @@ { SkipIfEqual skip(masm, &DTraceMethodProbes, false); // protect the args we've loaded - save_args(masm, total_c_args, c_arg, out_regs); + (void) save_args(masm, total_c_args, c_arg, out_regs); __ mov_metadata(c_rarg1, method()); __ call_VM_leaf( CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry), @@ -1777,7 +1782,7 @@ // RedefineClasses() tracing support for obsolete method entry if (RC_TRACE_IN_RANGE(0x00001000, 0x00002000)) { // protect the args we've loaded - save_args(masm, total_c_args, c_arg, out_regs); + (void) save_args(masm, total_c_args, c_arg, out_regs); __ mov_metadata(c_rarg1, method()); __ call_VM_leaf( CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry), @@ -1794,11 +1799,44 @@ if (method->is_synchronized()) { assert(!is_critical_native, "unhandled"); - // TODO Fast path disabled as requires at least 4 registers, which already contain arguments prepared for call + + // registers below are not used to pass parameters + // and they are caller save in C1 + // => safe to use as temporary here +#ifdef COMPILER2 + stop("fix temporary register set below"); +#endif + const Register swap_reg = r5; + const Register obj_reg = r6; // Will contain the oop + const Register lock_reg = r7; // Address of compiler lock object (BasicLock) + + const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes(); // Get the handle (the 2nd argument) __ mov(oop_handle_reg, c_rarg1); - __ b(slow_path_lock); + + // Get address of the box + + __ lea(lock_reg, Address(sp, lock_slot_offset * VMRegImpl::stack_slot_size)); + + // Load the oop from the handle + __ ldr(obj_reg, Address(oop_handle_reg, 0)); + + if (UseBiasedLocking) { + __ biased_locking_enter(lock_reg, obj_reg, swap_reg, rscratch2, false, lock_done, &slow_path_lock); + } + + // Load (object->mark() | 1) into swap_reg 
%r0 + __ ldr(swap_reg, Address(obj_reg, 0)); + __ orr(swap_reg, swap_reg, 1); + + // Save (object->mark() | 1) into BasicLock's displaced header + __ str(swap_reg, Address(lock_reg, mark_word_offset)); + + // src -> dest iff dest == r0 else r0 <- dest + { Label here; + __ cmpxchgptr(swap_reg, lock_reg, obj_reg, rscratch1, lock_done, &slow_path_lock); + } // Slow path will re-enter here __ bind(lock_done); @@ -1856,7 +1894,7 @@ if(os::is_MP()) { if (UseMembar) { // Force this write out before the read below - __ dmb(Assembler::SY); + __ membar(Assembler::AnyAny); } else { // Write serialization page so VM thread can do a pseudo remote membar. // We use the current thread pointer to calculate a thread specific @@ -1929,8 +1967,29 @@ Label unlock_done; Label slow_path_unlock; if (method->is_synchronized()) { - // TODO fast path disabled as requires at least 4 registers, but r0,r1 contains result - __ b(slow_path_unlock); + const Register obj_reg = r2; // Will contain the oop + const Register lock_reg = rscratch1; // Address of compiler lock object (BasicLock) + const Register old_hdr = r3; // value of old header at unlock time + + // Get locked oop from the handle we passed to jni + __ ldr(obj_reg, Address(oop_handle_reg, 0)); + + if (UseBiasedLocking) { + __ biased_locking_exit(obj_reg, old_hdr, unlock_done); + } + + // Simple recursive lock? + // get address of the stack lock + __ lea(lock_reg, Address(sp, lock_slot_offset * VMRegImpl::stack_slot_size)); + + // get old displaced header + __ ldr(old_hdr, Address(lock_reg, 0)); + __ cbz(old_hdr, unlock_done); + + // Atomic swap old header if oop still contains the stack lock + Label succeed; + __ cmpxchgptr(lock_reg, old_hdr, obj_reg, rscratch1, succeed, &slow_path_unlock); + __ bind(succeed); // slow path re-enters here __ bind(unlock_done); @@ -1997,10 +2056,10 @@ // args are (oop obj, BasicLock* lock, JavaThread* thread) // protect the args we've loaded - save_args(masm, total_c_args, c_arg, out_regs); + const int extra_words = save_args(masm, total_c_args, c_arg, out_regs); __ ldr(c_rarg0, Address(oop_handle_reg)); - __ lea(c_rarg1, Address(sp, lock_slot_offset * VMRegImpl::stack_slot_size)); + __ lea(c_rarg1, Address(sp, (extra_words + lock_slot_offset) * VMRegImpl::stack_slot_size)); __ mov(c_rarg2, rthread); // Not a leaf but we have last_Java_frame setup as we want --- old/src/cpu/aarch32/vm/stubGenerator_aarch32.cpp 2016-08-26 13:07:38.000000000 +0300 +++ new/src/cpu/aarch32/vm/stubGenerator_aarch32.cpp 2016-08-26 13:07:38.000000000 +0300 @@ -1216,7 +1216,7 @@ // if they expect all registers to be preserved. // n.b. 
aarch32 asserts that frame::arg_reg_save_area_bytes == 0 enum layout { - rfp_off = frame::arg_reg_save_area_bytes/BytesPerInt, + rfp_off = 0, return_off, framesize // inclusive of return address }; @@ -1237,9 +1237,6 @@ __ enter(); // Save FP and LR before call - // lr and fp are already in place - assert(frame::arg_reg_save_area_bytes == 0, "please modify this code"); - // __ sub(sp, rfp, frame::arg_reg_save_area_bytes + wordSize); // prolog assert(is_even(framesize), "sp not 8-byte aligned"); int frame_complete = __ pc() - start; @@ -1288,7 +1285,7 @@ RuntimeStub::new_runtime_stub(name, &code, frame_complete, - (framesize >> (LogBytesPerWord - LogBytesPerInt)), + framesize, oop_maps, false); return stub->entry_point(); } --- old/src/cpu/aarch32/vm/templateInterpreter_aarch32.cpp 2016-08-26 13:07:39.000000000 +0300 +++ new/src/cpu/aarch32/vm/templateInterpreter_aarch32.cpp 2016-08-26 13:07:38.000000000 +0300 @@ -142,6 +142,12 @@ // expression stack must be empty before entering the VM if an // exception happened __ empty_expression_stack(); + // FIXME shouldn't this be in the rest of the generate_* methods? + // rdispatch is assumed to cache the dispatch table. This code can be called from + // a signal handler, so it cannot assume the code that raised the exception preserved + // the register; restore it here + __ get_dispatch(); + // FIXME shouldn't get_method be here? // setup parameters __ lea(c_rarg1, Address((address)name)); if (pass_oop) { @@ -508,7 +514,9 @@ __ b(after_frame_check, Assembler::HI); // Remove the incoming args, peeling the machine SP back to where it - // was in the caller. + // was in the caller. This is not strictly necessary, but unless we + // do so the stack frame may have a garbage FP; this ensures a + // correct call stack that we can always unwind. __ mov(sp, r4); // Note: the restored frame is not necessarily interpreted. @@ -1039,7 +1047,7 @@ if (os::is_MP()) { if (UseMembar) { // Force this write out before the read below - __ dsb(Assembler::SY); + __ membar(Assembler::AnyAny); } else { // Write serialization page so VM thread can do a pseudo remote membar.
// We use the current thread pointer to calculate a thread specific @@ -1992,17 +2000,16 @@ address TemplateInterpreterGenerator::generate_trace_code(TosState state) { address entry = __ pc(); - __ push(lr); __ push(state); - __ push(RegSet::range(r0, r12), sp); + // Save all registers on stack, so omit SP and PC + __ push(RegSet::range(r0, r12) + lr, sp); __ mov(c_rarg2, r0); // Pass itos __ mov(c_rarg3, r1); // Pass ltos/dtos high part __ call_VM(noreg, CAST_FROM_FN_PTR(address, SharedRuntime::trace_bytecode), c_rarg1, c_rarg2, c_rarg3); - __ pop(RegSet::range(r0, r12), sp); + __ pop(RegSet::range(r0, r12) + lr, sp); __ pop(state); - __ pop(lr); __ b(lr); // return from result handler return entry; --- old/src/share/vm/c1/c1_LIR.hpp 2016-08-26 13:07:39.000000000 +0300 +++ new/src/share/vm/c1/c1_LIR.hpp 2016-08-26 13:07:39.000000000 +0300 @@ -619,7 +619,7 @@ LIR_OprDesc::single_size); } #if defined(C1_LIR_MD_HPP) # include C1_LIR_MD_HPP -#elif defined(SPARC) +#elif defined(SPARC) || defined(AARCH32) static LIR_Opr double_fpu(int reg1, int reg2) { return (LIR_Opr)(intptr_t)((reg1 << LIR_OprDesc::reg1_shift) | (reg2 << LIR_OprDesc::reg2_shift) | LIR_OprDesc::double_type | --- old/src/share/vm/c1/c1_LinearScan.cpp 2016-08-26 13:07:40.000000000 +0300 +++ new/src/share/vm/c1/c1_LinearScan.cpp 2016-08-26 13:07:40.000000000 +0300 @@ -2141,7 +2141,7 @@ assert(interval->assigned_regHi() >= pd_first_fpu_reg && interval->assigned_regHi() <= pd_last_fpu_reg, "no fpu register"); assert(assigned_reg % 2 == 0 && assigned_reg + 1 == interval->assigned_regHi(), "must be sequential and even"); LIR_Opr result = LIR_OprFact::double_fpu(interval->assigned_regHi() - pd_first_fpu_reg, assigned_reg - pd_first_fpu_reg); -#elif defined(ARM32) +#elif defined(ARM32) || defined(AARCH32) assert(assigned_reg >= pd_first_fpu_reg && assigned_reg <= pd_last_fpu_reg, "no fpu register"); assert(interval->assigned_regHi() >= pd_first_fpu_reg && interval->assigned_regHi() <= pd_last_fpu_reg, "no fpu register"); assert(assigned_reg % 2 == 0 && assigned_reg + 1 == interval->assigned_regHi(), "must be sequential and even"); @@ -2730,9 +2730,9 @@ #ifdef SPARC assert(opr->fpu_regnrLo() == opr->fpu_regnrHi() + 1, "assumed in calculation (only fpu_regnrHi is used)"); #endif -#ifdef ARM32 +#if defined(ARM32) || defined(AARCH32) assert(opr->fpu_regnrHi() == opr->fpu_regnrLo() + 1, "assumed in calculation (only fpu_regnrLo is used)"); -#endif +#endif // ARM32 || AARCH32 #ifdef PPC assert(opr->fpu_regnrLo() == opr->fpu_regnrHi(), "assumed in calculation (only fpu_regnrHi is used)"); #endif --- old/src/share/vm/c1/c1_Runtime1.cpp 2016-08-26 13:07:41.000000000 +0300 +++ new/src/share/vm/c1/c1_Runtime1.cpp 2016-08-26 13:07:41.000000000 +0300 @@ -1049,7 +1049,7 @@ ShouldNotReachHere(); } -#if defined(SPARC) || defined(PPC) +#if defined(SPARC) || defined(PPC) || defined(AARCH32) if (load_klass_or_mirror_patch_id || stub_id == Runtime1::load_appendix_patching_id) { // Update the location in the nmethod with the proper @@ -1086,7 +1086,7 @@ if (do_patch) { // replace instructions // first replace the tail, then the call -#ifdef ARM +#if defined(ARM) && !defined(AARCH32) if((load_klass_or_mirror_patch_id || stub_id == Runtime1::load_appendix_patching_id) && nativeMovConstReg_at(copy_buff)->is_pc_relative()) { @@ -1134,12 +1134,14 @@ nmethod* nm = CodeCache::find_nmethod(instr_pc); assert(nm != NULL, "invalid nmethod_pc"); +#if !defined(AARCH32) // The old patch site is now a move instruction so update // the reloc info so that it will get 
updated during // future GCs. RelocIterator iter(nm, (address)instr_pc, (address)(instr_pc + 1)); relocInfo::change_reloc_info_for_address(&iter, (address) instr_pc, relocInfo::none, rtype); +#endif #ifdef SPARC // Sparc takes two relocations for an metadata so update the second one. address instr_pc2 = instr_pc + NativeMovConstReg::add_offset; --- old/src/share/vm/runtime/globals.hpp 2016-08-26 13:07:41.000000000 +0300 +++ new/src/share/vm/runtime/globals.hpp 2016-08-26 13:07:41.000000000 +0300 @@ -536,8 +536,9 @@ /* UseMembar is theoretically a temp flag used for memory barrier \ * removal testing. It was supposed to be removed before FCS but has \ * been re-added (see 6401008) */ \ + NOT_AARCH32( \ product_pd(bool, UseMembar, \ - "(Unstable) Issues membars on thread state transitions") \ + "(Unstable) Issues membars on thread state transitions")) \ \ develop(bool, CleanChunkPoolAsync, falseInEmbedded, \ "Clean the chunk pool asynchronously") \ --- /dev/null 2016-08-26 13:07:43.000000000 +0300 +++ new/src/cpu/aarch32/vm/c1_CodeStubs_aarch32.cpp 2016-08-26 13:07:43.000000000 +0300 @@ -0,0 +1,533 @@ +/* + * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ +// This file is a derivative work resulting from (and including) modifications +// made by Azul Systems, Inc. The dates of such changes are 2013-2016. +// Copyright 2013-2016 Azul Systems, Inc. All Rights Reserved. +// +// Please contact Azul Systems, 385 Moffett Park Drive, Suite 115, Sunnyvale, +// CA 94089 USA or visit www.azul.com if you need additional information or +// have any questions. 
+ +#include "precompiled.hpp" +#include "c1/c1_CodeStubs.hpp" +#include "c1/c1_FrameMap.hpp" +#include "c1/c1_LIRAssembler.hpp" +#include "c1/c1_MacroAssembler.hpp" +#include "c1/c1_Runtime1.hpp" +#include "nativeInst_aarch32.hpp" +#include "runtime/sharedRuntime.hpp" +#include "vmreg_aarch32.inline.hpp" +#if INCLUDE_ALL_GCS +#include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp" +#endif + +#define __ ce->masm()-> + +#define should_not_reach_here() should_not_reach_here_line(__FILE__, __LINE__) + +void CounterOverflowStub::emit_code(LIR_Assembler* ce) { + __ bind(_entry); + ce->store_parameter(_method->as_register(), 1); + ce->store_parameter(_bci, 0); + __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::counter_overflow_id))); + ce->add_call_info_here(_info); + ce->verify_oop_map(_info); + __ b(_continuation); +} + +RangeCheckStub::RangeCheckStub(CodeEmitInfo* info, LIR_Opr index, + bool throw_index_out_of_bounds_exception) + : _throw_index_out_of_bounds_exception(throw_index_out_of_bounds_exception) + , _index(index) +{ + assert(info != NULL, "must have info"); + _info = new CodeEmitInfo(info); +} + +void RangeCheckStub::emit_code(LIR_Assembler* ce) { + __ bind(_entry); + if (_info->deoptimize_on_exception()) { + address a = Runtime1::entry_for(Runtime1::predicate_failed_trap_id); + __ far_call(RuntimeAddress(a)); + ce->add_call_info_here(_info); + ce->verify_oop_map(_info); + debug_only(__ should_not_reach_here()); + return; + } + + if (_index->is_cpu_register()) { + __ mov(rscratch1, _index->as_register()); + } else { + __ mov(rscratch1, _index->as_jint()); + } + Runtime1::StubID stub_id; + if (_throw_index_out_of_bounds_exception) { + stub_id = Runtime1::throw_index_exception_id; + } else { + stub_id = Runtime1::throw_range_check_failed_id; + } + __ far_call(RuntimeAddress(Runtime1::entry_for(stub_id)), NULL, rscratch2); + ce->add_call_info_here(_info); + ce->verify_oop_map(_info); + debug_only(__ should_not_reach_here()); +} + +PredicateFailedStub::PredicateFailedStub(CodeEmitInfo* info) { + _info = new CodeEmitInfo(info); +} + +void PredicateFailedStub::emit_code(LIR_Assembler* ce) { + __ bind(_entry); + address a = Runtime1::entry_for(Runtime1::predicate_failed_trap_id); + __ far_call(RuntimeAddress(a)); + ce->add_call_info_here(_info); + ce->verify_oop_map(_info); + debug_only(__ should_not_reach_here()); +} + +void DivByZeroStub::emit_code(LIR_Assembler* ce) { + if (_offset != -1) { + ce->compilation()->implicit_exception_table()->append(_offset, __ offset()); + } + __ bind(_entry); + __ far_call(Address(Runtime1::entry_for(Runtime1::throw_div0_exception_id), relocInfo::runtime_call_type)); + ce->add_call_info_here(_info); + ce->verify_oop_map(_info); +#ifdef ASSERT + __ should_not_reach_here(); +#endif +} + + + +// Implementation of NewInstanceStub + +NewInstanceStub::NewInstanceStub(LIR_Opr klass_reg, LIR_Opr result, ciInstanceKlass* klass, CodeEmitInfo* info, Runtime1::StubID stub_id) { + _result = result; + _klass = klass; + _klass_reg = klass_reg; + _info = new CodeEmitInfo(info); + assert(stub_id == Runtime1::new_instance_id || + stub_id == Runtime1::fast_new_instance_id || + stub_id == Runtime1::fast_new_instance_init_check_id, + "need new_instance id"); + _stub_id = stub_id; +} + + + +void NewInstanceStub::emit_code(LIR_Assembler* ce) { + assert(__ rsp_offset() == 0, "frame size should be fixed"); + __ bind(_entry); + __ mov(r3, _klass_reg->as_register()); + __ far_call(RuntimeAddress(Runtime1::entry_for(_stub_id))); + ce->add_call_info_here(_info); + 
ce->verify_oop_map(_info); + assert(_result->as_register() == r0, "result must in r0,"); + __ b(_continuation); +} + + +// Implementation of NewTypeArrayStub + +// Implementation of NewTypeArrayStub + +NewTypeArrayStub::NewTypeArrayStub(LIR_Opr klass_reg, LIR_Opr length, LIR_Opr result, CodeEmitInfo* info) { + _klass_reg = klass_reg; + _length = length; + _result = result; + _info = new CodeEmitInfo(info); +} + + +void NewTypeArrayStub::emit_code(LIR_Assembler* ce) { + assert(__ rsp_offset() == 0, "frame size should be fixed"); + __ bind(_entry); + assert(_length->as_register() == r6, "length must in r6,"); + assert(_klass_reg->as_register() == r3, "klass_reg must in r3"); + __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::new_type_array_id))); + ce->add_call_info_here(_info); + ce->verify_oop_map(_info); + assert(_result->as_register() == r0, "result must in r0"); + __ b(_continuation); +} + + +// Implementation of NewObjectArrayStub + +NewObjectArrayStub::NewObjectArrayStub(LIR_Opr klass_reg, LIR_Opr length, LIR_Opr result, CodeEmitInfo* info) { + _klass_reg = klass_reg; + _result = result; + _length = length; + _info = new CodeEmitInfo(info); +} + + +void NewObjectArrayStub::emit_code(LIR_Assembler* ce) { + assert(__ rsp_offset() == 0, "frame size should be fixed"); + __ bind(_entry); + assert(_length->as_register() == r6, "length must in r6"); + assert(_klass_reg->as_register() == r3, "klass_reg must in r3"); + __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::new_object_array_id))); + ce->add_call_info_here(_info); + ce->verify_oop_map(_info); + assert(_result->as_register() == r0, "result must in r0"); + __ b(_continuation); +} +// Implementation of MonitorAccessStubs + +MonitorEnterStub::MonitorEnterStub(LIR_Opr obj_reg, LIR_Opr lock_reg, CodeEmitInfo* info) +: MonitorAccessStub(obj_reg, lock_reg) +{ + _info = new CodeEmitInfo(info); +} + + +void MonitorEnterStub::emit_code(LIR_Assembler* ce) { + assert(__ rsp_offset() == 0, "frame size should be fixed"); + __ bind(_entry); + ce->store_parameter(_obj_reg->as_register(), 1); + ce->store_parameter(_lock_reg->as_register(), 0); + Runtime1::StubID enter_id; + if (ce->compilation()->has_fpu_code()) { + enter_id = Runtime1::monitorenter_id; + } else { + enter_id = Runtime1::monitorenter_nofpu_id; + } + __ far_call(RuntimeAddress(Runtime1::entry_for(enter_id))); + ce->add_call_info_here(_info); + ce->verify_oop_map(_info); + __ b(_continuation); +} + + +void MonitorExitStub::emit_code(LIR_Assembler* ce) { + __ bind(_entry); + if (_compute_lock) { + // lock_reg was destroyed by fast unlocking attempt => recompute it + ce->monitor_address(_monitor_ix, _lock_reg); + } + ce->store_parameter(_lock_reg->as_register(), 0); + // note: non-blocking leaf routine => no call info needed + Runtime1::StubID exit_id; + if (ce->compilation()->has_fpu_code()) { + exit_id = Runtime1::monitorexit_id; + } else { + exit_id = Runtime1::monitorexit_nofpu_id; + } + __ adr(lr, _continuation); + __ far_jump(RuntimeAddress(Runtime1::entry_for(exit_id))); +} + + +// Implementation of patching: +// - Copy the code at given offset to an inlined buffer (first the bytes, then the number of bytes) +// - Replace original code with a call to the stub +// At Runtime: +// - call to stub, jump to runtime +// - in runtime: preserve all registers (rspecially objects, i.e., source and destination object) +// - in runtime: after initializing class, restore original code, reexecute instruction + +int PatchingStub::_patch_info_offset = 0; + +void 
PatchingStub::align_patch_site(MacroAssembler* masm) { +} + +void PatchingStub::emit_code(LIR_Assembler* ce) { + // NativeCall::instruction_size is dynamically calculated based on CPU, + // armv7 -> 3 instructions, armv6 -> 5 instructions. Initialize _patch_info_offset + // here, once the CPU has been determined. + if (!_patch_info_offset) + _patch_info_offset = -NativeCall::instruction_size; + assert(_patch_info_offset == -NativeCall::instruction_size, "must not change"); + assert(NativeCall::instruction_size <= _bytes_to_copy && _bytes_to_copy <= 0xFF, "not enough room for call"); + + Label call_patch; + + // static field accesses have special semantics while the class + // initializer is being run so we emit a test which can be used to + // check that this code is being executed by the initializing + // thread. + address being_initialized_entry = __ pc(); + if (CommentedAssembly) { + __ block_comment(" patch template"); + } + if (_id == load_klass_id) { + // produce a copy of the load klass instruction for use by the being initialized case +#ifdef ASSERT + address start = __ pc(); +#endif + Metadata* o = NULL; + __ mov_metadata(_obj, o); + __ nop(); // added to call site by LIR_Assembler::patching_epilog +#ifdef ASSERT + for (int i = 0; i < _bytes_to_copy; i++) { + address ptr = (address)(_pc_start + i); + int a_byte = (*ptr) & 0xFF; + assert(a_byte == *start++, "should be the same code"); + } +#endif + } else if (_id == load_mirror_id || _id == load_appendix_id) { + // produce a copy of the load mirror instruction for use by the being + // initialized case +#ifdef ASSERT + address start = __ pc(); +#endif + jobject o = NULL; + __ movoop(_obj, o, true); + __ nop(); // added to call site by LIR_Assembler::patching_epilog +#ifdef ASSERT + for (int i = 0; i < _bytes_to_copy; i++) { + address ptr = (address)(_pc_start + i); + int a_byte = (*ptr) & 0xFF; + assert(a_byte == *start++, "should be the same code"); + } +#endif + } else { + // make a copy of the code which is going to be patched. + assert(_bytes_to_copy % BytesPerWord == 0, "all instructions are 4 bytes"); + assert(((unsigned long) _pc_start) % BytesPerWord == 0, "patch offset should be aligned"); + const int words_to_copy = _bytes_to_copy / BytesPerWord; + for (int i = 0; i < words_to_copy; i++) { + int *ptr = ((int *) _pc_start) + i; + __ emit_int32(*ptr); + *ptr = 0xe320f000; // make the site look like a nop + } + } + + int bytes_to_skip = _bytes_to_copy; + + // this switch will be patched by NativeGeneralJump::replace_mt_safe; + // it is intended to distinguish entry via being_initialized_entry from + // entry via the call site + int switch_offset = __ offset(); + Label patching_switch; + __ b(patching_switch); + __ bind(patching_switch); + bytes_to_skip += __ offset() - switch_offset; + + if (_id == load_mirror_id) { + int offset = __ offset(); + if (CommentedAssembly) { + __ block_comment(" being_initialized check"); + } + assert(_obj != noreg, "must be a valid register"); + // Load without verification to keep code size small. We need it because + // being_initialized_entry_offset has to fit in a byte. Also, we know it's not null. + __ ldr(rscratch1, Address(_obj, java_lang_Class::klass_offset_in_bytes())); + __ ldr(rscratch1, Address(rscratch1, InstanceKlass::init_thread_offset())); + __ cmp(rthread, rscratch1); + __ b(call_patch, Assembler::NE); + + // access_field patches may execute the patched code before it's + // copied back into place so we need to jump back into the main + // code of the nmethod to continue execution.
+ __ b(_patch_site_continuation); + // make sure this extra code gets skipped + bytes_to_skip += __ offset() - offset; + } + + // Now emit the patch record telling the runtime how to find the + // pieces of the patch. We only need 3 bytes but it has to be + // aligned as an instruction so emit 4 bytes. + int sizeof_patch_record = 4; + bytes_to_skip += sizeof_patch_record; + + // emit the offsets needed to find the code to patch + int being_initialized_entry_offset = __ pc() - being_initialized_entry + sizeof_patch_record; + + __ emit_int8(0); + __ emit_int8(being_initialized_entry_offset); + __ emit_int8(bytes_to_skip); + __ emit_int8(0); + + address patch_info_pc = __ pc(); + + address entry = __ pc(); + NativeGeneralJump::insert_unconditional((address)_pc_start, entry); + address target = NULL; + relocInfo::relocType reloc_type = relocInfo::none; + switch (_id) { + case access_field_id: target = Runtime1::entry_for(Runtime1::access_field_patching_id); break; + case load_klass_id: target = Runtime1::entry_for(Runtime1::load_klass_patching_id); reloc_type = relocInfo::metadata_type; break; + case load_mirror_id: target = Runtime1::entry_for(Runtime1::load_mirror_patching_id); reloc_type = relocInfo::oop_type; break; + case load_appendix_id: target = Runtime1::entry_for(Runtime1::load_appendix_patching_id); reloc_type = relocInfo::oop_type; break; + default: ShouldNotReachHere(); + } + __ bind(call_patch); + + if (CommentedAssembly) { + __ block_comment("patch entry point"); + } + __ mov(rscratch1, RuntimeAddress(target)); + __ bl(rscratch1); + // pad with nops to globally known upper bound of patch site size + while (patch_info_pc - __ pc() < _patch_info_offset) + __ nop(); + assert(_patch_info_offset == (patch_info_pc - __ pc()), "must not change, required by shared code"); + ce->add_call_info_here(_info); + int jmp_off = __ offset(); + __ b(_patch_site_entry); + // Add enough nops so deoptimization can overwrite the jmp above with a call + // and not destroy the world. + for (int j = __ offset() ; j < jmp_off + NativeCall::instruction_size; j += NativeInstruction::arm_insn_sz) { + __ nop(); + } + + if (_id == load_klass_id || _id == load_mirror_id || _id == load_appendix_id) { + CodeSection* cs = __ code_section(); + RelocIterator iter(cs, (address)_pc_start, (address)(_pc_start + 1)); + relocInfo::change_reloc_info_for_address(&iter, (address) _pc_start, reloc_type, relocInfo::none); + } +} + + +void DeoptimizeStub::emit_code(LIR_Assembler* ce) { + __ bind(_entry); + __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::deoptimize_id))); + ce->add_call_info_here(_info); + DEBUG_ONLY(__ should_not_reach_here()); +} + + +void ImplicitNullCheckStub::emit_code(LIR_Assembler* ce) { + address a; + if (_info->deoptimize_on_exception()) { + // Deoptimize, do not throw the exception, because it is probably wrong to do it here. 
+ a = Runtime1::entry_for(Runtime1::predicate_failed_trap_id); + } else { + a = Runtime1::entry_for(Runtime1::throw_null_pointer_exception_id); + } + + ce->compilation()->implicit_exception_table()->append(_offset, __ offset()); + __ bind(_entry); + __ far_call(RuntimeAddress(a)); + ce->add_call_info_here(_info); + ce->verify_oop_map(_info); + debug_only(__ should_not_reach_here()); +} + + +void SimpleExceptionStub::emit_code(LIR_Assembler* ce) { + assert(__ rsp_offset() == 0, "frame size should be fixed"); + + __ bind(_entry); + // pass the object in a scratch register because all other registers + // must be preserved + if (_obj->is_cpu_register()) { + __ mov(rscratch1, _obj->as_register()); + } + __ far_call(RuntimeAddress(Runtime1::entry_for(_stub)), NULL, rscratch2); + ce->add_call_info_here(_info); + debug_only(__ should_not_reach_here()); +} + + +void ArrayCopyStub::emit_code(LIR_Assembler* ce) { + //---------------slow case: call to native----------------- + __ bind(_entry); + // Figure out where the args should go + // This should really convert the IntrinsicID to the Method* and signature + // but I don't know how to do that. + // + VMRegPair args[5]; + BasicType signature[5] = { T_OBJECT, T_INT, T_OBJECT, T_INT, T_INT}; + SharedRuntime::java_calling_convention(signature, args, 5, true); + + // push parameters + // (src, src_pos, dest, destPos, length) + Register r[5]; + r[0] = src()->as_register(); + r[1] = src_pos()->as_register(); + r[2] = dst()->as_register(); + r[3] = dst_pos()->as_register(); + r[4] = length()->as_register(); + + // next registers will get stored on the stack + for (int i = 0; i < 5 ; i++ ) { + VMReg r_1 = args[i].first(); + if (r_1->is_stack()) { + int st_off = r_1->reg2stack() * wordSize; + __ str (r[i], Address(sp, st_off)); + } else { + assert(r[i] == args[i].first()->as_Register(), "Wrong register for arg "); + } + } + + ce->align_call(lir_static_call); + + ce->emit_static_call_stub(); + Address resolve(SharedRuntime::get_resolve_static_call_stub(), + relocInfo::static_call_type); + __ trampoline_call(resolve); + ce->add_call_info_here(info()); + +#ifndef PRODUCT + __ lea(rscratch2, ExternalAddress((address)&Runtime1::_arraycopy_slowcase_cnt)); + __ increment(Address(rscratch2)); +#endif + + __ b(_continuation); +} + + +///////////////////////////////////////////////////////////////////////////// +#if INCLUDE_ALL_GCS + +void G1PreBarrierStub::emit_code(LIR_Assembler* ce) { + // At this point we know that marking is in progress. + // If do_load() is true then we have to emit the + // load of the previous value; otherwise it has already + // been loaded into _pre_val. 
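For context, a conceptual outline of the SATB pre-barrier that this slow-path stub completes; the identifiers below are illustrative, not the HotSpot ones:
  // if (thread_satb_marking_active) {        // fast-path check emitted inline, before the stub
  //   pre_val = *field;                      // previous value of the field (the do_load() case)
  //   if (pre_val != NULL)
  //     satb_enqueue(pre_val);               // what the g1_pre_barrier_slow_id runtime call does
  // }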
+ + __ bind(_entry); + assert(pre_val()->is_register(), "Precondition."); + + Register pre_val_reg = pre_val()->as_register(); + + if (do_load()) { + ce->mem2reg(addr(), pre_val(), T_OBJECT, patch_code(), info(), false /*wide*/, false /*unaligned*/); + } + __ cbz(pre_val_reg, _continuation); + ce->store_parameter(pre_val()->as_register(), 0); + __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::g1_pre_barrier_slow_id))); + __ b(_continuation); +} + +void G1PostBarrierStub::emit_code(LIR_Assembler* ce) { + __ bind(_entry); + assert(addr()->is_register(), "Precondition."); + assert(new_val()->is_register(), "Precondition."); + Register new_val_reg = new_val()->as_register(); + __ cbz(new_val_reg, _continuation); + ce->store_parameter(addr()->as_pointer_register(), 0); + __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::g1_post_barrier_slow_id))); + __ b(_continuation); +} + +#endif // INCLUDE_ALL_GCS +///////////////////////////////////////////////////////////////////////////// + +#undef __ --- /dev/null 2016-08-26 13:07:44.000000000 +0300 +++ new/src/cpu/aarch32/vm/c1_Defs_aarch32.hpp 2016-08-26 13:07:44.000000000 +0300 @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ +// This file is a derivative work resulting from (and including) modifications +// made by Azul Systems, Inc. The dates of such changes are 2013-2016. +// Copyright 2013-2016 Azul Systems, Inc. All Rights Reserved. +// +// Please contact Azul Systems, 385 Moffett Park Drive, Suite 115, Sunnyvale, +// CA 94089 USA or visit www.azul.com if you need additional information or +// have any questions. + +#ifndef CPU_AARCH32_VM_C1_DEFS_AARCH32_HPP +#define CPU_AARCH32_VM_C1_DEFS_AARCH32_HPP + +// Native word offsets from memory address (little endian format) +enum { + pd_lo_word_offset_in_bytes = 0, + pd_hi_word_offset_in_bytes = BytesPerWord +}; + +// TODO: We should understand what values are correct for the following 3 flags +// relevant to floating point operations: +// - UseSSE +// Highest supported SSE instruction set on x86/x64. I believe we should +// set it to 0 in VM_Version::initialize(), like other non-x86 ports do. 
+// - RoundFPResults +// Indicates whether rounding is needed for floating point results +// - pd_strict_fp_requires_explicit_rounding +// The same as above but for the strictfp mode + +// Explicit rounding operations are not required to implement the strictfp mode +enum { + pd_strict_fp_requires_explicit_rounding = false +}; + +// Registers +enum { + // Number of registers used during code emission + pd_nof_cpu_regs_frame_map = RegisterImpl::number_of_registers, + pd_nof_fpu_regs_frame_map = FloatRegisterImpl::number_of_registers, + + // Number of registers killed by calls + pd_nof_caller_save_cpu_regs_frame_map = 8, + pd_nof_caller_save_fpu_regs_frame_map = 32, + + // The following two constants need to be defined since they are referenced + // from c1_FrameMap.hpp, but actually they are never used, so can be set to + // arbitrary values. + pd_nof_cpu_regs_reg_alloc = -1, + pd_nof_fpu_regs_reg_alloc = -1, + + // All the constants below are used by linear scan register allocator only. + // Number of registers visible to register allocator + pd_nof_cpu_regs_linearscan = pd_nof_cpu_regs_frame_map, + pd_nof_fpu_regs_linearscan = pd_nof_fpu_regs_frame_map, + pd_nof_xmm_regs_linearscan = 0, + + // Register allocator specific register numbers corresponding to first/last + // CPU/FPU registers available for allocation + pd_first_cpu_reg = 0, + pd_last_cpu_reg = 7, + pd_first_fpu_reg = pd_nof_cpu_regs_frame_map, + pd_last_fpu_reg = pd_first_fpu_reg + 31, + + // Register allocator specific register numbers corresponding to first/last + // CPU/FPU callee-saved registers. These constants are used in + // LinearScan::is_caller_save() only. + pd_first_callee_saved_cpu_reg = 4, + pd_last_callee_saved_cpu_reg = 11, + pd_first_callee_saved_fpu_reg = pd_first_fpu_reg + 16, + pd_last_callee_saved_fpu_reg = pd_first_fpu_reg + 31 +}; + +// This flag must be in sync with how the floating point registers are stored +// on the stack by RegisterSaver::save_live_registers() method +// (sharedRuntime_aarch32.cpp) and save_live_registers() function +// (c1_Runtime1_aarch32.cpp). On AArch32 the floating point registers keep +// floats and doubles in their native form. No float to double conversion +// happens when the registers are stored on the stack. This is opposite to +// what happens on x86, where the FPU stack registers are 80 bits wide, +// and storing them in either 4 byte or 8 byte stack slot is a conversion +// operation. +enum { + pd_float_saved_as_double = false +}; + +#endif // CPU_AARCH32_VM_C1_DEFS_AARCH32_HPP --- /dev/null 2016-08-26 13:07:46.000000000 +0300 +++ new/src/cpu/aarch32/vm/c1_FpuStackSim_aarch32.cpp 2016-08-26 13:07:46.000000000 +0300 @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "c1/c1_FpuStackSim.hpp" + +// No FPU stack on AArch32 --- /dev/null 2016-08-26 13:07:47.000000000 +0300 +++ new/src/cpu/aarch32/vm/c1_FpuStackSim_aarch32.hpp 2016-08-26 13:07:47.000000000 +0300 @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_C1_FPUSTACKSIM_AARCH32_HPP +#define CPU_AARCH32_VM_C1_FPUSTACKSIM_AARCH32_HPP + +// No FPU stack on AArch32 + +#endif // CPU_AARCH32_VM_C1_FPUSTACKSIM_AARCH32_HPP --- /dev/null 2016-08-26 13:07:49.000000000 +0300 +++ new/src/cpu/aarch32/vm/c1_FrameMap_aarch32.cpp 2016-08-26 13:07:49.000000000 +0300 @@ -0,0 +1,258 @@ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ +// This file is a derivative work resulting from (and including) modifications +// made by Azul Systems, Inc. The dates of such changes are 2013-2016. +// Copyright 2013-2016 Azul Systems, Inc. All Rights Reserved. 
+// +// Please contact Azul Systems, 385 Moffett Park Drive, Suite 115, Sunnyvale, +// CA 94089 USA or visit www.azul.com if you need additional information or +// have any questions. + +#include "precompiled.hpp" +#include "c1/c1_FrameMap.hpp" +#include "c1/c1_LIR.hpp" +#include "runtime/sharedRuntime.hpp" +#include "vmreg_aarch32.inline.hpp" + +LIR_Opr FrameMap::r0_opr; +LIR_Opr FrameMap::r1_opr; +LIR_Opr FrameMap::r2_opr; +LIR_Opr FrameMap::r3_opr; +LIR_Opr FrameMap::r4_opr; +LIR_Opr FrameMap::r5_opr; +LIR_Opr FrameMap::r6_opr; +LIR_Opr FrameMap::r7_opr; +LIR_Opr FrameMap::r8_opr; +LIR_Opr FrameMap::r9_opr; +LIR_Opr FrameMap::r10_opr; +LIR_Opr FrameMap::r11_opr; +LIR_Opr FrameMap::r12_opr; +LIR_Opr FrameMap::r13_opr; +LIR_Opr FrameMap::r14_opr; +LIR_Opr FrameMap::r15_opr; + +LIR_Opr FrameMap::r0_oop_opr; +LIR_Opr FrameMap::r1_oop_opr; +LIR_Opr FrameMap::r2_oop_opr; +LIR_Opr FrameMap::r3_oop_opr; +LIR_Opr FrameMap::r4_oop_opr; +LIR_Opr FrameMap::r5_oop_opr; +LIR_Opr FrameMap::r6_oop_opr; +LIR_Opr FrameMap::r7_oop_opr; +LIR_Opr FrameMap::r8_oop_opr; +LIR_Opr FrameMap::r9_oop_opr; +LIR_Opr FrameMap::r10_oop_opr; +LIR_Opr FrameMap::r11_oop_opr; +LIR_Opr FrameMap::r12_oop_opr; +LIR_Opr FrameMap::r13_oop_opr; +LIR_Opr FrameMap::r14_oop_opr; +LIR_Opr FrameMap::r15_oop_opr; + +LIR_Opr FrameMap::r0_metadata_opr; +LIR_Opr FrameMap::r1_metadata_opr; +LIR_Opr FrameMap::r2_metadata_opr; +LIR_Opr FrameMap::r3_metadata_opr; +LIR_Opr FrameMap::r4_metadata_opr; +LIR_Opr FrameMap::r5_metadata_opr; + +LIR_Opr FrameMap::sp_opr; +LIR_Opr FrameMap::receiver_opr; + +LIR_Opr FrameMap::rscratch1_opr; +LIR_Opr FrameMap::rscratch2_opr; +LIR_Opr FrameMap::rscratch_long_opr; + +LIR_Opr FrameMap::long0_opr; +LIR_Opr FrameMap::long1_opr; +LIR_Opr FrameMap::long2_opr; +LIR_Opr FrameMap::fpu0_float_opr; +LIR_Opr FrameMap::fpu0_double_opr; + +LIR_Opr FrameMap::_caller_save_cpu_regs[] = { 0, }; +LIR_Opr FrameMap::_caller_save_fpu_regs[] = { 0, }; + +void FrameMap::initialize() { + assert(!_init_done, "must be called once"); + + int i = 0; + map_register(i, r0); r0_opr = LIR_OprFact::single_cpu(i); i++; + map_register(i, r1); r1_opr = LIR_OprFact::single_cpu(i); i++; + map_register(i, r2); r2_opr = LIR_OprFact::single_cpu(i); i++; + map_register(i, r3); r3_opr = LIR_OprFact::single_cpu(i); i++; + map_register(i, r4); r4_opr = LIR_OprFact::single_cpu(i); i++; + map_register(i, r5); r5_opr = LIR_OprFact::single_cpu(i); i++; + map_register(i, r6); r6_opr = LIR_OprFact::single_cpu(i); i++; + map_register(i, r7); r7_opr = LIR_OprFact::single_cpu(i); i++; + // Mapping lines in this block may be arbitrarily mixed, but all allocatable + // registers should go above this comment, and unallocatable registers - + // below. + map_register(i, r8); r8_opr = LIR_OprFact::single_cpu(i); i++; // rthread + map_register(i, r9); r9_opr = LIR_OprFact::single_cpu(i); i++; // rscratch1 + map_register(i, r10); r10_opr = LIR_OprFact::single_cpu(i); i++; // rmethod + map_register(i, r11); r11_opr = LIR_OprFact::single_cpu(i); i++; // rfp + map_register(i, r12); r12_opr = LIR_OprFact::single_cpu(i); i++; // rscratch2 + map_register(i, r13); r13_opr = LIR_OprFact::single_cpu(i); i++; // sp + map_register(i, r14); r14_opr = LIR_OprFact::single_cpu(i); i++; // lr + map_register(i, r15); r15_opr = LIR_OprFact::single_cpu(i); i++; // r15_pc + + // This flag must be set after all integer registers are mapped but before + // the first use of as_*_opr() methods. 
+ _init_done = true; + + r0_oop_opr = as_oop_opr(r0); + r1_oop_opr = as_oop_opr(r1); + r2_oop_opr = as_oop_opr(r2); + r3_oop_opr = as_oop_opr(r3); + r4_oop_opr = as_oop_opr(r4); + r5_oop_opr = as_oop_opr(r5); + r6_oop_opr = as_oop_opr(r6); + r7_oop_opr = as_oop_opr(r7); + r8_oop_opr = as_oop_opr(r8); + r9_oop_opr = as_oop_opr(r9); + r10_oop_opr = as_oop_opr(r10); + r11_oop_opr = as_oop_opr(r11); + r12_oop_opr = as_oop_opr(r12); + r13_oop_opr = as_oop_opr(r13); + r14_oop_opr = as_oop_opr(r14); + r15_oop_opr = as_oop_opr(r15); + + r0_metadata_opr = as_metadata_opr(r0); + r1_metadata_opr = as_metadata_opr(r1); + r2_metadata_opr = as_metadata_opr(r2); + r3_metadata_opr = as_metadata_opr(r3); + r4_metadata_opr = as_metadata_opr(r4); + r5_metadata_opr = as_metadata_opr(r5); + + sp_opr = as_pointer_opr(sp); + + VMRegPair regs; + BasicType sig_bt = T_OBJECT; + SharedRuntime::java_calling_convention(&sig_bt, ®s, 1, true); + receiver_opr = as_oop_opr(regs.first()->as_Register()); + + rscratch1_opr = as_opr(rscratch1); + rscratch2_opr = as_opr(rscratch2); + rscratch_long_opr = as_long_opr(rscratch1, rscratch2); + + long0_opr = as_long_opr(r0, r1); + long1_opr = as_long_opr(r2, r3); + long2_opr = as_long_opr(r4, r5); + fpu0_float_opr = LIR_OprFact::single_fpu(0); + fpu0_double_opr = LIR_OprFact::double_fpu(0, 1); + + _caller_save_cpu_regs[0] = r0_opr; + _caller_save_cpu_regs[1] = r1_opr; + _caller_save_cpu_regs[2] = r2_opr; + _caller_save_cpu_regs[3] = r3_opr; + _caller_save_cpu_regs[4] = r4_opr; + _caller_save_cpu_regs[5] = r5_opr; + _caller_save_cpu_regs[6] = r6_opr; + _caller_save_cpu_regs[7] = r7_opr; + + for (i = 0; i < nof_caller_save_fpu_regs; i++) { + _caller_save_fpu_regs[i] = LIR_OprFact::single_fpu(i); + } +} + +LIR_Opr FrameMap::stack_pointer() { + return sp_opr; +} + +// TODO: Make sure that neither method handle intrinsics nor compiled lambda +// forms modify sp register (i.e., vmIntrinsics::{_invokeBasic, _linkToVirtual, +// _linkToStatic, _linkToSpecial, _linkToInterface, _compiledLambdaForm}) +LIR_Opr FrameMap::method_handle_invoke_SP_save_opr() { + return LIR_OprFact::illegalOpr; +} + +// Return LIR_Opr corresponding to the given VMRegPair and data type +LIR_Opr FrameMap::map_to_opr(BasicType type, VMRegPair* reg, bool) { + LIR_Opr opr = LIR_OprFact::illegalOpr; + VMReg r_1 = reg->first(); + VMReg r_2 = reg->second(); + if (r_1->is_stack()) { + // Convert stack slot to sp-based address. The calling convention does not + // count the SharedRuntime::out_preserve_stack_slots() value, so we must + // add it in here. 
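+    // Illustrative example only (assuming out_preserve_stack_slots() == 0 and
+    // the usual 4-byte VMRegImpl::stack_slot_size): incoming stack slot 3
+    // becomes the operand at sp + 12.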
+ int st_off = + (r_1->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * + VMRegImpl::stack_slot_size; + opr = LIR_OprFact::address(new LIR_Address(sp_opr, st_off, type)); + } else if (r_1->is_Register()) { + Register reg1 = r_1->as_Register(); + if (type == T_LONG) { + assert(r_2->is_Register(), "wrong VMReg"); + Register reg2 = r_2->as_Register(); + opr = as_long_opr(reg1, reg2); + } else if (type == T_OBJECT || type == T_ARRAY) { + opr = as_oop_opr(reg1); + } else if (type == T_METADATA) { + opr = as_metadata_opr(reg1); + } else { + opr = as_opr(reg1); + } + } else if (r_1->is_FloatRegister()) { + assert(type == T_DOUBLE || type == T_FLOAT, "wrong type"); + int num = r_1->as_FloatRegister()->encoding(); + if (type == T_FLOAT) { + opr = LIR_OprFact::single_fpu(num); + } else { + assert(is_even(num) && r_2->as_FloatRegister()->encoding() == (num + 1), + "wrong VMReg"); + opr = LIR_OprFact::double_fpu(num, num + 1); + } + } else { + ShouldNotReachHere(); + } + return opr; +} + +// Return VMReg corresponding to the given FPU register number as it is +// encoded in LIR_Opr. The conversion is straightforward because in this +// implementation the encoding of FPU registers in LIR_Opr's is the same as +// in FloatRegister's. +VMReg FrameMap::fpu_regname(int n) { + return as_FloatRegister(n)->as_VMReg(); +} + +// Check that the frame is properly addressable on the platform. The sp-based +// address of every frame slot must have the offset expressible as AArch32's +// imm12 with the separately stored sign. +bool FrameMap::validate_frame() { + int max_offset = in_bytes(framesize_in_bytes()); + int java_index = 0; + for (int i = 0; i < _incoming_arguments->length(); i++) { + LIR_Opr opr = _incoming_arguments->at(i); + if (opr->is_stack()) { + max_offset = MAX2(_argument_locations->at(java_index), max_offset); + } + java_index += type2size[opr->type()]; + } + return Assembler::is_valid_for_offset_imm(max_offset, 12); +} + +Address FrameMap::make_new_address(ByteSize sp_offset) const { + return Address(sp, in_bytes(sp_offset)); +} --- /dev/null 2016-08-26 13:07:50.000000000 +0300 +++ new/src/cpu/aarch32/vm/c1_FrameMap_aarch32.hpp 2016-08-26 13:07:50.000000000 +0300 @@ -0,0 +1,174 @@ +/* + * Copyright (c) 1999, 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ +// This file is a derivative work resulting from (and including) modifications +// made by Azul Systems, Inc. The dates of such changes are 2013-2016. +// Copyright 2013-2016 Azul Systems, Inc. 
All Rights Reserved. +// +// Please contact Azul Systems, 385 Moffett Park Drive, Suite 115, Sunnyvale, +// CA 94089 USA or visit www.azul.com if you need additional information or +// have any questions. + +#ifndef CPU_AARCH32_VM_C1_FRAMEMAP_AARCH32_HPP +#define CPU_AARCH32_VM_C1_FRAMEMAP_AARCH32_HPP + +// The following schema visualizes how a C1 frame looks like on AArch32. +// It corresponds to the case of an unextended frame. Each line of text +// represents one 4-byte slot. Every monitor takes two slots. Positions of +// incoming arguments are determined by the Java calling convention. Spill +// area and monitor area are not required to be 8-byte aligned. The slot +// for deoptimization support is used by frame::deoptimize() method to save +// the original pc before patching in the new one. +// +// When LIR_Opr's reference stack slots, they use virtual stack slot indices. +// They are mapped to the real stack slots by FrameMap::sp_offset_for_slot() +// and FrameMap::sp_offset_for_double_slot() methods. The first _argcount +// virtual stack slots correspond to the real stack slots occupied by the +// incoming arguments. Their mapping is defined by _argument_locations array +// (which is filled in by applying the Java calling convention). All other +// virtual stack slots correspond to spill slots. +// +// Higher addresses +// | incoming | virtual stack slots +// | | [0 ... _arg_count - 1] +// | arguments | +// |====================================|----X- 8-byte aligned +// | previous lr | /|\ address +// rfp ===> |------------------------------------| | +// | previous rfp | | +// |====================================| | +// | alignment slot (if needed) | | +// |====================================| | +// | slot for deoptimization support | | +// |====================================| | +// | monitor [_num_monitors - 1] object | | +// | | | +// | monitor [_num_monitors - 1] lock | | +// |------------------------------------| | +// | | | +// Direction of | ... | | _framesize +// stack growth | | | slots +// | |------------------------------------| | +// V | monitor [0] object | | +// | | | +// | monitor [0] lock | | +// |====================================| | +// | spill slot [_num_spills - 1] | | virtual stack slot +// |------------------------------------| | [_arg_count + _num_spills - 1] +// | ... | | ... +// |------------------------------------| | +// | spill slot [0] | | virtual stack slot +// |====================================| | [_arg_count] +// | reserved argument area for | | +// | ... 
| | +// | outgoing calls (8-byte aligned) | \|/ +// sp ===> |====================================|----X- 8-byte aligned +// | | address +// Lower addresses + + public: + enum { + first_available_sp_in_frame = 0, + frame_pad_in_bytes = 8 + }; + + public: + static LIR_Opr r0_opr; + static LIR_Opr r1_opr; + static LIR_Opr r2_opr; + static LIR_Opr r3_opr; + static LIR_Opr r4_opr; + static LIR_Opr r5_opr; + static LIR_Opr r6_opr; + static LIR_Opr r7_opr; + static LIR_Opr r8_opr; + static LIR_Opr r9_opr; + static LIR_Opr r10_opr; + static LIR_Opr r11_opr; + static LIR_Opr r12_opr; + static LIR_Opr r13_opr; + static LIR_Opr r14_opr; + static LIR_Opr r15_opr; + + static LIR_Opr r0_oop_opr; + static LIR_Opr r1_oop_opr; + static LIR_Opr r2_oop_opr; + static LIR_Opr r3_oop_opr; + static LIR_Opr r4_oop_opr; + static LIR_Opr r5_oop_opr; + static LIR_Opr r6_oop_opr; + static LIR_Opr r7_oop_opr; + static LIR_Opr r8_oop_opr; + static LIR_Opr r9_oop_opr; + static LIR_Opr r10_oop_opr; + static LIR_Opr r11_oop_opr; + static LIR_Opr r12_oop_opr; + static LIR_Opr r13_oop_opr; + static LIR_Opr r14_oop_opr; + static LIR_Opr r15_oop_opr; + + static LIR_Opr r0_metadata_opr; + static LIR_Opr r1_metadata_opr; + static LIR_Opr r2_metadata_opr; + static LIR_Opr r3_metadata_opr; + static LIR_Opr r4_metadata_opr; + static LIR_Opr r5_metadata_opr; + + static LIR_Opr sp_opr; + static LIR_Opr receiver_opr; + + static LIR_Opr rscratch1_opr; + static LIR_Opr rscratch2_opr; + static LIR_Opr rscratch_long_opr; + + static LIR_Opr long0_opr; + static LIR_Opr long1_opr; + static LIR_Opr long2_opr; + static LIR_Opr fpu0_float_opr; + static LIR_Opr fpu0_double_opr; + + static LIR_Opr as_long_opr(Register r1, Register r2) { + return LIR_OprFact::double_cpu(cpu_reg2rnr(r1), cpu_reg2rnr(r2)); + } + static LIR_Opr as_pointer_opr(Register r) { + return LIR_OprFact::single_cpu(cpu_reg2rnr(r)); + } + + static VMReg fpu_regname(int n); + + static bool is_caller_save_register(LIR_Opr opr) { + // On AArch32, unlike on SPARC, we never explicitly request the C1 register + // allocator to allocate a callee-saved register. Since the only place this + // method is called is the assert in LinearScan::color_lir_opr(), we can + // safely just always return true here. + return true; + } + static int nof_caller_save_cpu_regs() { + return pd_nof_caller_save_cpu_regs_frame_map; + } + static int last_cpu_reg() { + return pd_last_cpu_reg; + } + +#endif // CPU_AARCH32_VM_C1_FRAMEMAP_AARCH32_HPP --- /dev/null 2016-08-26 13:07:52.000000000 +0300 +++ new/src/cpu/aarch32/vm/c1_LIRAssembler_aarch32.cpp 2016-08-26 13:07:52.000000000 +0300 @@ -0,0 +1,3272 @@ +/* + * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ +// This file is a derivative work resulting from (and including) modifications +// made by Azul Systems, Inc. The dates of such changes are 2013-2016. +// Copyright 2013-2016 Azul Systems, Inc. All Rights Reserved. +// +// Please contact Azul Systems, 385 Moffett Park Drive, Suite 115, Sunnyvale, +// CA 94089 USA or visit www.azul.com if you need additional information or +// have any questions. + +#include "precompiled.hpp" +#include "asm/assembler.hpp" +#include "c1/c1_CodeStubs.hpp" +#include "c1/c1_Compilation.hpp" +#include "c1/c1_LIRAssembler.hpp" +#include "c1/c1_MacroAssembler.hpp" +#include "c1/c1_Runtime1.hpp" +#include "c1/c1_ValueStack.hpp" +#include "ci/ciArrayKlass.hpp" +#include "ci/ciInstance.hpp" +#include "gc_interface/collectedHeap.hpp" +#include "memory/barrierSet.hpp" +#include "memory/cardTableModRefBS.hpp" +#include "nativeInst_aarch32.hpp" +#include "oops/objArrayKlass.hpp" +#include "runtime/sharedRuntime.hpp" +#include "vmreg_aarch32.inline.hpp" + +#ifndef PRODUCT +#define COMMENT(x) do { __ block_comment(x); } while (0) +#else +#define COMMENT(x) +#endif + +NEEDS_CLEANUP // remove this definitions ? +const Register IC_Klass = rscratch2; // where the IC klass is cached +const Register SYNC_header = r0; // synchronization header +const Register SHIFT_count = r0; // where count for shift operations must be + +#define __ _masm-> + + +static void select_different_registers(Register preserve, + Register extra, + Register &tmp1, + Register &tmp2) { + if (tmp1 == preserve) { + assert_different_registers(tmp1, tmp2, extra); + tmp1 = extra; + } else if (tmp2 == preserve) { + assert_different_registers(tmp1, tmp2, extra); + tmp2 = extra; + } + assert_different_registers(preserve, tmp1, tmp2); +} + + + +static void select_different_registers(Register preserve, + Register extra, + Register &tmp1, + Register &tmp2, + Register &tmp3) { + if (tmp1 == preserve) { + assert_different_registers(tmp1, tmp2, tmp3, extra); + tmp1 = extra; + } else if (tmp2 == preserve) { + assert_different_registers(tmp1, tmp2, tmp3, extra); + tmp2 = extra; + } else if (tmp3 == preserve) { + assert_different_registers(tmp1, tmp2, tmp3, extra); + tmp3 = extra; + } + assert_different_registers(preserve, tmp1, tmp2, tmp3); +} + +bool LIR_Assembler::is_small_constant(LIR_Opr opr) { Unimplemented(); return false; } + + +LIR_Opr LIR_Assembler::receiverOpr() { + return FrameMap::receiver_opr; +} + +LIR_Opr LIR_Assembler::osrBufferPointer() { + return FrameMap::as_pointer_opr(receiverOpr()->as_register()); +} + +//--------------fpu register translations----------------------- + + +address LIR_Assembler::float_constant(float f) { + address const_addr = __ float_constant(f); + if (const_addr == NULL) { + bailout("const section overflow"); + return __ code()->consts()->start(); + } else { + return const_addr; + } +} + + +address LIR_Assembler::double_constant(double d) { + address const_addr = __ double_constant(d); + if (const_addr == NULL) { + bailout("const section overflow"); + return __ code()->consts()->start(); + } else { + return const_addr; + } +} + +address LIR_Assembler::int_constant(jlong n) { + address 
const_addr = __ long_constant(n); + if (const_addr == NULL) { + bailout("const section overflow"); + return __ code()->consts()->start(); + } else { + return const_addr; + } +} + +void LIR_Assembler::set_24bit_FPU() { Unimplemented(); } + +void LIR_Assembler::reset_FPU() { Unimplemented(); } + +void LIR_Assembler::fpop() { Unimplemented(); } + +void LIR_Assembler::fxch(int i) { Unimplemented(); } + +void LIR_Assembler::fld(int i) { Unimplemented(); } + +void LIR_Assembler::ffree(int i) { Unimplemented(); } + +void LIR_Assembler::breakpoint() { __ bkpt(0); } + +void LIR_Assembler::push(LIR_Opr opr) { Unimplemented(); } + +void LIR_Assembler::pop(LIR_Opr opr) { Unimplemented(); } + +//------------------------------------------- + +static Register as_reg(LIR_Opr op) { + return op->is_double_cpu() ? op->as_register_lo() : op->as_register(); +} + +Address LIR_Assembler::as_Address(LIR_Address* addr) { + // as_Address(LIR_Address*, Address::InsnDataType) should be used instead + ShouldNotCallThis(); +} + +Address LIR_Assembler::as_Address_hi(LIR_Address* addr) { + // as_Address_hi(LIR_Address*, Address::InsnDataType) should be used instead + ShouldNotCallThis(); +} + +Address LIR_Assembler::as_Address_lo(LIR_Address* addr) { + // as_Address_lo(LIR_Address*, Address::InsnDataType) should be used instead + ShouldNotCallThis(); +} + +Address LIR_Assembler::as_Address(LIR_Address* addr, Register tmp, Address::InsnDataType type) { + if (addr->base()->is_illegal()) { + assert(addr->index()->is_illegal(), "must be illegal too"); + __ mov(tmp, addr->disp()); + return Address(tmp); // encoding is ok for any data type + } + + Register base = addr->base()->as_pointer_register(); + + if (addr->index()->is_illegal()) { + return Address(base, addr->disp()).safe_for(type, _masm, tmp); + } else if (addr->index()->is_cpu_register()) { + assert(addr->disp() == 0, "must be"); + Register index = addr->index()->as_pointer_register(); + return Address(base, index, lsl(addr->scale())).safe_for(type, _masm, tmp); + } else if (addr->index()->is_constant()) { + intptr_t addr_offset = (addr->index()->as_constant_ptr()->as_jint() << addr->scale()) + addr->disp(); + return Address(base, addr_offset).safe_for(type, _masm, tmp); + } + + Unimplemented(); + return Address(); +} + +Address LIR_Assembler::as_Address_hi(LIR_Address* addr, Address::InsnDataType type) { + assert(type == Address::IDT_INT, "only to be used for accessing high word of jlong"); + + if (addr->base()->is_illegal()) { + assert(addr->index()->is_illegal(), "must be illegal too"); + __ mov(rscratch1, addr->disp() + wordSize); + return Address(rscratch1); // encoding is ok for IDR_INT + } + + Register base = addr->base()->as_pointer_register(); + + if (addr->index()->is_illegal()) { + return Address(base, addr->disp() + wordSize).safe_for(Address::IDT_INT, _masm, rscratch1); + } else if (addr->index()->is_cpu_register()) { + assert(addr->disp() == 0, "must be"); + Register index = addr->index()->as_pointer_register(); + __ add(rscratch1, base, wordSize); + return Address(rscratch1, index, lsl(addr->scale())); // encoding is ok for IDT_INT + } else if (addr->index()->is_constant()) { + intptr_t addr_offset = (addr->index()->as_constant_ptr()->as_jint() << addr->scale()) + addr->disp() + wordSize; + return Address(base, addr_offset).safe_for(Address::IDT_INT, _masm, rscratch1); + } + + Unimplemented(); + return Address(); +} + +Address LIR_Assembler::as_Address_lo(LIR_Address* addr, Address::InsnDataType type) { + return as_Address(addr, rscratch1, type); +} + + 
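+// The helpers above are used throughout this file roughly as sketched below
+// (illustration only; the concrete call sites follow further down). The idea
+// appears to be that safe_for() rewrites the address through the scratch
+// register whenever the displacement does not fit the encoding for the given
+// data type:
+//
+//   Address addr = as_Address(to_addr, Address::toInsnDataType(type));
+//   __ str(src->as_register(), addr);   // 32-bit slot
+//
+//   Address hi = as_Address_hi(to_addr, Address::IDT_INT);
+//   __ str(rscratch2, hi);              // high word of a jlong, disp + wordSize
+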
+void LIR_Assembler::osr_entry() { + offsets()->set_value(CodeOffsets::OSR_Entry, code_offset()); + BlockBegin* osr_entry = compilation()->hir()->osr_entry(); + ValueStack* entry_state = osr_entry->state(); + int number_of_locks = entry_state->locks_size(); + + // we jump here if osr happens with the interpreter + // state set up to continue at the beginning of the + // loop that triggered osr - in particular, we have + // the following registers setup: + // + // r1: osr buffer + // + + // build frame + ciMethod* m = compilation()->method(); + __ build_frame(initial_frame_size_in_bytes(), bang_size_in_bytes()); + + // OSR buffer is + // + // locals[nlocals-1..0] + // monitors[0..number_of_locks] + // + // locals is a direct copy of the interpreter frame so in the osr buffer + // so first slot in the local array is the last local from the interpreter + // and last slot is local[0] (receiver) from the interpreter + // + // Similarly with locks. The first lock slot in the osr buffer is the nth lock + // from the interpreter frame, the nth lock slot in the osr buffer is 0th lock + // in the interpreter frame (the method lock if a sync method) + + // Initialize monitors in the compiled activation. + // r1: pointer to osr buffer + // + // All other registers are dead at this point and the locals will be + // copied into place by code emitted in the IR. + + Register OSR_buf = osrBufferPointer()->as_pointer_register(); + { assert(frame::interpreter_frame_monitor_size() == BasicObjectLock::size(), "adjust code below"); + int monitor_offset = BytesPerWord * method()->max_locals() + + (2 * BytesPerWord) * (number_of_locks - 1); + // SharedRuntime::OSR_migration_begin() packs BasicObjectLocks in + // the OSR buffer using 2 word entries: first the lock and then + // the oop. + for (int i = 0; i < number_of_locks; i++) { + int slot_offset = monitor_offset - ((i * 2) * BytesPerWord); +#ifdef ASSERT + // verify the interpreter's monitor has a non-null object + { + Label L; + __ ldr(rscratch1, Address(OSR_buf, slot_offset + 1*BytesPerWord)); + __ cbnz(rscratch1, L); + __ stop("locked object is NULL"); + __ bind(L); + } +#endif + __ ldr(rscratch1, Address(OSR_buf, slot_offset + 0)); + __ str(rscratch1, frame_map()->address_for_monitor_lock(i)); + __ ldr(rscratch1, Address(OSR_buf, slot_offset + 1*BytesPerWord)); + __ str(rscratch1, frame_map()->address_for_monitor_object(i)); + } + } +} + + +// inline cache check; done before the frame is built. +int LIR_Assembler::check_icache() { + Register receiver = FrameMap::receiver_opr->as_register(); + Register ic_klass = IC_Klass; + int start_offset = __ offset(); + __ inline_cache_check(receiver, ic_klass); + + // if icache check fails, then jump to runtime routine + // Note: RECEIVER must still contain the receiver! + Label dont; + __ b(dont, Assembler::EQ); + __ far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub())); + + // We align the verified entry point unless the method body + // (including its inline cache check) will fit in a single 64-byte + // icache line. + if (! method()->is_accessor() || __ offset() - start_offset > 4 * 4) { + // force alignment after the cache check. 
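+    // (4 * 4 here is presumably four 4-byte AArch32 instructions, i.e. the
+    // size the inline_cache_check sequence above is expected to occupy.)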
+ __ align(CodeEntryAlignment); + } + + __ bind(dont); + return start_offset; +} + + +void LIR_Assembler::jobject2reg(jobject o, Register reg) { + if (o == NULL) { + __ mov(reg, 0); + } else { + __ movoop(reg, o, /*immediate*/true); + } +} + +void LIR_Assembler::deoptimize_trap(CodeEmitInfo *info) { + __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::deoptimize_id))); + add_call_info_here(info); +} + +void LIR_Assembler::jobject2reg_with_patching(Register reg, CodeEmitInfo *info) { + jobject o = NULL; + PatchingStub* patch = new PatchingStub(_masm, patching_id(info)); + __ movoop(reg, o, true); + patching_epilog(patch, lir_patch_normal, reg, info); +} + +// Return sp decrement needed to build a frame +int LIR_Assembler::initial_frame_size_in_bytes() const { + // We need to subtract two words to take into account saved lr and rfp. + return in_bytes(frame_map()->framesize_in_bytes()) - + FrameMap::frame_pad_in_bytes; +} + +int LIR_Assembler::emit_exception_handler() { + // if the last instruction is a call (typically to do a throw which + // is coming at the end after block reordering) the return address + // must still point into the code area in order to avoid assertion + // failures when searching for the corresponding bci => add a nop + // (was bug 5/14/1999 - gri) + __ nop(); + + // generate code for exception handler + address handler_base = __ start_a_stub(exception_handler_size); + if (handler_base == NULL) { + // not enough space left for the handler + bailout("exception handler overflow"); + return -1; + } + + int offset = code_offset(); + + // the exception oop and pc are in r0, and r3 + // no other registers need to be preserved, so invalidate them + __ invalidate_registers(false, true, false); + + // check that there is really an exception + __ verify_not_null_oop(r0); + + // search an exception handler (r0: exception oop, r3: throwing pc) + __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::handle_exception_from_callee_id))); __ should_not_reach_here(); + guarantee(code_offset() - offset <= exception_handler_size, "overflow"); + __ end_a_stub(); + + return offset; +} + + +// Emit the code to remove the frame from the stack in the exception +// unwind path. +int LIR_Assembler::emit_unwind_handler() { +#ifndef PRODUCT + if (CommentedAssembly) { + _masm->block_comment("Unwind handler"); + } +#endif + + int offset = code_offset(); + + // Fetch the exception from TLS and clear out exception related thread state + __ ldr(r0, Address(rthread, JavaThread::exception_oop_offset())); + __ mov(rscratch1, 0); + __ str(rscratch1, Address(rthread, JavaThread::exception_oop_offset())); + __ str(rscratch1, Address(rthread, JavaThread::exception_pc_offset())); + + __ bind(_unwind_handler_entry); + __ verify_not_null_oop(r0); + + // Preform needed unlocking + MonitorExitStub* stub = NULL; + if (method()->is_synchronized()) { + monitor_address(0, FrameMap::r1_opr); + stub = new MonitorExitStub(FrameMap::r1_opr, true, 0); + __ unlock_object(r5, r4, r1, *stub->entry()); + __ bind(*stub->continuation()); + } + + if (compilation()->env()->dtrace_method_probes()) { + __ call_Unimplemented(); +#if 0 + // FIXME check exception_store is not clobbered below! 
+ __ movptr(Address(rsp, 0), rax); + __ mov_metadata(Address(rsp, sizeof(void*)), method()->constant_encoding()); + __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit))); +#endif + } + + // remove the activation and dispatch to the unwind handler + __ block_comment("remove_frame and dispatch to the unwind handler"); + __ remove_frame(initial_frame_size_in_bytes()); + __ far_jump(RuntimeAddress(Runtime1::entry_for(Runtime1::unwind_exception_id))); + + // Emit the slow path assembly + if (stub != NULL) { + stub->emit_code(this); + } + + return offset; +} + + +int LIR_Assembler::emit_deopt_handler() { + // if the last instruction is a call (typically to do a throw which + // is coming at the end after block reordering) the return address + // must still point into the code area in order to avoid assertion + // failures when searching for the corresponding bci => add a nop + // (was bug 5/14/1999 - gri) + __ nop(); + + // generate code for exception handler + address handler_base = __ start_a_stub(deopt_handler_size); + if (handler_base == NULL) { + // not enough space left for the handler + bailout("deopt handler overflow"); + return -1; + } + + int offset = code_offset(); + + __ adr(lr, pc()); + __ far_jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack())); + guarantee(code_offset() - offset <= deopt_handler_size, "overflow"); + __ end_a_stub(); + + return offset; +} + + +// This is the fast version of java.lang.String.compare; it has not +// OSR-entry and therefore, we generate a slow version for OSR's +void LIR_Assembler::emit_string_compare(LIR_Opr arg0, LIR_Opr arg1, LIR_Opr dst, CodeEmitInfo* info) { + __ mov(r2, (address)__FUNCTION__); + __ call_Unimplemented(); +} + + +void LIR_Assembler::add_debug_info_for_branch(address adr, CodeEmitInfo* info) { + _masm->code_section()->relocate(adr, relocInfo::poll_type); + int pc_offset = code_offset(); + flush_debug_info(pc_offset); + info->record_debug_info(compilation()->debug_info_recorder(), pc_offset); + if (info->exception_handlers() != NULL) { + compilation()->add_exception_handlers_for_pco(pc_offset, info->exception_handlers()); + } +} + +void LIR_Assembler::return_op(LIR_Opr result) { + assert(result->is_illegal() || !result->is_single_cpu() || result->as_register() == r0, "word returns are in r0,"); + // Pop the stack before the safepoint code + __ remove_frame(initial_frame_size_in_bytes()); + address polling_page(os::get_polling_page()); + __ read_polling_page(rscratch1, polling_page, relocInfo::poll_return_type); + __ ret(lr); +} + +int LIR_Assembler::safepoint_poll(LIR_Opr tmp, CodeEmitInfo* info) { + address polling_page(os::get_polling_page()); + guarantee(info != NULL, "Shouldn't be NULL"); + assert(os::is_poll_address(polling_page), "should be"); + __ mov(rscratch1, Address(polling_page, relocInfo::poll_type)); + add_debug_info_for_branch(info); // This isn't just debug info: + // it's the oop map + __ read_polling_page(rscratch1, relocInfo::poll_type); + return __ offset(); +} + +void LIR_Assembler::move_regs(Register from_reg, Register to_reg) { + if (from_reg != to_reg) { + __ mov(to_reg, from_reg); + } +} + +void LIR_Assembler::swap_reg(Register a, Register b) { + Unimplemented(); +} + +void LIR_Assembler::const2reg(LIR_Opr src, LIR_Opr dest, LIR_PatchCode patch_code, CodeEmitInfo* info) { + assert(src->is_constant(), "should not call otherwise"); + assert(dest->is_register(), "should not call otherwise"); + LIR_Const* c = src->as_constant_ptr(); + + switch (c->type()) { + case T_INT: { 
+ assert(patch_code == lir_patch_none, "no patching handled here"); + __ mov(dest->as_register(), c->as_jint()); + break; + } + + case T_ADDRESS: { + assert(patch_code == lir_patch_none, "no patching handled here"); + __ mov(dest->as_register(), c->as_jint()); + break; + } + + case T_LONG: { + assert(patch_code == lir_patch_none, "no patching handled here"); + __ mov(dest->as_register_lo(), c->as_jint_lo()); + __ mov(dest->as_register_hi(), c->as_jint_hi()); + break; + } + + case T_OBJECT: { + if (patch_code == lir_patch_none) { + jobject2reg(c->as_jobject(), dest->as_register()); + } else { + jobject2reg_with_patching(dest->as_register(), info); + } + break; + } + + case T_METADATA: { + if (patch_code != lir_patch_none) { + klass2reg_with_patching(dest->as_register(), info); + } else { + __ mov_metadata(dest->as_register(), c->as_metadata()); + } + break; + } + + case T_FLOAT: { +#ifdef __ARM_PCS_VFP + if (__ operand_valid_for_float_immediate(c->as_jfloat())) { + __ vmov_f32(dest->as_float_reg(), c->as_jfloat()); + } else { + __ lea(rscratch1, InternalAddress(float_constant(c->as_jfloat()))); + __ vldr_f32(dest->as_float_reg(), Address(rscratch1)); + } +#else +#error "unimplemented" +#endif + break; + } + + case T_DOUBLE: { +#ifdef __ARM_PCS_VFP + if (__ operand_valid_for_double_immediate(c->as_jdouble())) { + __ vmov_f64(dest->as_double_reg(), c->as_jdouble()); + } else { + __ lea(rscratch1, InternalAddress(double_constant(c->as_jdouble()))); + __ vldr_f64(dest->as_double_reg(), Address(rscratch1)); + } +#else +#error "unimplemented" +#endif + break; + } + + default: + ShouldNotReachHere(); + } +} + +void LIR_Assembler::const2stack(LIR_Opr src, LIR_Opr dest) { + LIR_Const* c = src->as_constant_ptr(); + switch (c->type()) { + case T_OBJECT: + { + if (! c->as_jobject()) { + __ mov(rscratch1, 0); + __ str(rscratch1, frame_map()->address_for_slot(dest->single_stack_ix())); + } else { + const2reg(src, FrameMap::rscratch1_opr, lir_patch_none, NULL); + reg2stack(FrameMap::rscratch1_opr, dest, c->type(), false); + } + } + break; + case T_ADDRESS: + { + const2reg(src, FrameMap::rscratch1_opr, lir_patch_none, NULL); + reg2stack(FrameMap::rscratch1_opr, dest, c->type(), false); + } + case T_INT: + case T_FLOAT: + { + __ mov(rscratch1, c->as_jint_bits()); + __ str(rscratch1, frame_map()->address_for_slot(dest->single_stack_ix())); + } + break; + case T_LONG: + case T_DOUBLE: + { + __ mov(rscratch1, c->as_jint_lo()); + __ str(rscratch1, frame_map()->address_for_slot(dest->double_stack_ix(), + lo_word_offset_in_bytes)); + if (c->as_jint_lo() != c->as_jint_hi()) + __ mov(rscratch1, c->as_jint_hi()); + __ str(rscratch1, frame_map()->address_for_slot(dest->double_stack_ix(), + hi_word_offset_in_bytes)); + } + break; + default: + ShouldNotReachHere(); + } +} + +/* + * For now this code can load only zero constants as in aarch32. + * It seems like this implementation can break some tests in future. + * TODO: ensure, write test, and rewrite if need. 
+ */ +void LIR_Assembler::const2mem(LIR_Opr src, LIR_Opr dest, BasicType type, CodeEmitInfo* info, bool wide) { + assert(src->is_constant(), "should not call otherwise"); + LIR_Const* c = src->as_constant_ptr(); + LIR_Address* to_addr = dest->as_address_ptr(); + + void (Assembler::* insn)(Register Rt, const Address &adr, Assembler::Condition cnd); + + __ mov(rscratch2, 0); + + int null_check_here = code_offset(); + + Address::InsnDataType idt = Address::toInsnDataType(type); + switch (type) { + case T_ADDRESS: + assert(c->as_jint() == 0, "should be"); + insn = &Assembler::str; + break; + case T_LONG: { + assert(c->as_jlong() == 0, "should be"); + insn = &Assembler::str; + Address addr = as_Address_hi(to_addr, Address::IDT_INT); + null_check_here = code_offset(); + __ str(rscratch2, addr); + idt = Address::IDT_INT; + break; + } + case T_INT: + assert(c->as_jint() == 0, "should be"); + insn = &Assembler::str; + break; + case T_OBJECT: + case T_ARRAY: + assert(c->as_jobject() == 0, "should be"); + insn = &Assembler::str; + break; + case T_CHAR: + case T_SHORT: + assert(c->as_jint() == 0, "should be"); + insn = &Assembler::strh; + break; + case T_BOOLEAN: + case T_BYTE: + assert(c->as_jint() == 0, "should be"); + insn = &Assembler::strb; + break; + default: + ShouldNotReachHere(); + } + + (_masm->*insn)(rscratch2, as_Address(to_addr, idt), Assembler::C_DFLT); + if (info) add_debug_info_for_null_check(null_check_here, info); +} + +void LIR_Assembler::reg2reg(LIR_Opr src, LIR_Opr dest) { + assert(src->is_register(), "should not call otherwise"); + assert(dest->is_register(), "should not call otherwise"); + + // move between cpu-registers + if (dest->is_single_cpu()) { + if (src->type() == T_LONG) { + // Can do LONG -> OBJECT + __ stop("investigate how \"LONG -> OBJECT\" works especially when high part is != 0"); + move_regs(src->as_register_lo(), dest->as_register()); + return; + } + assert(src->is_single_cpu(), "must match"); + if (src->type() == T_OBJECT) { + __ verify_oop(src->as_register()); + } + move_regs(src->as_register(), dest->as_register()); + + } else if (dest->is_double_cpu()) { + if (src->type() == T_OBJECT || src->type() == T_ARRAY) { + // Surprising to me but we can see move of a long to t_object + __ verify_oop(src->as_register()); + move_regs(src->as_register(), dest->as_register_lo()); + __ mov(dest->as_register_hi(), 0); + return; + } + assert(src->is_double_cpu(), "must match"); + Register f_lo = src->as_register_lo(); + Register f_hi = src->as_register_hi(); + Register t_lo = dest->as_register_lo(); + Register t_hi = dest->as_register_hi(); + assert(f_hi != f_lo, "must be different"); + assert(t_hi != t_lo, "must be different"); + check_register_collision(t_lo, &f_hi); + move_regs(f_lo, t_lo); + move_regs(f_hi, t_hi); + } else if (dest->is_single_fpu()) { + __ vmov_f32(dest->as_float_reg(), src->as_float_reg()); + + } else if (dest->is_double_fpu()) { + __ vmov_f64(dest->as_double_reg(), src->as_double_reg()); + + } else { + ShouldNotReachHere(); + } +} + +void LIR_Assembler::reg2stack(LIR_Opr src, LIR_Opr dest, BasicType type, bool pop_fpu_stack) { + if (src->is_single_cpu()) { + if (type == T_ARRAY || type == T_OBJECT) { + __ str(src->as_register(), frame_map()->address_for_slot(dest->single_stack_ix())); + __ verify_oop(src->as_register()); + } else { + __ str(src->as_register(), frame_map()->address_for_slot(dest->single_stack_ix())); + } + + } else if (src->is_double_cpu()) { + Address dest_addr_LO = frame_map()->address_for_slot(dest->double_stack_ix(), 
lo_word_offset_in_bytes); + __ strd(src->as_register_lo(), src->as_register_hi(), dest_addr_LO); + + } else if (src->is_single_fpu()) { + Address dest_addr = frame_map()->address_for_slot(dest->single_stack_ix()); +#ifdef __ARM_PCS_VFP + __ vstr_f32(src->as_float_reg(), dest_addr.safe_for(Address::IDT_FLOAT, _masm, rscratch1)); +#else +#error "unimplemented" +#endif + } else if (src->is_double_fpu()) { + Address dest_addr = frame_map()->address_for_slot(dest->double_stack_ix()); +#ifdef __ARM_PCS_VFP + __ vstr_f64(src->as_double_reg(), dest_addr.safe_for(Address::IDT_DOUBLE, _masm, rscratch1)); +#else +#error "unimplemented" +#endif + } else { + ShouldNotReachHere(); + } + +} + + +void LIR_Assembler::reg2mem(LIR_Opr src, LIR_Opr dest, BasicType type, LIR_PatchCode patch_code, CodeEmitInfo* info, bool pop_fpu_stack, bool wide, bool /* unaligned */) { + LIR_Address* to_addr = dest->as_address_ptr(); + + if (type == T_ARRAY || type == T_OBJECT) { + __ verify_oop(src->as_register()); + } + + PatchingStub* patch = NULL; + if (patch_code != lir_patch_none) { + assert(to_addr->disp() != 0, "must have"); + + patch = new PatchingStub(_masm, PatchingStub::access_field_id); + __ mov(rscratch1, (address) to_addr->disp()); + patching_epilog(patch, patch_code, to_addr->base()->as_register(), info); + + to_addr = new LIR_Address(to_addr->base(), FrameMap::rscratch1_opr, to_addr->type()); + } + + + int null_check_here = code_offset(); + switch (type) { + case T_FLOAT: { +#ifdef __ARM_PCS_VFP + Address addr = as_Address(to_addr, Address::IDT_FLOAT); + null_check_here = code_offset(); + __ vstr_f32(src->as_float_reg(), addr); +#else +#error "unimplemented" +#endif + break; + } + + case T_DOUBLE: { +#ifdef __ARM_PCS_VFP + Address addr = as_Address(to_addr, Address::IDT_DOUBLE); + null_check_here = code_offset(); + __ vstr_f64(src->as_double_reg(), addr); +#else +#error "unimplemented" +#endif + + break; + } + + case T_ARRAY: // fall through + case T_OBJECT: // fall through + case T_ADDRESS: // fall though + case T_INT: { + Address addr = as_Address(to_addr, Address::toInsnDataType(type)); + null_check_here = code_offset(); + __ str(src->as_register(), addr); + break; + } + case T_METADATA: + // We get here to store a method pointer to the stack to pass to + // a dtrace runtime call. This can't work on 64 bit with + // compressed klass ptrs: T_METADATA can be a compressed klass + // ptr or a 64 bit method pointer. 
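+      // (The comment above is apparently carried over from a 64-bit port; on
+      // AArch32 this case is simply never expected to be reached.)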
+ ShouldNotReachHere(); +// __ str(src->as_register(), as_Address(to_addr)); + break; + + case T_LONG: { + Address addr = as_Address_lo(to_addr, Address::IDT_LONG); + null_check_here = code_offset(); + null_check_here += __ strd(src->as_register_lo(), src->as_register_hi(), addr); + break; + } + + case T_BYTE: // fall through + case T_BOOLEAN: { + Address addr = as_Address(to_addr, Address::toInsnDataType(type)); + null_check_here = code_offset(); + __ strb(src->as_register(), addr); + break; + } + case T_CHAR: // fall through + case T_SHORT: { + Address addr = as_Address(to_addr, Address::toInsnDataType(type)); + null_check_here = code_offset(); + __ strh(src->as_register(), addr); + break; + } + default: + ShouldNotReachHere(); + } + + if (info != NULL) { + add_debug_info_for_null_check(null_check_here, info); + } +} + + +void LIR_Assembler::stack2reg(LIR_Opr src, LIR_Opr dest, BasicType type) { + assert(src->is_stack(), "should not call otherwise"); + assert(dest->is_register(), "should not call otherwise"); + + if (dest->is_single_cpu()) { + if (type == T_ARRAY || type == T_OBJECT) { + __ ldr(dest->as_register(), frame_map()->address_for_slot(src->single_stack_ix())); + __ verify_oop(dest->as_register()); + } else { + __ ldr(dest->as_register(), frame_map()->address_for_slot(src->single_stack_ix())); + } + + } else if (dest->is_double_cpu()) { + Address src_addr_LO = frame_map()->address_for_slot(src->double_stack_ix(), lo_word_offset_in_bytes); + __ ldrd(dest->as_register_lo(), dest->as_register_hi(), src_addr_LO); + + } else if (dest->is_single_fpu()) { +#ifdef __ARM_PCS_VFP + Address src_addr = frame_map()->address_for_slot(src->single_stack_ix()); + __ vldr_f32(dest->as_float_reg(), src_addr.safe_for(Address::IDT_FLOAT, _masm, rscratch1)); +#else +#error "unimplemented" +#endif + } else if (dest->is_double_fpu()) { +#ifdef __ARM_PCS_VFP + Address src_addr = frame_map()->address_for_slot(src->double_stack_ix()); + __ vldr_f64(dest->as_double_reg(), src_addr.safe_for(Address::IDT_DOUBLE, _masm, rscratch1)); +#else +#error "unimplemented" +#endif + } else { + ShouldNotReachHere(); + } +} + + +void LIR_Assembler::klass2reg_with_patching(Register reg, CodeEmitInfo* info) { + Metadata* o = NULL; + PatchingStub* patch = new PatchingStub(_masm, PatchingStub::load_klass_id); + __ mov_metadata(reg, o); + patching_epilog(patch, lir_patch_normal, reg, info); +} + +void LIR_Assembler::stack2stack(LIR_Opr src, LIR_Opr dest, BasicType type) { + + LIR_Opr temp; + if (type == T_LONG || type == T_DOUBLE) + temp = FrameMap::rscratch_long_opr; + else + temp = FrameMap::rscratch1_opr; + + stack2reg(src, temp, src->type()); + reg2stack(temp, dest, dest->type(), false); +} + + +void LIR_Assembler::mem2reg(LIR_Opr src, LIR_Opr dest, BasicType type, LIR_PatchCode patch_code, CodeEmitInfo* info, bool wide, bool /* unaligned */) { + LIR_Address* from_addr = src->as_address_ptr(); + + if (from_addr->base()->type() == T_OBJECT) { + __ verify_oop(from_addr->base()->as_pointer_register()); + } + + PatchingStub* patch = NULL; + if (patch_code != lir_patch_none) { + assert(from_addr->disp() != 0, "must have"); + + patch = new PatchingStub(_masm, PatchingStub::access_field_id); + __ mov(rscratch1, (address) from_addr->disp()); + patching_epilog(patch, patch_code, from_addr->base()->as_register(), info); + + from_addr = new LIR_Address(from_addr->base(), FrameMap::rscratch1_opr, from_addr->type()); + } + + int null_check_here = code_offset(); + + switch (type) { + case T_FLOAT: { +#ifdef __ARM_PCS_VFP + Address addr 
= as_Address(from_addr, Address::IDT_FLOAT); + null_check_here = code_offset(); + __ vldr_f32(dest->as_float_reg(), addr); +#else +#error "unimplemented" +#endif + break; + } + + case T_DOUBLE: { +#ifdef __ARM_PCS_VFP + Address addr = as_Address(from_addr, Address::IDT_DOUBLE); + null_check_here = code_offset(); + __ vldr_f64(dest->as_double_reg(), addr); +#else +#error "unimplemented" +#endif + break; + } + + case T_ARRAY: // fall through + case T_OBJECT: // fall through + case T_ADDRESS: // fall through + case T_INT: { + Address addr = as_Address(from_addr, Address::toInsnDataType(type)); + null_check_here = code_offset(); + __ ldr(dest->as_register(), addr); + break; + } + case T_METADATA: + // We get here to store a method pointer to the stack to pass to + // a dtrace runtime call. This can't work on 64 bit with + // compressed klass ptrs: T_METADATA can be a compressed klass + // ptr or a 64 bit method pointer. + ShouldNotReachHere(); +// __ ldr(dest->as_register(), as_Address(from_addr)); + break; + + case T_LONG: { + Address addr = as_Address_lo(from_addr, Address::IDT_LONG); + null_check_here = code_offset(); + null_check_here += __ ldrd(dest->as_register_lo(), dest->as_register_hi(), addr); + break; + } + + case T_BYTE: { + Address addr = as_Address(from_addr, Address::IDT_BYTE); + null_check_here = code_offset(); + __ ldrsb(dest->as_register(), addr); + break; + } + case T_BOOLEAN: { + Address addr = as_Address(from_addr, Address::IDT_BOOLEAN); + null_check_here = code_offset(); + __ ldrb(dest->as_register(), addr); + break; + } + + case T_CHAR: { + Address addr = as_Address(from_addr, Address::IDT_CHAR); + null_check_here = code_offset(); + __ ldrh(dest->as_register(), addr); + break; + } + case T_SHORT: { + Address addr = as_Address(from_addr, Address::IDT_SHORT); + null_check_here = code_offset(); + __ ldrsh(dest->as_register(), addr); + break; + } + + default: + ShouldNotReachHere(); + } + + if (type == T_ARRAY || type == T_OBJECT) { + __ verify_oop(dest->as_register()); + } + + if (info != NULL) { + add_debug_info_for_null_check(null_check_here, info); + } +} + +void LIR_Assembler::prefetchr(LIR_Opr src) { + Unimplemented(); +} + +void LIR_Assembler::prefetchw(LIR_Opr src) { + Unimplemented(); +} + +int LIR_Assembler::array_element_size(BasicType type) const { + int elem_size = type2aelembytes(type); + return exact_log2(elem_size); +} + +void LIR_Assembler::emit_op3(LIR_Op3* op) { + Register Rdividend = op->in_opr1()->as_register(); + Register Rdivisor = op->in_opr2()->as_register(); + Register Rscratch = op->in_opr3()->as_register(); + Register Rresult = op->result_opr()->as_register(); + int divisor = -1; + + /* + TODO: For some reason, using the Rscratch that gets passed in is + not possible because the register allocator does not see the tmp reg + as used, and assignes it the same register as Rdividend. We use rscratch1 + instead. 
+ + assert(Rdividend != Rscratch, ""); + assert(Rdivisor != Rscratch, ""); + */ + + if (Rdivisor == noreg && is_power_of_2(divisor)) { + // convert division by a power of two into some shifts and logical operations + } + + assert(op->code() == lir_irem || op->code() == lir_idiv, "should be irem or idiv"); + bool want_remainder = op->code() == lir_irem; + + __ divide(Rresult, Rdividend, Rdivisor, 32, want_remainder); +} + +void LIR_Assembler::emit_opBranch(LIR_OpBranch* op) { +#ifdef ASSERT + assert(op->block() == NULL || op->block()->label() == op->label(), "wrong label"); + if (op->block() != NULL) _branch_target_blocks.append(op->block()); + if (op->ublock() != NULL) _branch_target_blocks.append(op->ublock()); +#endif + + if (op->cond() == lir_cond_always) { + if (op->info() != NULL) add_debug_info_for_branch(op->info()); + __ b(*(op->label())); + } else { + Assembler::Condition acond; + if (op->code() == lir_cond_float_branch) { + bool is_unordered = (op->ublock() == op->block()); + // Assembler::EQ does not permit unordered branches, so we add + // another branch here. Likewise, Assembler::NE does not permit + // ordered branches. + if (is_unordered && op->cond() == lir_cond_equal + || !is_unordered && op->cond() == lir_cond_notEqual) + __ b(*(op->ublock()->label()), Assembler::VS); + switch(op->cond()) { + case lir_cond_equal: acond = Assembler::EQ; break; + case lir_cond_notEqual: acond = Assembler::NE; break; + case lir_cond_less: acond = (is_unordered ? Assembler::LT : Assembler::LO); break; + case lir_cond_lessEqual: acond = (is_unordered ? Assembler::LE : Assembler::LS); break; + case lir_cond_greaterEqual: acond = (is_unordered ? Assembler::HS : Assembler::GE); break; + case lir_cond_greater: acond = (is_unordered ? Assembler::HI : Assembler::GT); break; + default: ShouldNotReachHere(); + } + } else { + switch (op->cond()) { + case lir_cond_equal: acond = Assembler::EQ; break; + case lir_cond_notEqual: acond = Assembler::NE; break; + case lir_cond_less: acond = Assembler::LT; break; + case lir_cond_greaterEqual: acond = Assembler::GE; break; + case lir_cond_lessEqual: acond = Assembler::LE; break; + case lir_cond_greater: acond = Assembler::GT; break; + case lir_cond_belowEqual: acond = Assembler::LS; break; + case lir_cond_aboveEqual: acond = Assembler::HS; break; + default: ShouldNotReachHere(); + } + if (op->type() == T_LONG) { + // a special trick here to be able to effectively compare jlongs + // for the lessEqual and greater conditions the jlong operands are swapped + // during comparison and hence should use mirror condition in conditional + // instruction + // see LIR_Assembler::comp_op and LIR_Assembler::cmove + switch (op->cond()) { + case lir_cond_lessEqual: acond = Assembler::GE; break; + case lir_cond_greater: acond = Assembler::LT; break; + } + } + } + __ b(*(op->label()), acond); + } +} + +FloatRegister LIR_Assembler::as_float_reg(LIR_Opr doubleReg) { + assert(doubleReg->is_double_fpu(), "must be f64"); + return as_FloatRegister(doubleReg->fpu_regnrLo()); +} + +void LIR_Assembler::emit_opConvert(LIR_OpConvert* op) { + LIR_Opr src = op->in_opr(); + LIR_Opr dest = op->result_opr(); + + switch (op->bytecode()) { + case Bytecodes::_i2f: + { + __ vmov_f32(dest->as_float_reg(), src->as_register()); + __ vcvt_f32_s32(dest->as_float_reg(), dest->as_float_reg()); + break; + } + case Bytecodes::_i2d: + { + __ vmov_f32(as_float_reg(dest), src->as_register()); + __ vcvt_f64_s32(dest->as_double_reg(), as_float_reg(dest)); + break; + } + case Bytecodes::_f2d: + { + __ 
vcvt_f64_f32(dest->as_double_reg(), src->as_float_reg()); + break; + } + case Bytecodes::_d2f: + { + __ vcvt_f32_f64(dest->as_float_reg(), src->as_double_reg()); + break; + } + case Bytecodes::_i2c: + { + __ uxth(dest->as_register(), src->as_register()); + break; + } + case Bytecodes::_i2l: + { + const Register dst_hi = dest->as_register_hi(); + const Register dst_lo = dest->as_register_lo(); + const Register src_lo = as_reg(src); + __ mov(dst_lo, src_lo); + __ asr(dst_hi, src_lo, 31); + break; + } + case Bytecodes::_i2s: + { + __ sxth(dest->as_register(), src->as_register()); + break; + } + case Bytecodes::_i2b: + { + __ sxtb(dest->as_register(), src->as_register()); + break; + } + case Bytecodes::_l2i: + { + assert(dest->is_single_cpu(), "must be single register"); + __ mov(dest->as_register(), src->as_register_lo()); + break; + } + case Bytecodes::_f2i: + { + __ vcvt_s32_f32(src->as_float_reg(), src->as_float_reg()); + __ vmov_f32(dest->as_register(), src->as_float_reg()); + break; + } + case Bytecodes::_d2i: + { + __ vcvt_s32_f64(as_float_reg(src), src->as_double_reg()); + __ vmov_f32(dest->as_register(), as_float_reg(src)); + break; + } + default: ShouldNotReachHere(); + } +} + +void LIR_Assembler::emit_alloc_obj(LIR_OpAllocObj* op) { + if (op->init_check()) { + __ ldrb(rscratch1, Address(op->klass()->as_register(), + InstanceKlass::init_state_offset())); + __ cmp(rscratch1, InstanceKlass::fully_initialized); + add_debug_info_for_null_check_here(op->stub()->info()); + __ b(*op->stub()->entry(), Assembler::NE); + } + __ allocate_object(op->obj()->as_register(), + op->tmp1()->as_register(), + op->tmp2()->as_register(), + op->header_size(), + op->object_size(), + op->klass()->as_register(), + *op->stub()->entry()); + __ bind(*op->stub()->continuation()); +} + +void LIR_Assembler::emit_alloc_array(LIR_OpAllocArray* op) { + Register len = as_reg(op->len()); + + if (UseSlowPath || + (!UseFastNewObjectArray && (op->type() == T_OBJECT || op->type() == T_ARRAY)) || + (!UseFastNewTypeArray && (op->type() != T_OBJECT && op->type() != T_ARRAY))) { + __ b(*op->stub()->entry()); + } else { + Register tmp1 = op->tmp1()->as_register(); + Register tmp2 = op->tmp2()->as_register(); + Register tmp3 = op->tmp3()->as_register(); + if (len == tmp1) { + tmp1 = tmp3; + } else if (len == tmp2) { + tmp2 = tmp3; + } else if (len == tmp3) { + // everything is ok + } else { + __ mov(tmp3, len); + } + __ allocate_array(op->obj()->as_register(), + len, + tmp1, + tmp2, + arrayOopDesc::header_size(op->type()), + array_element_size(op->type()), + op->klass()->as_register(), + *op->stub()->entry()); + } + __ bind(*op->stub()->continuation()); +} + +void LIR_Assembler::type_profile_helper(Register mdo, + ciMethodData *md, ciProfileData *data, + Register recv, Label* update_done) { + for (uint i = 0; i < ReceiverTypeData::row_limit(); i++) { + Label next_test; + // See if the receiver is receiver[n]. 
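+    // Each ReceiverTypeData row is a (receiver klass, counter) pair: this loop
+    // bumps the counter of a matching row, and the loop below claims the first
+    // empty row for a receiver klass that has not been recorded yet.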
+ __ lea(rscratch2, Address(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_offset(i)))); + __ ldr(rscratch1, Address(rscratch2)); + __ cmp(recv, rscratch1); + __ b(next_test, Assembler::NE); + Address data_addr(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_count_offset(i))); + __ addptr(data_addr, DataLayout::counter_increment); + __ b(*update_done); + __ bind(next_test); + } + + // Didn't find receiver; find next empty slot and fill it in + for (uint i = 0; i < ReceiverTypeData::row_limit(); i++) { + Label next_test; + __ lea(rscratch2, + Address(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_offset(i)))); + Address recv_addr(rscratch2); + __ ldr(rscratch1, recv_addr); + __ cbnz(rscratch1, next_test); + __ str(recv, recv_addr); + __ mov(rscratch1, DataLayout::counter_increment); + __ lea(rscratch2, Address(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_count_offset(i)))); + __ str(rscratch1, Address(rscratch2)); + __ b(*update_done); + __ bind(next_test); + } +} + +void LIR_Assembler::emit_typecheck_helper(LIR_OpTypeCheck *op, Label* success, Label* failure, Label* obj_is_null) { + // we always need a stub for the failure case. + CodeStub* stub = op->stub(); + Register obj = op->object()->as_register(); + Register k_RInfo = op->tmp1()->as_register(); + Register klass_RInfo = op->tmp2()->as_register(); + Register dst = op->result_opr()->as_register(); + ciKlass* k = op->klass(); + Register Rtmp1 = noreg; + + // check if it needs to be profiled + ciMethodData* md; + ciProfileData* data; + + if (op->should_profile()) { + ciMethod* method = op->profiled_method(); + assert(method != NULL, "Should have method"); + int bci = op->profiled_bci(); + md = method->method_data_or_null(); + assert(md != NULL, "Sanity"); + data = md->bci_to_data(bci); + assert(data != NULL, "need data for type check"); + assert(data->is_ReceiverTypeData(), "need ReceiverTypeData for type check"); + } + Label profile_cast_success, profile_cast_failure; + Label *success_target = op->should_profile() ? &profile_cast_success : success; + Label *failure_target = op->should_profile() ? 
&profile_cast_failure : failure; + + if (obj == k_RInfo) { + k_RInfo = dst; + } else if (obj == klass_RInfo) { + klass_RInfo = dst; + } + if (k->is_loaded()) { + select_different_registers(obj, dst, k_RInfo, klass_RInfo); + } else { + Rtmp1 = op->tmp3()->as_register(); + select_different_registers(obj, dst, k_RInfo, klass_RInfo, Rtmp1); + } + + assert_different_registers(obj, k_RInfo, klass_RInfo); + + if (op->should_profile()) { + Label not_null; + __ cbnz(obj, not_null); + // Object is null; update MDO and exit + Register mdo = klass_RInfo; + __ mov_metadata(mdo, md->constant_encoding()); + Address data_addr + = __ form_address(rscratch2, mdo, + md->byte_offset_of_slot(data, DataLayout::DataLayout::header_offset()), + LogBytesPerWord); + int header_bits = DataLayout::flag_mask_to_header_mask(BitData::null_seen_byte_constant()); + __ ldr(rscratch1, data_addr); + __ orr(rscratch1, rscratch1, header_bits); + __ str(rscratch1, data_addr); + __ b(*obj_is_null); + __ bind(not_null); + } else { + __ cbz(obj, *obj_is_null); + } + + if (!k->is_loaded()) { + klass2reg_with_patching(k_RInfo, op->info_for_patch()); + } else { + __ mov_metadata(k_RInfo, k->constant_encoding()); + } + __ verify_oop(obj); + + if (op->fast_check()) { + // get object class + // not a safepoint as obj null check happens earlier + __ load_klass(rscratch1, obj); + __ cmp( rscratch1, k_RInfo); + + __ b(*failure_target, Assembler::NE); + // successful cast, fall through to profile or jump + } else { + // get object class + // not a safepoint as obj null check happens earlier + __ load_klass(klass_RInfo, obj); + if (k->is_loaded()) { + // See if we get an immediate positive hit + __ ldr(rscratch1, Address(klass_RInfo, long(k->super_check_offset()))); + __ cmp(k_RInfo, rscratch1); + if ((juint)in_bytes(Klass::secondary_super_cache_offset()) != k->super_check_offset()) { + __ b(*failure_target, Assembler::NE); + // successful cast, fall through to profile or jump + } else { + // See if we get an immediate positive hit + __ b(*success_target, Assembler::EQ); + // check for self + __ cmp(klass_RInfo, k_RInfo); + __ b(*success_target, Assembler::EQ); + + __ push(klass_RInfo); + __ push(k_RInfo); + __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::slow_subtype_check_id))); + __ ldr(klass_RInfo, Address(__ post(sp, 2 * wordSize))); + + // result is a boolean + __ cbz(klass_RInfo, *failure_target); + // successful cast, fall through to profile or jump + } + } else { + // perform the fast part of the checking logic + __ check_klass_subtype_fast_path(klass_RInfo, k_RInfo, Rtmp1, success_target, failure_target, NULL); + // call out-of-line instance of __ check_klass_subtype_slow_path(...): + __ push(klass_RInfo); + __ push(k_RInfo); + __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::slow_subtype_check_id))); + __ ldr(k_RInfo, Address(__ post(sp, 2 * wordSize))); + + // result is a boolean + __ cbz(k_RInfo, *failure_target); + // successful cast, fall through to profile or jump + } + } + if (op->should_profile()) { + Register mdo = klass_RInfo, recv = k_RInfo; + __ bind(profile_cast_success); + __ mov_metadata(mdo, md->constant_encoding()); + __ load_klass(recv, obj); + Label update_done; + type_profile_helper(mdo, md, data, recv, success); + __ b(*success); + + __ bind(profile_cast_failure); + __ mov_metadata(mdo, md->constant_encoding()); + Address counter_addr + = __ form_address(rscratch2, mdo, + md->byte_offset_of_slot(data, CounterData::count_offset()), + LogBytesPerWord); + __ ldr(rscratch1, counter_addr); + __ 
sub(rscratch1, rscratch1, DataLayout::counter_increment); + __ str(rscratch1, counter_addr); + __ b(*failure); + } + __ b(*success); +} + + +void LIR_Assembler::emit_opTypeCheck(LIR_OpTypeCheck* op) { + LIR_Code code = op->code(); + if (code == lir_store_check) { + Register value = op->object()->as_register(); + Register array = op->array()->as_register(); + Register k_RInfo = op->tmp1()->as_register(); + Register klass_RInfo = op->tmp2()->as_register(); + Register Rtmp1 = op->tmp3()->as_register(); + + CodeStub* stub = op->stub(); + + // check if it needs to be profiled + ciMethodData* md; + ciProfileData* data; + + if (op->should_profile()) { + ciMethod* method = op->profiled_method(); + assert(method != NULL, "Should have method"); + int bci = op->profiled_bci(); + md = method->method_data_or_null(); + assert(md != NULL, "Sanity"); + data = md->bci_to_data(bci); + assert(data != NULL, "need data for type check"); + assert(data->is_ReceiverTypeData(), "need ReceiverTypeData for type check"); + } + Label profile_cast_success, profile_cast_failure, done; + Label *success_target = op->should_profile() ? &profile_cast_success : &done; + Label *failure_target = op->should_profile() ? &profile_cast_failure : stub->entry(); + + if (op->should_profile()) { + Label not_null; + __ cbnz(value, not_null); + // Object is null; update MDO and exit + Register mdo = klass_RInfo; + __ mov_metadata(mdo, md->constant_encoding()); + Address data_addr + = __ form_address(rscratch2, mdo, + md->byte_offset_of_slot(data, DataLayout::header_offset()), + LogBytesPerInt); + int header_bits = DataLayout::flag_mask_to_header_mask(BitData::null_seen_byte_constant()); + __ ldr(rscratch1, data_addr); + __ orr(rscratch1, rscratch1, header_bits); + __ str(rscratch1, data_addr); + __ b(done); + __ bind(not_null); + } else { + __ cbz(value, done); + } + + add_debug_info_for_null_check_here(op->info_for_exception()); + __ load_klass(k_RInfo, array); + __ load_klass(klass_RInfo, value); + + // get instance klass (it's already uncompressed) + __ ldr(k_RInfo, Address(k_RInfo, ObjArrayKlass::element_klass_offset())); + // perform the fast part of the checking logic + __ check_klass_subtype_fast_path(klass_RInfo, k_RInfo, Rtmp1, success_target, failure_target, NULL); + // call out-of-line instance of __ check_klass_subtype_slow_path(...): + __ push(klass_RInfo); + __ push(k_RInfo); + __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::slow_subtype_check_id))); + __ ldr(k_RInfo, Address(__ post(sp, 2 * wordSize))); + // result is a boolean + __ cbz(k_RInfo, *failure_target); + // fall through to the success case + + if (op->should_profile()) { + Register mdo = klass_RInfo, recv = k_RInfo; + __ bind(profile_cast_success); + __ mov_metadata(mdo, md->constant_encoding()); + __ load_klass(recv, value); + type_profile_helper(mdo, md, data, recv, &done); + __ b(done); + + __ bind(profile_cast_failure); + __ mov_metadata(mdo, md->constant_encoding()); + Address counter_addr(mdo, md->byte_offset_of_slot(data, CounterData::count_offset())); + __ lea(rscratch2, counter_addr); + __ ldr(rscratch1, Address(rscratch2)); + __ sub(rscratch1, rscratch1, DataLayout::counter_increment); + __ str(rscratch1, Address(rscratch2)); + __ b(*stub->entry()); + } + + __ bind(done); + } else if (code == lir_checkcast) { + Register obj = op->object()->as_register(); + Register dst = op->result_opr()->as_register(); + Label success; + emit_typecheck_helper(op, &success, op->stub()->entry(), &success); + __ bind(success); + if (dst != obj) { + __ mov(dst, 
obj);
+    }
+  } else if (code == lir_instanceof) {
+    Register obj = op->object()->as_register();
+    Register dst = op->result_opr()->as_register();
+    Label success, failure, done;
+    emit_typecheck_helper(op, &success, &failure, &failure);
+    __ bind(failure);
+    __ mov(dst, 0);
+    __ b(done);
+    __ bind(success);
+    __ mov(dst, 1);
+    __ bind(done);
+  } else {
+    ShouldNotReachHere();
+  }
+}
+
+// TODO: reuse masm cmpxchgw
+void LIR_Assembler::casw(Register addr, Register newval, Register cmpval, Register result) {
+  assert(newval != cmpval, "must be different");
+  Label retry_load, nope;
+  // load exclusive from the memory location
+  // and fail if it is not what we expect
+  __ bind(retry_load);
+  __ ldrex(result, addr);
+  __ cmp(result, cmpval);
+  __ mov(result, 1, Assembler::NE);
+  __ b(nope, Assembler::NE);
+  // if the store-exclusive succeeds with no intervening write, result will be zero
+  __ strex(result, newval, addr);
+  // retry if the store-exclusive failed; we only fall through after a successful
+  // store, which ensures we never return a stale value after a failed write.
+  __ cbnz(result, retry_load);
+  __ membar(__ AnyAny);
+  __ bind(nope);
+}
+
+void LIR_Assembler::casl(Register addr, Register newval_lo, Register newval_hi, Register cmpval_lo, Register cmpval_hi, Register tmp_lo, Register tmp_hi, Register result) {
+  assert(newval_lo->successor() == newval_hi, "must be contiguous");
+  assert(tmp_lo->successor() == tmp_hi, "must be contiguous");
+  assert(tmp_lo->encoding_nocheck() % 2 == 0, "Must be an even register");
+  assert_different_registers(newval_lo, newval_hi, cmpval_lo, cmpval_hi, tmp_lo, tmp_hi);
+
+  Label retry_load, nope;
+  // load exclusive from the memory location
+  // and fail if it is not what we expect
+  __ bind(retry_load);
+  __ mov(result, 1);
+  __ ldrexd(tmp_lo, addr);
+  __ cmp(tmp_lo, cmpval_lo);
+  __ b(nope, Assembler::NE);
+  __ cmp(tmp_hi, cmpval_hi);
+  __ b(nope, Assembler::NE);
+  // if the store-exclusive succeeds with no intervening write, result will be zero
+  __ strexd(result, newval_lo, addr);
+  // retry if the store-exclusive failed; we only fall through after a successful
+  // store, which ensures we never return a stale value after a failed write.
+ __ cbnz(result, retry_load); + __ membar(__ AnyAny); + __ bind(nope); +} + + +void LIR_Assembler::emit_compare_and_swap(LIR_OpCompareAndSwap* op) { + Register addr = as_reg(op->addr()); + Register result = as_reg(op->result_opr()); + if (op->code() == lir_cas_obj || op->code() == lir_cas_int) { + Register newval = as_reg(op->new_value()); + Register cmpval = as_reg(op->cmp_value()); + casw(addr, newval, cmpval, result); + } else if (op->code() == lir_cas_long){ + Register newval_lo = op->new_value()->as_register_lo(); + Register newval_hi = op->new_value()->as_register_hi(); + Register cmpval_lo = op->cmp_value()->as_register_lo(); + Register cmpval_hi = op->cmp_value()->as_register_hi(); + Register tmp_lo = op->tmp1()->as_register_lo(); + Register tmp_hi = op->tmp1()->as_register_hi(); + casl(addr, newval_lo, newval_hi, cmpval_lo, cmpval_hi, tmp_lo, tmp_hi, result); + } else { + ShouldNotReachHere(); + } +} + +static void patch_condition(address start_insn, address end_insn, Assembler::Condition cond) { + for (uint32_t* insn_p = (uint32_t*) start_insn; (address) insn_p < end_insn; ++insn_p) { + uint32_t insn = *insn_p; + assert((insn >> 28) == Assembler::AL, "instructions in patch" + " should allow conditional form and be in ALWAYS condition"); + *insn_p = (insn & 0x0fffffff) | (cond << 28); + } +} + +void LIR_Assembler::cmove(LIR_Condition condition, LIR_Opr opr1, LIR_Opr opr2, LIR_Opr result, BasicType type) { + + Assembler::Condition acond, ncond; + switch (condition) { + case lir_cond_equal: acond = Assembler::EQ; ncond = Assembler::NE; break; + case lir_cond_notEqual: acond = Assembler::NE; ncond = Assembler::EQ; break; + case lir_cond_less: acond = Assembler::LT; ncond = Assembler::GE; break; + case lir_cond_greaterEqual: acond = Assembler::GE; ncond = Assembler::LT; break; + case lir_cond_lessEqual: acond = Assembler::LE; ncond = Assembler::GT; break; + case lir_cond_greater: acond = Assembler::GT; ncond = Assembler::LE; break; + case lir_cond_belowEqual: Unimplemented(); break; + case lir_cond_aboveEqual: Unimplemented(); break; + default: ShouldNotReachHere(); + } + if (type == T_LONG) { + // for the lessEqual and greater conditions the jlong operands are swapped + // during comparison and hence should use mirror condition in conditional + // instruction. 
see comp_op()) + switch (condition) { + case lir_cond_lessEqual: acond = Assembler::GE; ncond = Assembler::LT; break; + case lir_cond_greater: acond = Assembler::LT; ncond = Assembler::GE; break; + } + } + + address true_instrs = __ pc(); + if (opr1->is_cpu_register()) { + reg2reg(opr1, result); + } else if (opr1->is_stack()) { + stack2reg(opr1, result, result->type()); + } else if (opr1->is_constant()) { + const2reg(opr1, result, lir_patch_none, NULL); + } else { + ShouldNotReachHere(); + } + patch_condition(true_instrs, __ pc(), acond); + + address false_instrs = __ pc(); + if (opr2->is_cpu_register()) { + reg2reg(opr2, result); + } else if (opr2->is_stack()) { + stack2reg(opr2, result, result->type()); + } else if (opr2->is_constant()) { + const2reg(opr2, result, lir_patch_none, NULL); + } else { + ShouldNotReachHere(); + } + patch_condition(false_instrs, __ pc(), ncond); +} + +void LIR_Assembler::arith_op(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr dest, CodeEmitInfo* info, bool pop_fpu_stack) { + assert(info == NULL, "should never be used, idiv/irem and ldiv/lrem not handled by this method"); + + if (left->is_single_cpu()) { + Register lreg = left->as_register(); + Register dreg = as_reg(dest); + + if (right->is_single_cpu()) { + // cpu register - cpu register + + assert((left->type() == T_INT || left->type() == T_OBJECT) + && right->type() == T_INT + && dest->type() == T_INT, + "should be"); + Register rreg = right->as_register(); + switch (code) { + case lir_add: __ add (dest->as_register(), lreg, rreg); break; + case lir_sub: __ sub (dest->as_register(), lreg, rreg); break; + case lir_mul: __ mul (dest->as_register(), lreg, rreg); break; + default: ShouldNotReachHere(); + } + + } else if (right->is_double_cpu()) { + ShouldNotReachHere(); // for obj+long op the generator casts long to int before invoking add + } else if (right->is_constant()) { + // cpu register - constant + jint c = right->as_constant_ptr()->as_jint(); + + assert(code == lir_add || code == lir_sub, "mismatched arithmetic op"); + if (c == 0 && dreg == lreg) { + COMMENT("effective nop elided"); + return; + } + + if (Assembler::operand_valid_for_add_sub_immediate(c)) { + switch (code) { + case lir_add: __ add(dreg, lreg, c); break; + case lir_sub: __ sub(dreg, lreg, c); break; + default: ShouldNotReachHere(); + } + } else { + __ mov(rscratch1, c); + switch (code) { + case lir_add: __ add(dreg, lreg, rscratch1); break; + case lir_sub: __ sub(dreg, lreg, rscratch1); break; + default: ShouldNotReachHere(); + } + } + } else { + ShouldNotReachHere(); + } + + } else if (left->is_double_cpu()) { + Register lreg_lo = left->as_register_lo(); + Register lreg_hi = left->as_register_hi(); + + if (right->is_double_cpu()) { + // cpu register - cpu register + Register rreg_lo = right->as_register_lo(); + Register rreg_hi = right->as_register_hi(); + Register dreg_lo = dest->as_register_lo(); + Register dreg_hi = dest->as_register_hi(); + if (code == lir_add || code == lir_sub) { + check_register_collision(dreg_lo, &lreg_hi, &rreg_hi); + } + switch (code) { + case lir_add: __ adds (dreg_lo, lreg_lo, rreg_lo); + __ adc (dreg_hi, lreg_hi, rreg_hi); break; + case lir_sub: __ subs (dreg_lo, lreg_lo, rreg_lo); + __ sbc (dreg_hi, lreg_hi, rreg_hi); break; + case lir_mul: __ mult_long (dreg_lo, dreg_hi, + lreg_lo, lreg_hi, rreg_lo, rreg_hi); break; + default: + ShouldNotReachHere(); + } + + } else if (right->is_constant()) { + const jint c_lo = right->as_constant_ptr()->as_jint_lo_bits(); + const jint c_hi = 
right->as_constant_ptr()->as_jint_hi_bits(); + const Register dreg_lo = dest->as_register_lo(); + const Register dreg_hi = dest->as_register_hi(); + assert(code == lir_add || code == lir_sub, "mismatched arithmetic op"); + if (c_lo == 0 && c_hi == 0 && dreg_lo == lreg_lo && dreg_hi == lreg_hi) { + COMMENT("effective nop elided"); + return; + } + check_register_collision(dreg_lo, &lreg_hi, NULL, rscratch2); + switch (code) { + case lir_add: + if (Assembler::operand_valid_for_add_sub_immediate(c_lo)) + __ adds(dreg_lo, lreg_lo, c_lo); + else { + __ mov(rscratch1, c_lo); + __ adds(dreg_lo, lreg_lo, rscratch1); + } + if (Assembler::operand_valid_for_add_sub_immediate(c_hi)) + __ adc(dreg_hi, lreg_hi, c_hi); + else { + __ mov(rscratch1, c_hi); + __ adc(dreg_lo, lreg_hi, rscratch1); + } + break; + case lir_sub: + if (Assembler::operand_valid_for_add_sub_immediate(c_lo)) + __ subs(dreg_lo, lreg_lo, c_lo); + else { + __ mov(rscratch1, c_lo); + __ subs(dreg_lo, lreg_lo, rscratch1); + } + if (Assembler::operand_valid_for_add_sub_immediate(c_hi)) + __ sbc(dreg_hi, lreg_hi, c_hi); + else { + __ mov(rscratch1, c_hi); + __ sbc(dreg_hi, lreg_hi, rscratch1); + } + break; + default: + ShouldNotReachHere(); + } + } else { + ShouldNotReachHere(); + } + } else if (left->is_single_fpu()) { + assert(right->is_single_fpu(), "right hand side of float arithmetics needs to be float register"); + switch (code) { + case lir_add: __ vadd_f32 (dest->as_float_reg(), left->as_float_reg(), right->as_float_reg()); break; + case lir_sub: __ vsub_f32 (dest->as_float_reg(), left->as_float_reg(), right->as_float_reg()); break; + case lir_mul: __ vmul_f32 (dest->as_float_reg(), left->as_float_reg(), right->as_float_reg()); break; + case lir_div: __ vdiv_f32 (dest->as_float_reg(), left->as_float_reg(), right->as_float_reg()); break; + default: + ShouldNotReachHere(); + } + } else if (left->is_double_fpu()) { + if (right->is_double_fpu()) { + // cpu register - cpu register + switch (code) { + case lir_add: __ vadd_f64 (dest->as_double_reg(), left->as_double_reg(), right->as_double_reg()); break; + case lir_sub: __ vsub_f64 (dest->as_double_reg(), left->as_double_reg(), right->as_double_reg()); break; + case lir_mul: __ vmul_f64 (dest->as_double_reg(), left->as_double_reg(), right->as_double_reg()); break; + case lir_div: __ vdiv_f64 (dest->as_double_reg(), left->as_double_reg(), right->as_double_reg()); break; + default: + ShouldNotReachHere(); + } + } else { + if (right->is_constant()) { + ShouldNotReachHere(); + } + ShouldNotReachHere(); + } + } else if (left->is_single_stack() || left->is_address()) { + assert(left == dest, "left and dest must be equal"); + ShouldNotReachHere(); + } else { + ShouldNotReachHere(); + } +} + +void LIR_Assembler::intrinsic_op(LIR_Code code, LIR_Opr value, LIR_Opr unused, LIR_Opr dest, LIR_Op* op) { + switch(code) { + case lir_abs : __ vabs_f64(dest->as_double_reg(), value->as_double_reg()); break; + case lir_sqrt: __ vsqrt_f64(dest->as_double_reg(), value->as_double_reg()); break; + default : ShouldNotReachHere(); + } +} + +void LIR_Assembler::logic_op(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr dst) { + + assert(left->is_single_cpu() || left->is_double_cpu(), "expect single or double register"); + Register Rleft = left->is_single_cpu() ? 
left->as_register() : + left->as_register_lo(); + if (dst->is_single_cpu()) { + Register Rdst = dst->as_register(); + if (right->is_constant()) { + switch (code) { + case lir_logic_and: __ andr (Rdst, Rleft, right->as_jint()); break; + case lir_logic_or: __ orr (Rdst, Rleft, right->as_jint()); break; + case lir_logic_xor: __ eor (Rdst, Rleft, right->as_jint()); break; + default: ShouldNotReachHere(); break; + } + } else { + Register Rright = right->is_single_cpu() ? right->as_register() : + right->as_register_lo(); + switch (code) { + case lir_logic_and: __ andr (Rdst, Rleft, Rright); break; + case lir_logic_or: __ orr (Rdst, Rleft, Rright); break; + case lir_logic_xor: __ eor (Rdst, Rleft, Rright); break; + default: ShouldNotReachHere(); break; + } + } + } else { + assert(dst->is_double_cpu(), "mismatched logic op operand size"); + const Register Rdst_lo = dst->as_register_lo(); + const Register Rdst_hi = dst->as_register_hi(); + Register Rleft_hi = left->as_register_hi(); + if (right->is_constant()) { + // LIR generator enforces jlong constants to be valid_immediate12 + // so we know they fit into 32-bit int + switch (code) { + case lir_logic_and: __ andr (Rdst_lo, Rleft, (int)right->as_jlong()); break; + case lir_logic_or: __ orr (Rdst_lo, Rleft, (int)right->as_jlong()); break; + case lir_logic_xor: __ eor (Rdst_lo, Rleft, (int)right->as_jlong()); break; + default: ShouldNotReachHere(); break; + } + } else { + assert(right->is_double_cpu(), "mismatched logic op operand size"); + Register Rright_lo = right->as_register_lo(); + Register Rright_hi = right->as_register_hi(); + check_register_collision(Rdst_lo, &Rleft_hi, &Rright_hi); + switch (code) { + case lir_logic_and: __ andr (Rdst_lo, Rleft, Rright_lo); + __ andr (Rdst_hi, Rleft_hi, Rright_hi); break; + case lir_logic_or: __ orr (Rdst_lo, Rleft, Rright_lo); + __ orr (Rdst_hi, Rleft_hi, Rright_hi); break; + case lir_logic_xor: __ eor (Rdst_lo, Rleft, Rright_lo); + __ eor (Rdst_hi, Rleft_hi, Rright_hi); break; + default: ShouldNotReachHere(); break; + } + } + } +} + + + +void LIR_Assembler::arithmetic_idiv(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr temp, LIR_Opr result, CodeEmitInfo* info) { Unimplemented(); } + +void LIR_Assembler::comp_op(LIR_Condition condition, LIR_Opr opr1, LIR_Opr opr2, LIR_Op2* op) { + if (opr1->is_single_cpu()) { + Register reg1 = as_reg(opr1); + if (opr2->is_single_cpu()) { + // cpu register - cpu register + Register reg2 = opr2->as_register(); + __ cmp(reg1, reg2); + } else if (opr2->is_constant()) { + LIR_Const* c = opr2->as_constant_ptr(); + if (c->type() == T_INT) { + __ cmp(reg1, c->as_jint(), rscratch1, Assembler::C_DFLT); + } else if (c->type() == T_OBJECT || c->type() == T_ARRAY) { + jobject o = c->as_jobject(); + if (o == NULL) { + __ cmp(reg1, (int32_t)NULL_WORD); + } else { + __ movoop(rscratch1, o); + __ cmpptr(reg1, rscratch1); + } + } else { + fatal(err_msg("unexpected type: %s", basictype_to_str(c->type()))); + } + } else if (opr2->is_address()) { + __ ldr(rscratch2, as_Address(opr2->as_address_ptr(), rscratch1, Address::IDT_INT)); + __ cmp(reg1, rscratch2); + } else { + ShouldNotReachHere(); + } + + } else if (opr1->is_double_cpu()) { + assert(opr1->type() == T_LONG, "expect jlong type"); + assert(opr2->type() == T_LONG, "expect jlong type"); + Register xlo = opr1->as_register_lo(); + Register xhi = opr1->as_register_hi(); + if (opr2->is_double_cpu()) { + // cpu register - cpu register + Register ylo = opr2->as_register_lo(); + Register yhi = opr2->as_register_hi(); + switch (condition) 
{ + case lir_cond_equal: + case lir_cond_notEqual: + case lir_cond_belowEqual: + case lir_cond_aboveEqual: + // these need APSR.ZC. the ops below set them correctly (but not APSR.V) + __ cmp(xhi, yhi); + __ cmp(xlo, ylo, Assembler::EQ); + break; + case lir_cond_less: + case lir_cond_greaterEqual: + __ cmp(xlo, ylo); + __ sbcs(rscratch1, xhi, yhi); + break; + case lir_cond_lessEqual: + case lir_cond_greater: + // here goes a trick: the below operations do not produce the valid + // value for the APSR.Z flag and there is no easy way to set it. so + // we exchange the order of arguments in the comparison and use the + // opposite condition in the conditional statement that follows. + // GE should be used instead of LE and LT in place of GT. + // the comp_op() could only be followed by: emit_opBranch(), cmove() and + // emit_assert(). these are patched to be aware of this trick + __ cmp(ylo, xlo); + __ sbcs(rscratch1, yhi, xhi); + break; + } + } else if (opr2->is_constant()) { + jlong y = opr2->as_jlong(); + assert(Assembler::operand_valid_for_add_sub_immediate(y), "immediate overflow"); + switch (condition) { + case lir_cond_equal: + case lir_cond_notEqual: + case lir_cond_belowEqual: + case lir_cond_aboveEqual: + __ cmp(xhi, (int)(y >> 32)); + __ cmp(xlo, (int)y, Assembler::EQ); + break; + case lir_cond_less: + case lir_cond_greaterEqual: + __ cmp(xlo, (int)y); + __ sbcs(rscratch1, xhi, (int)(y >> 32)); + break; + case lir_cond_lessEqual: + case lir_cond_greater: + __ rsbs(rscratch1, xlo, (int)y); + __ rscs(rscratch1, xhi, (int)(y >> 32)); + break; + } + } else { + ShouldNotReachHere(); + } + } else if (opr1->is_single_fpu()) { + FloatRegister reg1 = opr1->as_float_reg(); + assert(opr2->is_single_fpu(), "expect single float register"); + FloatRegister reg2 = opr2->as_float_reg(); + __ vcmp_f32(reg1, reg2); + __ get_fpsr(); + } else if (opr1->is_double_fpu()) { + FloatRegister reg1 = opr1->as_double_reg(); + assert(opr2->is_double_fpu(), "expect double float register"); + FloatRegister reg2 = opr2->as_double_reg(); + __ vcmp_f64(reg1, reg2); + __ get_fpsr(); + } else { + ShouldNotReachHere(); + } +} + +void LIR_Assembler::comp_fl2i(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr dst, LIR_Op2* op){ + if (code == lir_cmp_fd2i || code == lir_ucmp_fd2i) { + bool is_unordered_less = (code == lir_ucmp_fd2i); + if (left->is_single_fpu()) { + __ float_cmp(true, is_unordered_less ? -1 : 1, left->as_float_reg(), right->as_float_reg(), dst->as_register()); + } else if (left->is_double_fpu()) { + __ float_cmp(false, is_unordered_less ? 
-1 : 1, left->as_double_reg(), right->as_double_reg(), dst->as_register());
+    } else {
+      ShouldNotReachHere();
+    }
+  } else if (code == lir_cmp_l2i) {
+    // set dst to -1, 0 or 1 according to the signed 64-bit comparison of left and right;
+    // LT is evaluated on the subs/sbcs flags before orrs clobbers them, and the orrs
+    // result is zero iff both halves of the difference are zero (i.e. the operands are equal)
+    __ mov(dst->as_register(), 1);
+    __ subs(rscratch1, left->as_register_lo(), right->as_register_lo());
+    __ sbcs(rscratch2, left->as_register_hi(), right->as_register_hi());
+    __ mov(dst->as_register(), -1, Assembler::LT);
+    __ orrs(rscratch1, rscratch1, rscratch2);
+    __ mov(dst->as_register(), 0, Assembler::EQ);
+  } else {
+    ShouldNotReachHere();
+  }
+}
+
+
+void LIR_Assembler::align_call(LIR_Code code) { }
+
+
+void LIR_Assembler::call(LIR_OpJavaCall* op, relocInfo::relocType rtype) {
+  __ trampoline_call(Address(op->addr(), rtype));
+  add_call_info(code_offset(), op->info());
+}
+
+
+void LIR_Assembler::ic_call(LIR_OpJavaCall* op) {
+  __ ic_call(op->addr());
+  add_call_info(code_offset(), op->info());
+}
+
+
+/* Currently, vtable-dispatch is only enabled for sparc platforms */
+void LIR_Assembler::vtable_call(LIR_OpJavaCall* op) {
+  ShouldNotReachHere();
+}
+
+
+void LIR_Assembler::emit_static_call_stub() {
+  address call_pc = __ pc();
+  address stub = __ start_a_stub(call_stub_size);
+  if (stub == NULL) {
+    bailout("static call stub overflow");
+    return;
+  }
+
+  int start = __ offset();
+
+  __ relocate(static_stub_Relocation::spec(call_pc));
+  __ mov_metadata(rmethod, (Metadata*)NULL);
+  __ movptr(rscratch1, 0);
+  __ b(rscratch1);
+
+  assert(__ offset() - start <= call_stub_size, "stub too big");
+  __ end_a_stub();
+}
+
+
+void LIR_Assembler::throw_op(LIR_Opr exceptionPC, LIR_Opr exceptionOop, CodeEmitInfo* info) {
+  assert(exceptionOop->as_register() == r0, "must match");
+  assert(exceptionPC->as_register() == r3, "must match");
+
+  // exception object is not added to oop map by LinearScan
+  // (LinearScan assumes that no oops are in fixed registers)
+  info->add_register_oop(exceptionOop);
+  Runtime1::StubID unwind_id;
+
+  // get current pc information
+  // pc is only needed if the method has an exception handler; the unwind code does not need it.
+  int pc_for_athrow_offset = __ offset();
+  __ add(exceptionPC->as_register(), r15_pc, -8);
+  add_call_info(pc_for_athrow_offset, info); // for exception handler
+
+  __ verify_not_null_oop(r0);
+  // search for an exception handler (r0: exception oop, r3: throwing pc)
+  if (compilation()->has_fpu_code()) {
+    unwind_id = Runtime1::handle_exception_id;
+  } else {
+    unwind_id = Runtime1::handle_exception_nofpu_id;
+  }
+  __ far_call(RuntimeAddress(Runtime1::entry_for(unwind_id)));
+
+  // FIXME: enough room for two byte trap ????
+  __ nop();
+}
+
+
+void LIR_Assembler::unwind_op(LIR_Opr exceptionOop) {
+  assert(exceptionOop->as_register() == r0, "must match");
+
+  __ b(_unwind_handler_entry);
+}
+
+
+void LIR_Assembler::shift_op(LIR_Code code, LIR_Opr left, LIR_Opr count, LIR_Opr dest, LIR_Opr tmp) {
+  Register lreg = left->is_single_cpu() ? left->as_register() : left->as_register_lo();
+  Register dreg = dest->is_single_cpu() ?
dest->as_register() : dest->as_register_lo(); + + switch (left->type()) { + case T_INT: + case T_ADDRESS: + case T_OBJECT: + __ andr(rscratch1, count->as_register(), 0x1f); + switch (code) { + case lir_shl: __ lsl(dreg, lreg, rscratch1); break; + case lir_shr: __ asr(dreg, lreg, rscratch1); break; + case lir_ushr: __ lsr(dreg, lreg, rscratch1); break; + default: + ShouldNotReachHere(); + break; + } + break; + case T_LONG: + { + Register lreg_hi = left->as_register_hi(); + Register dreg_hi = dest->as_register_hi(); + const int word_bits = 8 * wordSize; + + if (code == lir_shl || code == lir_ushr) { + check_register_collision(dreg, &lreg, &lreg_hi, rscratch1); + check_register_collision(dreg_hi, &lreg, &lreg_hi, rscratch2); + } + + switch (code) { + case lir_shl: + __ andr(dreg, count->as_register(), 0x3f); + __ sub(dreg_hi, dreg, word_bits); + __ lsl(lreg_hi, lreg_hi, dreg); + __ orr(lreg_hi, lreg_hi, lreg, lsl(dreg_hi)); + __ rsb(dreg_hi, dreg, word_bits); + __ orr(dreg_hi, lreg_hi, lreg, lsr(dreg_hi)); + __ lsl(dreg, lreg, dreg); + break; + case lir_shr: { + __ mov(rscratch2, lreg_hi); + __ andr(rscratch1, count->as_register(), 0x3f); + __ lsr(dreg, lreg, rscratch1); + __ rsb(dreg_hi, rscratch1, word_bits); + __ orr(dreg, dreg, rscratch2, lsl(dreg_hi)); + __ asr(dreg_hi, rscratch2, rscratch1); + __ subs(rscratch1, rscratch1, word_bits); + __ mov(dreg, rscratch2, asr(rscratch1), Assembler::GT); + } + break; + case lir_ushr: + __ andr(dreg, count->as_register(), 0x3f); + __ lsr(lreg, lreg, dreg); + __ rsb(dreg_hi, dreg, word_bits); + __ orr(lreg, lreg, lreg_hi, lsl(dreg_hi)); + __ lsr(dreg_hi, lreg_hi, dreg); + __ sub(dreg, dreg, word_bits); + __ orr(dreg, lreg, lreg_hi, lsr(dreg)); + break; + default: + ShouldNotReachHere(); + break; + } + } + break; + default: + ShouldNotReachHere(); + break; + } +} + + +void LIR_Assembler::shift_op(LIR_Code code, LIR_Opr left, jint count, LIR_Opr dest) { + Register dreg = dest->is_single_cpu() ? dest->as_register() : dest->as_register_lo(); + Register lreg = left->is_single_cpu() ? 
left->as_register() : left->as_register_lo(); + + if (!count) { + reg2reg(left, dest); + return; + } + + switch (left->type()) { + case T_INT: + case T_ADDRESS: + case T_OBJECT: + switch (code) { + case lir_shl: __ lsl(dreg, lreg, count); break; + case lir_shr: __ asr(dreg, lreg, count); break; + case lir_ushr: __ lsr(dreg, lreg, count); break; + default: + ShouldNotReachHere(); + break; + } + break; + case T_LONG: { + Register lreg_hi = left->as_register_hi(); + Register dreg_hi = dest->as_register_hi(); + const int word_bits = 8 * wordSize; + + switch (code) { + case lir_shl: + if (count >= word_bits) { + __ lsl(dreg_hi, lreg, count - word_bits); + __ mov(dreg, 0); + } else { + check_register_collision(dreg_hi, &lreg); + __ lsl(dreg_hi, lreg_hi, count); + __ orr(dreg_hi, dreg_hi, lreg, lsr(word_bits - count)); + __ lsl(dreg, lreg, count); + } + break; + case lir_shr: + if (count >= word_bits) { + __ asr(dreg, lreg_hi, count - word_bits); + __ asr(dreg_hi, lreg_hi, word_bits); + } else { + check_register_collision(dreg, &lreg_hi); + __ lsr(dreg, lreg, count); + __ orr(dreg, dreg, lreg_hi, lsl(word_bits - count)); + __ asr(dreg_hi, lreg_hi, count); + } + break; + case lir_ushr: + if (count >= word_bits) { + __ lsr(dreg, lreg_hi, count - word_bits); + __ mov(dreg_hi, 0); + } else { + check_register_collision(dreg, &lreg_hi); + __ lsr(dreg, lreg, count); + __ orr(dreg, dreg, lreg_hi, lsl(word_bits - count)); + __ lsr(dreg_hi, lreg_hi, count); + } + break; + default: + ShouldNotReachHere(); + break; + } + } + break; + default: + ShouldNotReachHere(); + break; + } +} + + +void LIR_Assembler::store_parameter(Register r, int offset_from_sp_in_words) { + assert(offset_from_sp_in_words >= 0, "invalid offset from sp"); + int offset_from_sp_in_bytes = offset_from_sp_in_words * BytesPerWord; + assert(offset_from_sp_in_bytes < frame_map()->reserved_argument_area_size(), "invalid offset"); + __ str (r, Address(sp, offset_from_sp_in_bytes)); +} + + +void LIR_Assembler::store_parameter(jint c, int offset_from_sp_in_words) { + assert(offset_from_sp_in_words >= 0, "invalid offset from sp"); + int offset_from_sp_in_bytes = offset_from_sp_in_words * BytesPerWord; + assert(offset_from_sp_in_bytes < frame_map()->reserved_argument_area_size(), "invalid offset"); + __ mov (rscratch1, c); + __ str (rscratch1, Address(sp, offset_from_sp_in_bytes)); +} + +// This code replaces a call to arraycopy; no exception may +// be thrown in this code, they must be thrown in the System.arraycopy +// activation frame; we could save some checks if this would not be the case +void LIR_Assembler::emit_arraycopy(LIR_OpArrayCopy* op) { + ciArrayKlass* default_type = op->expected_type(); + Register src = op->src()->as_register(); + Register dst = op->dst()->as_register(); + Register src_pos = op->src_pos()->as_register(); + Register dst_pos = op->dst_pos()->as_register(); + Register length = op->length()->as_register(); + Register tmp = op->tmp()->as_register(); + // due to limited number of registers available and in order to simplify + // the code we fix the registers used by the arguments to this intrinsic. 
+ // see the comment in LIRGenerator::do_ArrayCopy + assert(src == j_rarg0, "assumed by implementation"); + assert(src_pos == j_rarg1, "assumed by implementation"); + assert(dst == j_rarg2, "assumed by implementation"); + assert(dst_pos == j_rarg3, "assumed by implementation"); + assert(length == r4, "assumed by implementation"); + assert(tmp == r5, "assumed by implementation"); + + CodeStub* stub = op->stub(); + int flags = op->flags(); + BasicType basic_type = default_type != NULL ? default_type->element_type()->basic_type() : T_ILLEGAL; + if (basic_type == T_ARRAY) basic_type = T_OBJECT; + + // if we don't know anything, just go through the generic arraycopy + if (default_type == NULL // || basic_type == T_OBJECT + ) { + Label done; + assert(src == r1 && src_pos == r2, "mismatch in calling convention"); + + // Save the arguments in case the generic arraycopy fails and we + // have to fall back to the JNI stub + __ str(dst, Address(sp, 0*BytesPerWord)); + __ str(dst_pos, Address(sp, 1*BytesPerWord)); + __ str(length, Address(sp, 2*BytesPerWord)); + __ str(src_pos, Address(sp, 3*BytesPerWord)); + __ str(src, Address(sp, 4*BytesPerWord)); + + address C_entry = CAST_FROM_FN_PTR(address, Runtime1::arraycopy); + address copyfunc_addr = StubRoutines::generic_arraycopy(); + + // The arguments are in java calling convention so we shift them + // to C convention + assert(c_rarg0 == j_rarg3, "assumed in the code below"); + __ mov(rscratch1, c_rarg0); + assert_different_registers(c_rarg0, j_rarg1, j_rarg2); + __ mov(c_rarg0, j_rarg0); + assert_different_registers(c_rarg1, j_rarg2, j_rarg3); + __ mov(c_rarg1, j_rarg1); + assert_different_registers(c_rarg2, j_rarg3); + __ mov(c_rarg2, j_rarg2); + __ mov(c_rarg3, rscratch1); + __ str(length, Address(sp)); // the below C function follows C calling convention, + // so should put 5th arg to stack + + if (copyfunc_addr == NULL) { // Use C version if stub was not generated + __ mov(rscratch1, RuntimeAddress(C_entry)); + __ bl(rscratch1); + } else { +#ifndef PRODUCT + if (PrintC1Statistics) { + __ increment(ExternalAddress((address)&Runtime1::_generic_arraycopystub_cnt)); + } +#endif + __ far_call(RuntimeAddress(copyfunc_addr)); + } + + __ cbz(r0, *stub->continuation()); + + // Reload values from the stack so they are where the stub + // expects them. 
+ __ ldr(dst, Address(sp, 0*BytesPerWord)); + __ ldr(dst_pos, Address(sp, 1*BytesPerWord)); + __ ldr(length, Address(sp, 2*BytesPerWord)); + __ ldr(src_pos, Address(sp, 3*BytesPerWord)); + __ ldr(src, Address(sp, 4*BytesPerWord)); + + if (copyfunc_addr != NULL) { + // r0 is -1^K where K == partial copied count + __ inv(rscratch1, r0); + // adjust length down and src/end pos up by partial copied count + __ sub(length, length, rscratch1); + __ add(src_pos, src_pos, rscratch1); + __ add(dst_pos, dst_pos, rscratch1); + } + __ b(*stub->entry()); + + __ bind(*stub->continuation()); + return; + } + + assert(default_type != NULL && default_type->is_array_klass() && default_type->is_loaded(), "must be true at this point"); + + int elem_size = type2aelembytes(basic_type); + int scale = exact_log2(elem_size); + + Address src_length_addr = Address(src, arrayOopDesc::length_offset_in_bytes()); + Address dst_length_addr = Address(dst, arrayOopDesc::length_offset_in_bytes()); + Address src_klass_addr = Address(src, oopDesc::klass_offset_in_bytes()); + Address dst_klass_addr = Address(dst, oopDesc::klass_offset_in_bytes()); + + // test for NULL + if (flags & LIR_OpArrayCopy::src_null_check) { + __ cbz(src, *stub->entry()); + } + if (flags & LIR_OpArrayCopy::dst_null_check) { + __ cbz(dst, *stub->entry()); + } + + // check if negative + if (flags & LIR_OpArrayCopy::src_pos_positive_check) { + __ cmp(src_pos, 0); + __ b(*stub->entry(), Assembler::LT); + } + if (flags & LIR_OpArrayCopy::dst_pos_positive_check) { + __ cmp(dst_pos, 0); + __ b(*stub->entry(), Assembler::LT); + } + + if (flags & LIR_OpArrayCopy::length_positive_check) { + __ cmp(length, 0); + __ b(*stub->entry(), Assembler::LT); + } + + if (flags & LIR_OpArrayCopy::src_range_check) { + __ add(tmp, src_pos, length); + __ ldr(rscratch1, src_length_addr); + __ cmp(tmp, rscratch1); + __ b(*stub->entry(), Assembler::HI); + } + if (flags & LIR_OpArrayCopy::dst_range_check) { + __ add(tmp, dst_pos, length); + __ ldr(rscratch1, dst_length_addr); + __ cmp(tmp, rscratch1); + __ b(*stub->entry(), Assembler::HI); + } + + // FIXME: The logic in LIRGenerator::arraycopy_helper clears + // length_positive_check if the source of our length operand is an + // arraylength. However, that arraylength might be zero, and the + // stub that we're about to call contains an assertion that count != + // 0 . So we make this check purely in order not to trigger an + // assertion failure. + __ cbz(length, *stub->continuation()); + + if (flags & LIR_OpArrayCopy::type_check) { + // We don't know the array types are compatible + if (basic_type != T_OBJECT) { + // Simple test for basic type arrays + __ ldr(tmp, src_klass_addr); + __ ldr(rscratch1, dst_klass_addr); + __ cmp(tmp, rscratch1); + __ b(*stub->entry(), Assembler::NE); + } else { + // For object arrays, if src is a sub class of dst then we can + // safely do the copy. + Label cont, slow; + + __ push(RegSet::of(src, dst), sp); + + __ load_klass(src, src); + __ load_klass(dst, dst); + + __ check_klass_subtype_fast_path(src, dst, tmp, &cont, &slow, NULL); + + __ push(src); // sub + __ push(dst); // super + __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::slow_subtype_check_id))); + // result on TOS + __ pop(src); // result + __ pop(dst); + + __ cbnz(src, cont); + + __ bind(slow); + __ pop(RegSet::of(src, dst), sp); + + address copyfunc_addr = StubRoutines::checkcast_arraycopy(); + if (copyfunc_addr != NULL) { // use stub if available + // src is not a sub class of dst so we have to do a + // per-element check. 
+ + int mask = LIR_OpArrayCopy::src_objarray|LIR_OpArrayCopy::dst_objarray; + if ((flags & mask) != mask) { + // Check that at least both of them object arrays. + assert(flags & mask, "one of the two should be known to be an object array"); + + if (!(flags & LIR_OpArrayCopy::src_objarray)) { + __ load_klass(tmp, src); + } else if (!(flags & LIR_OpArrayCopy::dst_objarray)) { + __ load_klass(tmp, dst); + } + int lh_offset = in_bytes(Klass::layout_helper_offset()); + Address klass_lh_addr(tmp, lh_offset); + jint objArray_lh = Klass::array_layout_helper(T_OBJECT); + __ ldr(rscratch1, klass_lh_addr); + __ mov(rscratch2, objArray_lh); + __ eor(rscratch1, rscratch1, rscratch2); + __ cbnz(rscratch1, *stub->entry()); + } + + // Spill because stubs can use any register they like and it's + // easier to restore just those that we care about. + __ str(dst, Address(sp, 0*BytesPerWord)); + __ str(dst_pos, Address(sp, 1*BytesPerWord)); + __ str(length, Address(sp, 2*BytesPerWord)); + __ str(src_pos, Address(sp, 3*BytesPerWord)); + __ str(src, Address(sp, 4*BytesPerWord)); + + assert(dst_pos == r0, "assumed in the code below"); + __ mov(rscratch1, dst_pos); // save dst_pos which is r0 + __ lea(c_rarg0, Address(src, src_pos, lsl(scale))); + __ add(c_rarg0, c_rarg0, arrayOopDesc::base_offset_in_bytes(basic_type)); + assert_different_registers(c_rarg0, dst, length); + __ lea(c_rarg1, Address(dst, rscratch1, lsl(scale))); + __ add(c_rarg1, c_rarg1, arrayOopDesc::base_offset_in_bytes(basic_type)); + assert_different_registers(c_rarg1, dst, length); + + __ load_klass(c_rarg2, dst); + __ ldr(c_rarg2, Address(c_rarg2, ObjArrayKlass::element_klass_offset())); + __ ldr(c_rarg3, Address(c_rarg2, Klass::super_check_offset_offset())); + __ far_call(RuntimeAddress(copyfunc_addr)); + +#ifndef PRODUCT + if (PrintC1Statistics) { + Label failed; + __ cbnz(r0, failed); + __ increment(ExternalAddress((address)&Runtime1::_arraycopy_checkcast_cnt)); + __ bind(failed); + } +#endif + + __ cbz(r0, *stub->continuation()); + +#ifndef PRODUCT + if (PrintC1Statistics) { + __ increment(ExternalAddress((address)&Runtime1::_arraycopy_checkcast_attempt_cnt)); + } +#endif + assert_different_registers(dst, dst_pos, length, src_pos, src, rscratch1); + __ mov(rscratch1, r0); + + // Restore previously spilled arguments + __ ldr(dst, Address(sp, 0*BytesPerWord)); + __ ldr(dst_pos, Address(sp, 1*BytesPerWord)); + __ ldr(length, Address(sp, 2*BytesPerWord)); + __ ldr(src_pos, Address(sp, 3*BytesPerWord)); + __ ldr(src, Address(sp, 4*BytesPerWord)); + + // return value is -1^K where K is partial copied count + __ mvn(rscratch1, rscratch1); + // adjust length down and src/end pos up by partial copied count + __ sub(length, length, rscratch1); + __ add(src_pos, src_pos, rscratch1); + __ add(dst_pos, dst_pos, rscratch1); + } + + __ b(*stub->entry()); + + __ bind(cont); + __ pop(RegSet::of(src, dst), sp); + } + } + +#ifdef ASSERT + if (basic_type != T_OBJECT || !(flags & LIR_OpArrayCopy::type_check)) { + // Sanity check the known type with the incoming class. For the + // primitive case the types must match exactly with src.klass and + // dst.klass each exactly matching the default type. For the + // object array case, if no type check is needed then either the + // dst type is exactly the expected type and the src type is a + // subtype which we can't check or src is the same array as dst + // but not necessarily exactly of type default_type. 
+ Label known_ok, halt; + __ mov_metadata(tmp, default_type->constant_encoding()); + + if (basic_type != T_OBJECT) { + + __ ldr(rscratch1, dst_klass_addr); + __ cmp(tmp, rscratch1); + __ b(halt, Assembler::NE); + __ ldr(rscratch1, src_klass_addr); + __ cmp(tmp, rscratch1); + __ b(known_ok, Assembler::EQ); + } else { + __ ldr(rscratch1, dst_klass_addr); + __ cmp(tmp, rscratch1); + __ b(known_ok, Assembler::EQ); + __ cmp(src, dst); + __ b(known_ok, Assembler::EQ); + } + __ bind(halt); + __ stop("incorrect type information in arraycopy"); + __ bind(known_ok); + } +#endif + + assert(dst_pos == r0, "assumed in the code below"); + __ mov(rscratch1, dst_pos); // save r0 + __ lea(c_rarg0, Address(src, src_pos, lsl(scale))); + __ add(c_rarg0, c_rarg0, arrayOopDesc::base_offset_in_bytes(basic_type)); + assert_different_registers(c_rarg0, dst, rscratch1, length); + __ lea(c_rarg1, Address(dst, rscratch1, lsl(scale))); + __ add(c_rarg1, c_rarg1, arrayOopDesc::base_offset_in_bytes(basic_type)); + assert_different_registers(c_rarg1, dst, length); + __ mov(c_rarg2, length); + + bool disjoint = (flags & LIR_OpArrayCopy::overlapping) == 0; + bool aligned = (flags & LIR_OpArrayCopy::unaligned) == 0; + const char *name; + address entry = StubRoutines::select_arraycopy_function(basic_type, aligned, disjoint, name, false); + + CodeBlob *cb = CodeCache::find_blob(entry); + if (cb) { + __ far_call(RuntimeAddress(entry)); + } else { + __ call_VM_leaf(entry, 3); + } + + __ bind(*stub->continuation()); +} + +void LIR_Assembler::emit_lock(LIR_OpLock* op) { + Register obj = op->obj_opr()->as_register(); // may not be an oop + Register hdr = op->hdr_opr()->as_register(); + Register lock = op->lock_opr()->as_register(); + if (!UseFastLocking) { + __ b(*op->stub()->entry()); + } else if (op->code() == lir_lock) { + Register scratch = noreg; + if (UseBiasedLocking) { + scratch = op->scratch_opr()->as_register(); + } + assert(BasicLock::displaced_header_offset_in_bytes() == 0, "lock_reg must point to the displaced header"); + // add debug info for NullPointerException only if one is possible + int null_check_offset = __ lock_object(hdr, obj, lock, scratch, *op->stub()->entry()); + if (op->info() != NULL) { + add_debug_info_for_null_check(null_check_offset, op->info()); + } + // done + } else if (op->code() == lir_unlock) { + assert(BasicLock::displaced_header_offset_in_bytes() == 0, "lock_reg must point to the displaced header"); + __ unlock_object(hdr, obj, lock, *op->stub()->entry()); + } else { + Unimplemented(); + } + __ bind(*op->stub()->continuation()); +} + + +void LIR_Assembler::emit_profile_call(LIR_OpProfileCall* op) { + ciMethod* method = op->profiled_method(); + int bci = op->profiled_bci(); + ciMethod* callee = op->profiled_callee(); + + // Update counter for all call types + ciMethodData* md = method->method_data_or_null(); + assert(md != NULL, "Sanity"); + ciProfileData* data = md->bci_to_data(bci); + assert(data->is_CounterData(), "need CounterData for calls"); + assert(op->mdo()->is_single_cpu(), "mdo must be allocated"); + Register mdo = op->mdo()->as_register(); + __ mov_metadata(mdo, md->constant_encoding()); + Address counter_addr(mdo, md->byte_offset_of_slot(data, CounterData::count_offset())); + Bytecodes::Code bc = method->java_code_at_bci(bci); + const bool callee_is_static = callee->is_loaded() && callee->is_static(); + // Perform additional virtual call profiling for invokevirtual and + // invokeinterface bytecodes + if ((bc == Bytecodes::_invokevirtual || bc == Bytecodes::_invokeinterface) && + 
!callee_is_static && // required for optimized MH invokes + C1ProfileVirtualCalls) { + assert(op->recv()->is_single_cpu(), "recv must be allocated"); + Register recv = op->recv()->as_register(); + assert_different_registers(mdo, recv); + assert(data->is_VirtualCallData(), "need VirtualCallData for virtual calls"); + ciKlass* known_klass = op->known_holder(); + if (C1OptimizeVirtualCallProfiling && known_klass != NULL) { + // We know the type that will be seen at this call site; we can + // statically update the MethodData* rather than needing to do + // dynamic tests on the receiver type + + // NOTE: we should probably put a lock around this search to + // avoid collisions by concurrent compilations + ciVirtualCallData* vc_data = (ciVirtualCallData*) data; + uint i; + for (i = 0; i < VirtualCallData::row_limit(); i++) { + ciKlass* receiver = vc_data->receiver(i); + if (known_klass->equals(receiver)) { + Address data_addr(mdo, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i))); + __ addptr(data_addr, DataLayout::counter_increment); + return; + } + } + + // Receiver type not found in profile data; select an empty slot + + // Note that this is less efficient than it should be because it + // always does a write to the receiver part of the + // VirtualCallData rather than just the first time + for (i = 0; i < VirtualCallData::row_limit(); i++) { + ciKlass* receiver = vc_data->receiver(i); + if (receiver == NULL) { + Address recv_addr(mdo, md->byte_offset_of_slot(data, VirtualCallData::receiver_offset(i))); + __ mov_metadata(rscratch1, known_klass->constant_encoding()); + __ lea(rscratch2, recv_addr); + __ str(rscratch1, Address(rscratch2)); + Address data_addr(mdo, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i))); + __ addptr(data_addr, DataLayout::counter_increment); + return; + } + } + } else { + __ load_klass(recv, recv); + Label update_done; + type_profile_helper(mdo, md, data, recv, &update_done); + // Receiver did not match any saved receiver and there is no empty row for it. + // Increment total counter to indicate polymorphic case. 
+ __ addptr(counter_addr, DataLayout::counter_increment); + + __ bind(update_done); + } + } else { + // Static call + __ addptr(counter_addr, DataLayout::counter_increment); + } +} + + +void LIR_Assembler::emit_delay(LIR_OpDelay*) { + Unimplemented(); +} + + +void LIR_Assembler::monitor_address(int monitor_no, LIR_Opr dst) { + __ lea(dst->as_register(), frame_map()->address_for_monitor_lock(monitor_no)); +} + +void LIR_Assembler::emit_updatecrc32(LIR_OpUpdateCRC32* op) { + assert(op->crc()->is_single_cpu(), "crc must be register"); + assert(op->val()->is_single_cpu(), "byte value must be register"); + assert(op->result_opr()->is_single_cpu(), "result must be register"); + Register crc = op->crc()->as_register(); + Register val = op->val()->as_register(); + Register res = op->result_opr()->as_register(); + + assert_different_registers(val, crc, res); + __ lea(res, ExternalAddress(StubRoutines::crc_table_addr())); + + __ inv(crc, crc); + __ update_byte_crc32(crc, val, res); + __ inv(res, crc); +} + +void LIR_Assembler::emit_profile_type(LIR_OpProfileType* op) { + COMMENT("emit_profile_type {"); + Register obj = op->obj()->as_register(); + Register tmp = op->tmp()->as_pointer_register(); + Address mdo_addr = as_Address(op->mdp()->as_address_ptr(), noreg, Address::IDT_INT); + ciKlass* exact_klass = op->exact_klass(); + intptr_t current_klass = op->current_klass(); + bool not_null = op->not_null(); + bool no_conflict = op->no_conflict(); + + Label update, next, none; + + bool do_null = !not_null; + bool exact_klass_set = exact_klass != NULL && ciTypeEntries::valid_ciklass(current_klass) == exact_klass; + bool do_update = !TypeEntries::is_type_unknown(current_klass) && !exact_klass_set; + + assert(do_null || do_update, "why are we here?"); + assert(!TypeEntries::was_null_seen(current_klass) || do_update, "why are we here?"); + assert(mdo_addr.base() != rscratch1, "wrong register"); + + __ verify_oop(obj); + + if (tmp != obj) { + __ mov(tmp, obj); + } + if (do_null) { + __ cbnz(tmp, update); + if (!TypeEntries::was_null_seen(current_klass)) { + __ ldr(rscratch2, mdo_addr); + __ orr(rscratch2, rscratch2, TypeEntries::null_seen); + __ str(rscratch2, mdo_addr); + } + if (do_update) { +#ifndef ASSERT + __ b(next); + } +#else + __ b(next); + } + } else { + __ cbnz(tmp, update); + __ stop("unexpected null obj"); +#endif + } + + __ bind(update); + + if (do_update) { +#ifdef ASSERT + if (exact_klass != NULL) { + Label ok; + __ load_klass(tmp, tmp); + __ mov_metadata(rscratch1, exact_klass->constant_encoding()); + __ eor(rscratch1, tmp, rscratch1); + __ cbz(rscratch1, ok); + __ stop("exact klass and actual klass differ"); + __ bind(ok); + } +#endif + if (!no_conflict) { + if (exact_klass == NULL || TypeEntries::is_type_none(current_klass)) { + if (exact_klass != NULL) { + __ mov_metadata(tmp, exact_klass->constant_encoding()); + } else { + __ load_klass(tmp, tmp); + } + + __ ldr(rscratch2, mdo_addr); + __ eor(tmp, tmp, rscratch2); + __ andr(rscratch1, tmp, TypeEntries::type_klass_mask); + // klass seen before, nothing to do. The unknown bit may have been + // set already but no need to check. + __ cbz(rscratch1, next); + + __ andr(rscratch1, tmp, TypeEntries::type_unknown); + __ cbnz(rscratch1, next); // already unknown. Nothing to do anymore. 
+ + if (TypeEntries::is_type_none(current_klass)) { + __ cbz(rscratch2, none); + __ cmp(rscratch2, TypeEntries::null_seen); + __ b(none, Assembler::EQ); + // There is a chance that the checks above (re-reading profiling + // data from memory) fail if another thread has just set the + // profiling to this obj's klass + __ dmb(Assembler::ISH); + __ ldr(rscratch2, mdo_addr); + __ eor(tmp, tmp, rscratch2); + __ andr(rscratch1, tmp, TypeEntries::type_klass_mask); + __ cbz(rscratch1, next); + } + } else { + assert(ciTypeEntries::valid_ciklass(current_klass) != NULL && + ciTypeEntries::valid_ciklass(current_klass) != exact_klass, "conflict only"); + + __ ldr(tmp, mdo_addr); + __ andr(rscratch1, tmp, TypeEntries::type_unknown); + __ cbnz(rscratch1, next); // already unknown. Nothing to do anymore. + } + + // different than before. Cannot keep accurate profile. + __ ldr(rscratch2, mdo_addr); + __ orr(rscratch2, rscratch2, TypeEntries::type_unknown); + __ str(rscratch2, mdo_addr); + + if (TypeEntries::is_type_none(current_klass)) { + __ b(next); + + __ bind(none); + // first time here. Set profile type. + __ str(tmp, mdo_addr); + } + } else { + // There's a single possible klass at this profile point + assert(exact_klass != NULL, "should be"); + if (TypeEntries::is_type_none(current_klass)) { + __ mov_metadata(tmp, exact_klass->constant_encoding()); + __ ldr(rscratch2, mdo_addr); + __ eor(tmp, tmp, rscratch2); + __ andr(rscratch1, tmp, TypeEntries::type_klass_mask); + __ cbz(rscratch1, next); +#ifdef ASSERT + { + Label ok; + __ ldr(rscratch1, mdo_addr); + __ cbz(rscratch1, ok); + __ cmp(rscratch1, TypeEntries::null_seen); + __ b(ok, Assembler::EQ); + // may have been set by another thread + __ dmb(Assembler::ISH); + __ mov_metadata(rscratch1, exact_klass->constant_encoding()); + __ ldr(rscratch2, mdo_addr); + __ eor(rscratch2, rscratch1, rscratch2); + __ andr(rscratch2, rscratch2, TypeEntries::type_mask); + __ cbz(rscratch2, ok); + + __ stop("unexpected profiling mismatch"); + __ bind(ok); + } +#endif + // first time here. Set profile type. + __ ldr(tmp, mdo_addr); + } else { + assert(ciTypeEntries::valid_ciklass(current_klass) != NULL && + ciTypeEntries::valid_ciklass(current_klass) != exact_klass, "inconsistent"); + + __ ldr(tmp, mdo_addr); + __ andr(rscratch1, tmp, TypeEntries::type_unknown); + __ cbnz(rscratch1, next); // already unknown. Nothing to do anymore. + + __ orr(tmp, tmp, TypeEntries::type_unknown); + __ str(tmp, mdo_addr); + // FIXME: Write barrier needed here? 
+ } + } + + __ bind(next); + } + COMMENT("} emit_profile_type"); +} + + +void LIR_Assembler::align_backward_branch_target() { +} + + +void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest) { + if (left->is_single_cpu()) { + assert(dest->is_single_cpu(), "expect single result reg"); + __ neg(dest->as_register(), left->as_register()); + } else if (left->is_double_cpu()) { + assert(dest->is_double_cpu(), "expect double result reg"); + const Register l_lo = left->as_register_lo(); + Register l_hi = left->as_register_hi(); + check_register_collision(dest->as_register_lo(), &l_hi); + __ rsbs(dest->as_register_lo(), l_lo, 0); + __ rsc(dest->as_register_hi(), l_hi, 0); + } else if (left->is_single_fpu()) { + assert(dest->is_single_fpu(), "expect single float result reg"); + __ vneg_f32(dest->as_float_reg(), left->as_float_reg()); + } else { + assert(left->is_double_fpu(), "expect double float operand reg"); + assert(dest->is_double_fpu(), "expect double float result reg"); + __ vneg_f64(dest->as_double_reg(), left->as_double_reg()); + } +} + + +void LIR_Assembler::leal(LIR_Opr addr, LIR_Opr dest) { + __ lea(dest->as_register(), as_Address(addr->as_address_ptr(), noreg, Address::IDT_LEA)); +} + + +void LIR_Assembler::rt_call(LIR_Opr result, address dest, const LIR_OprList* args, LIR_Opr tmp, CodeEmitInfo* info) { + assert(!tmp->is_valid(), "don't need temporary"); + CodeBlob *cb = CodeCache::find_blob(dest); + if (cb) { + __ far_call(RuntimeAddress(dest)); + } else { + __ lea(rscratch1, RuntimeAddress(dest)); + __ bl(rscratch1); + } + if (info != NULL) { + add_call_info_here(info); + } + __ maybe_isb(); +} + +void LIR_Assembler::volatile_move_op(LIR_Opr src, LIR_Opr dest, BasicType type, CodeEmitInfo* info) { + if (type == T_LONG || type == T_DOUBLE) { + const LIR_Opr long_val = FrameMap::long0_opr; + + int null_check_offset = -1; + + if (src->is_register() && dest->is_address()) { + // long1 reserved as temp by LinearScan::pd_add_temps + const LIR_Opr long_tmp = FrameMap::long1_opr; + __ lea(rscratch1, as_Address_lo(dest->as_address_ptr(), Address::IDT_LEA)); + + if (type == T_DOUBLE) { + // long0 reserved as temp by LinearScan::pd_add_temps + __ vmov_f64(long_val->as_register_lo(), long_val->as_register_hi(), src->as_double_reg()); + } else { + assert(type == T_LONG && src->is_same_register(long_val), "T_LONG src should be in long0 (by LIRGenerator)"); + } + + null_check_offset = __ offset(); + __ atomic_strd(long_val->as_register_lo(), long_val->as_register_hi(), rscratch1, + long_tmp->as_register_lo(), long_tmp->as_register_hi()); + + } else if (src->is_address() && dest->is_register()) { + __ lea(rscratch1, as_Address_lo(src->as_address_ptr(), Address::IDT_LEA)); + + null_check_offset = __ offset(); + __ atomic_ldrd(long_val->as_register_lo(), long_val->as_register_hi(), rscratch1); + + if (type == T_DOUBLE) { + __ vmov_f64(dest->as_double_reg(), long_val->as_register_lo(), long_val->as_register_hi()); + } else { + assert(type != T_LONG || dest->is_same_register(long_val), "T_LONG dest should be in long0 (by LIRGenerator)"); + } + } else { + Unimplemented(); + } + + if (info != NULL) { + add_debug_info_for_null_check(null_check_offset, info); + } + + } else { + move_op(src, dest, type, lir_patch_none, info, + /*pop_fpu_stack*/false, /*unaligned*/false, /*wide*/false); + } +} + +#ifdef ASSERT +// emit run-time assertion +void LIR_Assembler::emit_assert(LIR_OpAssert* op) { + assert(op->code() == lir_assert, "must be"); + + if (op->in_opr1()->is_valid()) { + assert(op->in_opr2()->is_valid(), 
"both operands must be valid"); + comp_op(op->condition(), op->in_opr1(), op->in_opr2(), op); + } else { + assert(op->in_opr2()->is_illegal(), "both operands must be illegal"); + assert(op->condition() == lir_cond_always, "no other conditions allowed"); + } + + Label ok; + if (op->condition() != lir_cond_always) { + Assembler::Condition acond = Assembler::AL; + switch (op->condition()) { + case lir_cond_equal: acond = Assembler::EQ; break; + case lir_cond_notEqual: acond = Assembler::NE; break; + case lir_cond_less: acond = Assembler::LT; break; + case lir_cond_greaterEqual: acond = Assembler::GE; break; + case lir_cond_lessEqual: acond = Assembler::LE; break; + case lir_cond_greater: acond = Assembler::GT; break; + case lir_cond_belowEqual: acond = Assembler::LS; break; + case lir_cond_aboveEqual: acond = Assembler::HS; break; + default: ShouldNotReachHere(); + } + if (op->in_opr1()->type() == T_LONG) { + // a special trick here to be able to effectively compare jlongs + // for the lessEqual and greater conditions the jlong operands are swapped + // during comparison and hence should use mirror condition in conditional + // instruction + // see LIR_Assembler::comp_op and LIR_Assembler::cmove + switch (op->condition()) { + case lir_cond_lessEqual: acond = Assembler::GE; break; + case lir_cond_greater: acond = Assembler::LT; break; + } + } + __ b(ok, acond); + } + if (op->halt()) { + const char* str = __ code_string(op->msg()); + __ stop(str); + } else { + breakpoint(); + } + __ bind(ok); +} +#endif + +#ifndef PRODUCT +#define COMMENT(x) do { __ block_comment(x); } while (0) +#else +#define COMMENT(x) +#endif + +void LIR_Assembler::membar() { + COMMENT("membar"); + __ membar(MacroAssembler::AnyAny); +} + +void LIR_Assembler::membar_acquire() { + __ membar(Assembler::LoadLoad|Assembler::LoadStore); +} + +void LIR_Assembler::membar_release() { + __ membar(Assembler::LoadStore|Assembler::StoreStore); +} + +void LIR_Assembler::membar_loadload() { + __ membar(Assembler::LoadLoad); +} + +void LIR_Assembler::membar_storestore() { + __ membar(MacroAssembler::StoreStore); +} + +void LIR_Assembler::membar_loadstore() { __ membar(MacroAssembler::LoadStore); } + +void LIR_Assembler::membar_storeload() { __ membar(MacroAssembler::StoreLoad); } + +void LIR_Assembler::get_thread(LIR_Opr result_reg) { + __ mov(result_reg->as_register(), rthread); +} + + +void LIR_Assembler::peephole(LIR_List *lir) { +#if 0 + if (tableswitch_count >= max_tableswitches) + return; + + /* + This finite-state automaton recognizes sequences of compare-and- + branch instructions. We will turn them into a tableswitch. You + could argue that C1 really shouldn't be doing this sort of + optimization, but without it the code is really horrible. 
+ */ + + enum { start_s, cmp1_s, beq_s, cmp_s } state; + int first_key, last_key = -2147483648; + int next_key = 0; + int start_insn = -1; + int last_insn = -1; + Register reg = noreg; + LIR_Opr reg_opr; + state = start_s; + + LIR_OpList* inst = lir->instructions_list(); + for (int i = 0; i < inst->length(); i++) { + LIR_Op* op = inst->at(i); + switch (state) { + case start_s: + first_key = -1; + start_insn = i; + switch (op->code()) { + case lir_cmp: + LIR_Opr opr1 = op->as_Op2()->in_opr1(); + LIR_Opr opr2 = op->as_Op2()->in_opr2(); + if (opr1->is_cpu_register() && opr1->is_single_cpu() + && opr2->is_constant() + && opr2->type() == T_INT) { + reg_opr = opr1; + reg = opr1->as_register(); + first_key = opr2->as_constant_ptr()->as_jint(); + next_key = first_key + 1; + state = cmp_s; + goto next_state; + } + break; + } + break; + case cmp_s: + switch (op->code()) { + case lir_branch: + if (op->as_OpBranch()->cond() == lir_cond_equal) { + state = beq_s; + last_insn = i; + goto next_state; + } + } + state = start_s; + break; + case beq_s: + switch (op->code()) { + case lir_cmp: { + LIR_Opr opr1 = op->as_Op2()->in_opr1(); + LIR_Opr opr2 = op->as_Op2()->in_opr2(); + if (opr1->is_cpu_register() && opr1->is_single_cpu() + && opr1->as_register() == reg + && opr2->is_constant() + && opr2->type() == T_INT + && opr2->as_constant_ptr()->as_jint() == next_key) { + last_key = next_key; + next_key++; + state = cmp_s; + goto next_state; + } + } + } + last_key = next_key; + state = start_s; + break; + default: + assert(false, "impossible state"); + } + if (state == start_s) { + if (first_key < last_key - 5L && reg != noreg) { + { + // printf("found run register %d starting at insn %d low value %d high value %d\n", + // reg->encoding(), + // start_insn, first_key, last_key); + // for (int i = 0; i < inst->length(); i++) { + // inst->at(i)->print(); + // tty->print("\n"); + // } + // tty->print("\n"); + } + + struct tableswitch *sw = &switches[tableswitch_count]; + sw->_insn_index = start_insn, sw->_first_key = first_key, + sw->_last_key = last_key, sw->_reg = reg; + inst->insert_before(last_insn + 1, new LIR_OpLabel(&sw->_after)); + { + // Insert the new table of branches + int offset = last_insn; + for (int n = first_key; n < last_key; n++) { + inst->insert_before + (last_insn + 1, + new LIR_OpBranch(lir_cond_always, T_ILLEGAL, + inst->at(offset)->as_OpBranch()->label())); + offset -= 2, i++; + } + } + // Delete all the old compare-and-branch instructions + for (int n = first_key; n < last_key; n++) { + inst->remove_at(start_insn); + inst->remove_at(start_insn); + } + // Insert the tableswitch instruction + inst->insert_before(start_insn, + new LIR_Op2(lir_cmp, lir_cond_always, + LIR_OprFact::intConst(tableswitch_count), + reg_opr)); + inst->insert_before(start_insn + 1, new LIR_OpLabel(&sw->_branches)); + tableswitch_count++; + } + reg = noreg; + last_key = -2147483648; + } + next_state: + ; + } +#endif +} + +void LIR_Assembler::atomic_op(LIR_Code code, LIR_Opr src, LIR_Opr data, LIR_Opr dest, LIR_Opr tmp_op) { + BasicType type = src->type(); + Address addr = as_Address(src->as_address_ptr(), Address::toInsnDataType(type)); + + bool is_long = false; + + switch(type) { + case T_INT: + case T_OBJECT: + case T_ARRAY: + break; + case T_LONG: + is_long = true; + break; + default: + ShouldNotReachHere(); + } + + switch (code) { + case lir_xadd: + { + Register tmp = tmp_op->as_register(); + Register dst = as_reg(dest); + Label again; + __ lea(tmp, addr); + __ bind(again); + if(is_long) { + 
assert(dest->as_register_lo()->successor() == dest->as_register_hi(), "must be contiguous"); + assert((dest->as_register_lo()->encoding() & 1) == 0, "must be even"); + _masm->ldrexd(dst, tmp); + } else { + _masm->ldrex(dst, tmp); + } + arith_op(lir_add, dest, data, dest, NULL, false); + if (is_long) { + _masm->strexd(rscratch1, dst, tmp); + } else { + _masm->strex(rscratch1, dst, tmp); + } + __ cbnz(rscratch1, again); + arith_op(lir_sub, dest, data, dest, NULL, false); + break; + } + case lir_xchg: + { + Register tmp = tmp_op->as_register(); + Register obj = as_reg(data); + Register dst = as_reg(dest); + assert_different_registers(obj, addr.base(), tmp, rscratch1, dst); + Label again; + __ lea(tmp, addr); + __ bind(again); + if(is_long) { + assert(dest->as_register_lo()->successor() == dest->as_register_hi(), "must be contiguous"); + assert((dest->as_register_lo()->encoding() & 1) == 0, "must be even"); + + assert(data->is_double_cpu(), "should be double register"); + assert(data->as_register_lo()->successor() == data->as_register_hi(), "must be contiguous"); + assert((data->as_register_lo()->encoding() & 1) == 0, "must be even"); + + _masm->ldrexd(dst, tmp); + _masm->strexd(rscratch1, obj, tmp); + } else { + _masm->ldrex(dst, tmp); + _masm->strex(rscratch1, obj, tmp); + } + __ cbnz(rscratch1, again); + } + break; + default: + ShouldNotReachHere(); + } + __ membar(__ AnyAny); +} + +void LIR_Assembler::check_register_collision(Register d, Register *s1, Register *s2, Register tmp) { + // use a temp if any of the registers used as a source of operation + // collide with result register of the prerequisite operation + if (d == *s1) { + __ mov(tmp, d); + *s1 = tmp; + } else if (s2 && d == *s2) { + __ mov(tmp, d); + *s2 = tmp; + } +} + +#undef __ --- /dev/null 2016-08-26 13:07:53.000000000 +0300 +++ new/src/cpu/aarch32/vm/c1_LIRAssembler_aarch32.hpp 2016-08-26 13:07:53.000000000 +0300 @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ +// This file is a derivative work resulting from (and including) modifications +// made by Azul Systems, Inc. The dates of such changes are 2013-2016. +// Copyright 2013-2016 Azul Systems, Inc. All Rights Reserved. +// +// Please contact Azul Systems, 385 Moffett Park Drive, Suite 115, Sunnyvale, +// CA 94089 USA or visit www.azul.com if you need additional information or +// have any questions. 
+
+#ifndef CPU_AARCH32_VM_C1_LIRASSEMBLER_AARCH32_HPP
+#define CPU_AARCH32_VM_C1_LIRASSEMBLER_AARCH32_HPP
+
+#include "assembler_aarch32.hpp"
+
+
+ private:
+
+  int array_element_size(BasicType type) const;
+
+  // Helper functions which check for overflow and set bailout if it
+  // occurs. They always return a valid embeddable pointer, but in the
+  // bailout case the pointer won't be to unique storage.
+  address float_constant(float f);
+  address double_constant(double d);
+
+  address int_constant(jlong n);
+
+  Address as_Address(LIR_Address* addr, Register tmp, Address::InsnDataType type);
+  Address as_Address_hi(LIR_Address* addr, Address::InsnDataType type);
+  Address as_Address_lo(LIR_Address* addr, Address::InsnDataType type);
+
+  Address as_Address(LIR_Address* addr, Address::InsnDataType type) {
+    return as_Address(addr, rscratch1, type);
+  }
+
+
+  // Record the type of the receiver in ReceiverTypeData
+  void type_profile_helper(Register mdo,
+                           ciMethodData *md, ciProfileData *data,
+                           Register recv, Label* update_done);
+  void add_debug_info_for_branch(address adr, CodeEmitInfo* info);
+
+  void casw(Register addr, Register newval, Register cmpval, Register result);
+  void casl(Register addr, Register newval_lo, Register newval_hi,
+            Register cmpval_lo, Register cmpval_hi,
+            Register tmp_lo, Register tmp_hi, Register result);
+
+  FloatRegister as_float_reg(LIR_Opr doubleReg);
+
+  static const int max_tableswitches = 20;
+  struct tableswitch switches[max_tableswitches];
+  int tableswitch_count;
+
+  void init() { tableswitch_count = 0; }
+
+  void deoptimize_trap(CodeEmitInfo *info);
+
+  // Remap an input register (*s1 or *s2) to a temp one if it is also used
+  // as the result register (d) of a preceding operation (otherwise its
+  // contents would effectively be corrupted)
+  void check_register_collision(Register d, Register *s1, Register *s2 = NULL, Register tmp = rscratch1);
+
+public:
+
+  void store_parameter(Register r, int offset_from_sp_in_words);
+  void store_parameter(jint c, int offset_from_sp_in_words);
+  void store_parameter(jobject c, int offset_from_sp_in_words);
+
+enum { call_stub_size = 12 * NativeInstruction::arm_insn_sz,
+       exception_handler_size = DEBUG_ONLY(1*K) NOT_DEBUG(175),
+       deopt_handler_size = 7 * NativeInstruction::arm_insn_sz };
+
+#endif // CPU_AARCH32_VM_C1_LIRASSEMBLER_AARCH32_HPP
--- /dev/null 2016-08-26 13:07:55.000000000 +0300
+++ new/src/cpu/aarch32/vm/c1_LIRGenerator_aarch32.cpp 2016-08-26 13:07:55.000000000 +0300
@@ -0,0 +1,1452 @@
+/*
+ * Copyright (c) 2005, 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ +// This file is a derivative work resulting from (and including) modifications +// made by Azul Systems, Inc. The dates of such changes are 2013-2016. +// Copyright 2013-2016 Azul Systems, Inc. All Rights Reserved. +// +// Please contact Azul Systems, 385 Moffett Park Drive, Suite 115, Sunnyvale, +// CA 94089 USA or visit www.azul.com if you need additional information or +// have any questions. + +#include "precompiled.hpp" +#include "c1/c1_Compilation.hpp" +#include "c1/c1_FrameMap.hpp" +#include "c1/c1_Instruction.hpp" +#include "c1/c1_LIRAssembler.hpp" +#include "c1/c1_LIRGenerator.hpp" +#include "c1/c1_Runtime1.hpp" +#include "c1/c1_ValueStack.hpp" +#include "ci/ciArray.hpp" +#include "ci/ciObjArrayKlass.hpp" +#include "ci/ciTypeArrayKlass.hpp" +#include "runtime/sharedRuntime.hpp" +#include "runtime/stubRoutines.hpp" +#include "vmreg_aarch32.inline.hpp" + +#ifdef ASSERT +#define __ gen()->lir(__FILE__, __LINE__)-> +#else +#define __ gen()->lir()-> +#endif + +// Item will be loaded into a byte register; Intel only +void LIRItem::load_byte_item() { + load_item(); +} + + +void LIRItem::load_nonconstant() { + LIR_Opr r = value()->operand(); + if (r->is_constant()) { + _result = r; + } else { + load_item(); + } +} + +//-------------------------------------------------------------- +// LIRGenerator +//-------------------------------------------------------------- + + +LIR_Opr LIRGenerator::exceptionOopOpr() { return FrameMap::r0_oop_opr; } +LIR_Opr LIRGenerator::exceptionPcOpr() { return FrameMap::r3_opr; } +LIR_Opr LIRGenerator::divInOpr() { Unimplemented(); return LIR_OprFact::illegalOpr; } +LIR_Opr LIRGenerator::divOutOpr() { Unimplemented(); return LIR_OprFact::illegalOpr; } +LIR_Opr LIRGenerator::remOutOpr() { Unimplemented(); return LIR_OprFact::illegalOpr; } +LIR_Opr LIRGenerator::shiftCountOpr() { Unimplemented(); return LIR_OprFact::illegalOpr; } +LIR_Opr LIRGenerator::syncTempOpr() { return FrameMap::r0_opr; } +LIR_Opr LIRGenerator::getThreadTemp() { return LIR_OprFact::illegalOpr; } + + +LIR_Opr LIRGenerator::result_register_for(ValueType* type, bool callee) { + LIR_Opr opr; + switch (type->tag()) { + case intTag: opr = FrameMap::r0_opr; break; + case objectTag: opr = FrameMap::r0_oop_opr; break; + case longTag: opr = FrameMap::long0_opr; break; + case floatTag: opr = FrameMap::fpu0_float_opr; break; + case doubleTag: opr = FrameMap::fpu0_double_opr; break; + + case addressTag: + default: ShouldNotReachHere(); return LIR_OprFact::illegalOpr; + } + + assert(opr->type_field() == as_OprType(as_BasicType(type)), "type mismatch"); + return opr; +} + + +LIR_Opr LIRGenerator::rlock_byte(BasicType type) { + LIR_Opr reg = new_register(T_INT); + set_vreg_flag(reg, LIRGenerator::byte_reg); + return reg; +} + + +//--------- loading items into registers -------------------------------- + + +bool LIRGenerator::can_store_as_constant(Value v, BasicType type) const { + if (v->type()->as_IntConstant() != NULL) { + return v->type()->as_IntConstant()->value() == 0L; + } else if (v->type()->as_LongConstant() != NULL) { + return v->type()->as_LongConstant()->value() == 0L; + } else if (v->type()->as_ObjectConstant() != NULL) { + return v->type()->as_ObjectConstant()->value()->is_null_object(); + } else { + return false; + } +} + +bool LIRGenerator::can_inline_as_constant(Value v) const { + if (v->type()->as_IntConstant() != 
NULL) { + return Assembler::operand_valid_for_add_sub_immediate(v->type()->as_IntConstant()->value()); + } else if (v->type()->as_LongConstant() != NULL) { + return Assembler::operand_valid_for_add_sub_immediate(v->type()->as_LongConstant()->value()); + } else if (v->type()->as_ObjectConstant() != NULL) { + return v->type()->as_ObjectConstant()->value()->is_null_object(); + } else { + return false; + } +} + + +bool LIRGenerator::can_inline_as_constant(LIR_Const* c) const { + switch (c->type()) { + case T_BOOLEAN: + case T_CHAR: + case T_BYTE: + case T_SHORT: + case T_INT: + return Assembler::operand_valid_for_add_sub_immediate(c->as_jint()); + case T_LONG: + return Assembler::operand_valid_for_add_sub_immediate(c->as_jlong()); + + case T_OBJECT: + return c->as_jobject() == (jobject) NULL; + case T_METADATA: + return c->as_metadata() == (Metadata*) NULL; + + case T_FLOAT: + return Assembler::operand_valid_for_float_immediate(c->as_jfloat()); + case T_DOUBLE: + return Assembler::operand_valid_for_float_immediate(c->as_jdouble()); + } + return false; +} + +LIR_Opr LIRGenerator::safepoint_poll_register() { + return LIR_OprFact::illegalOpr; +} + +LIR_Address* LIRGenerator::generate_address(LIR_Opr base, LIR_Opr index, + int shift, int disp, BasicType type) { + const Address::InsnDataType insn_type = Address::toInsnDataType(type); + assert(base->is_register(), "must be"); + + // accumulate fixed displacements + if (index->is_constant()) { + disp += index->as_constant_ptr()->as_jint() << shift; + index = LIR_OprFact::illegalOpr; + shift = 0; + } + + // aarch32 cannot handle natively both index and offset at the same time + // need to calculate effective value + if (index->is_register()) { + if ((disp != 0) && + Address::shift_ok_for_index(lsl(shift), insn_type) && + Assembler::operand_valid_for_add_sub_immediate(disp)) { + // add tmp, base, disp + // ldr r, [tmp, index, LSL #shift ] + LIR_Opr tmp = new_pointer_register(); + __ add(base, LIR_OprFact::intptrConst(disp), tmp); + base = tmp; + disp = 0; + } else { + assert(shift <= (int) LIR_Address::times_8, "no large shift could be here"); + // add tmp, base, index, LSL #shift + // ... + // ldr r, [tmp, ...] + LIR_Opr tmp = new_pointer_register(); + __ leal(LIR_OprFact::address(new LIR_Address(base, index, (LIR_Address::Scale) shift, 0, type)), tmp); + base = tmp; + index = LIR_OprFact::illegalOpr; + shift = 0; + } + } + + assert(!index->is_register() || (disp == 0), "should be"); + + if (!Address::offset_ok_for_immed(disp, insn_type)) { + assert(!index->is_valid(), "should be"); + // here index should be illegal so we can replace it with the displacement + // loaded into a register + // mov tmp, disp + // ldr r, [base, tmp] + index = new_pointer_register(); + __ move(LIR_OprFact::intptrConst(disp), index); + disp = 0; + } + + assert(Address::offset_ok_for_immed(disp, Address::toInsnDataType(type)), "must be"); + return new LIR_Address(base, index, (LIR_Address::Scale) shift, disp, type); +} + + +LIR_Address* LIRGenerator::emit_array_address(LIR_Opr array_opr, LIR_Opr index_opr, + BasicType type, bool needs_card_mark) { + int offset_in_bytes = arrayOopDesc::base_offset_in_bytes(type); + int elem_size = type2aelembytes(type); + int shift = exact_log2(elem_size); + + LIR_Address* addr = generate_address(array_opr, index_opr, shift, offset_in_bytes, type); + + if (needs_card_mark) { + // This store will need a precise card mark, so go ahead and + // compute the full adddres instead of computing once for the + // store and again for the card mark. 
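+    // (the leal below folds base + index*scale + disp into a single register, so the
+    //  same flat address can feed both the store and its precise post barrier)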
+ LIR_Opr tmp = new_pointer_register(); + __ leal(LIR_OprFact::address(addr), tmp); + return new LIR_Address(tmp, type); + } else { + return addr; + } +} + +LIR_Opr LIRGenerator::load_immediate(int x, BasicType type) { + LIR_Opr r; + if (type == T_LONG) { + r = LIR_OprFact::longConst(x); + if (!Assembler::operand_valid_for_logical_immediate(false, x)) { + LIR_Opr tmp = new_register(type); + __ move(r, tmp); + return tmp; + } + } else if (type == T_INT) { + r = LIR_OprFact::intConst(x); + if (!Assembler::operand_valid_for_logical_immediate(true, x)) { + // This is all rather nasty. We don't know whether our constant + // is required for a logical or an arithmetic operation, wo we + // don't know what the range of valid values is!! + LIR_Opr tmp = new_register(type); + __ move(r, tmp); + return tmp; + } + } else { + ShouldNotReachHere(); + } + return r; +} + + + +void LIRGenerator::increment_counter(address counter, BasicType type, int step) { + LIR_Opr pointer = new_pointer_register(); + __ move(LIR_OprFact::intptrConst(counter), pointer); + LIR_Address* addr = new LIR_Address(pointer, type); + increment_counter(addr, step); +} + + +void LIRGenerator::increment_counter(LIR_Address* addr, int step) { + LIR_Opr imm = NULL; + switch(addr->type()) { + case T_INT: + imm = LIR_OprFact::intConst(step); + break; + case T_LONG: + imm = LIR_OprFact::longConst(step); + break; + default: + ShouldNotReachHere(); + } + LIR_Opr reg = new_register(addr->type()); + __ load(addr, reg); + __ add(reg, imm, reg); + __ store(reg, addr); +} + +void LIRGenerator::cmp_mem_int(LIR_Condition condition, LIR_Opr base, int disp, int c, CodeEmitInfo* info) { + LIR_Opr reg = new_register(T_INT); + __ load(generate_address(base, disp, T_INT), reg, info); + __ cmp(condition, reg, LIR_OprFact::intConst(c)); +} + +void LIRGenerator::cmp_reg_mem(LIR_Condition condition, LIR_Opr reg, LIR_Opr base, int disp, BasicType type, CodeEmitInfo* info) { + LIR_Opr reg1 = new_register(T_INT); + __ load(generate_address(base, disp, type), reg1, info); + __ cmp(condition, reg, reg1); +} + + +bool LIRGenerator::strength_reduce_multiply(LIR_Opr left, int c, LIR_Opr result, LIR_Opr tmp) { + + if (is_power_of_2(c - 1)) { + __ shift_left(left, exact_log2(c - 1), tmp); + __ add(tmp, left, result); + return true; + } else if (is_power_of_2(c + 1)) { + __ shift_left(left, exact_log2(c + 1), tmp); + __ sub(tmp, left, result); + return true; + } else { + return false; + } +} + +void LIRGenerator::store_stack_parameter (LIR_Opr item, ByteSize offset_from_sp) { + BasicType type = item->type(); + __ store(item, new LIR_Address(FrameMap::sp_opr, in_bytes(offset_from_sp), type)); +} + +//---------------------------------------------------------------------- +// visitor functions +//---------------------------------------------------------------------- + + +void LIRGenerator::do_StoreIndexed(StoreIndexed* x) { + assert(x->is_pinned(),""); + bool needs_range_check = x->compute_needs_range_check(); + bool use_length = x->length() != NULL; + bool obj_store = x->elt_type() == T_ARRAY || x->elt_type() == T_OBJECT; + bool needs_store_check = obj_store && (x->value()->as_Constant() == NULL || + !get_jobject_constant(x->value())->is_null_object() || + x->should_profile()); + + LIRItem array(x->array(), this); + LIRItem index(x->index(), this); + LIRItem value(x->value(), this); + LIRItem length(this); + + array.load_item(); + index.load_nonconstant(); + + if (use_length && needs_range_check) { + length.set_instruction(x->length()); + length.load_item(); + + } + if 
(needs_store_check || x->check_boolean()) { + value.load_item(); + } else { + value.load_for_store(x->elt_type()); + } + + set_no_result(x); + + // the CodeEmitInfo must be duplicated for each different + // LIR-instruction because spilling can occur anywhere between two + // instructions and so the debug information must be different + CodeEmitInfo* range_check_info = state_for(x); + CodeEmitInfo* null_check_info = NULL; + if (x->needs_null_check()) { + null_check_info = new CodeEmitInfo(range_check_info); + } + + // emit array address setup early so it schedules better + // FIXME? No harm in this on aarch64, and it might help + LIR_Address* array_addr = emit_array_address(array.result(), index.result(), x->elt_type(), obj_store); + + if (GenerateRangeChecks && needs_range_check) { + if (use_length) { + __ cmp(lir_cond_belowEqual, length.result(), index.result()); + __ branch(lir_cond_belowEqual, T_INT, new RangeCheckStub(range_check_info, index.result())); + } else { + array_range_check(array.result(), index.result(), null_check_info, range_check_info); + // range_check also does the null check + null_check_info = NULL; + } + } + + if (GenerateArrayStoreCheck && needs_store_check) { + LIR_Opr tmp1 = new_register(objectType); + LIR_Opr tmp2 = new_register(objectType); + LIR_Opr tmp3 = new_register(objectType); + + CodeEmitInfo* store_check_info = new CodeEmitInfo(range_check_info); + __ store_check(value.result(), array.result(), tmp1, tmp2, tmp3, store_check_info, x->profiled_method(), x->profiled_bci()); + } + + if (obj_store) { + // Needs GC write barriers. + pre_barrier(LIR_OprFact::address(array_addr), LIR_OprFact::illegalOpr /* pre_val */, + true /* do_load */, false /* patch */, NULL); + __ move(value.result(), array_addr, null_check_info); + // Seems to be a precise + post_barrier(LIR_OprFact::address(array_addr), value.result()); + } else { + LIR_Opr result = maybe_mask_boolean(x, array.result(), value.result(), null_check_info); + __ move(result, array_addr, null_check_info); + } +} + +void LIRGenerator::do_MonitorEnter(MonitorEnter* x) { + assert(x->is_pinned(),""); + LIRItem obj(x->obj(), this); + obj.load_item(); + + set_no_result(x); + + // "lock" stores the address of the monitor stack slot, so this is not an oop + LIR_Opr lock = new_register(T_INT); + // Need a scratch register for biased locking + LIR_Opr scratch = LIR_OprFact::illegalOpr; + if (UseBiasedLocking) { + scratch = new_register(T_INT); + } + + CodeEmitInfo* info_for_exception = NULL; + if (x->needs_null_check()) { + info_for_exception = state_for(x); + } + // this CodeEmitInfo must not have the xhandlers because here the + // object is already locked (xhandlers expect object to be unlocked) + CodeEmitInfo* info = state_for(x, x->state(), true); + monitor_enter(obj.result(), lock, syncTempOpr(), scratch, + x->monitor_no(), info_for_exception, info); +} + + +void LIRGenerator::do_MonitorExit(MonitorExit* x) { + assert(x->is_pinned(),""); + + LIRItem obj(x->obj(), this); + obj.dont_load_item(); + + LIR_Opr lock = new_register(T_INT); + LIR_Opr obj_temp = new_register(T_INT); + set_no_result(x); + monitor_exit(obj_temp, lock, syncTempOpr(), LIR_OprFact::illegalOpr, x->monitor_no()); +} + + +void LIRGenerator::do_NegateOp(NegateOp* x) { + + LIRItem from(x->x(), this); + from.load_item(); + LIR_Opr result = rlock_result(x); + __ negate (from.result(), result); + +} + +// for _fadd, _fmul, _fsub, _fdiv, _frem +// _dadd, _dmul, _dsub, _ddiv, _drem +void LIRGenerator::do_ArithmeticOp_FPU(ArithmeticOp* x) { + + if 
(x->op() == Bytecodes::_frem || x->op() == Bytecodes::_drem) { + // float remainder is implemented as a direct call into the runtime + LIRItem right(x->x(), this); + LIRItem left(x->y(), this); + + BasicTypeList signature(2); + if (x->op() == Bytecodes::_frem) { + signature.append(T_FLOAT); + signature.append(T_FLOAT); + } else { + signature.append(T_DOUBLE); + signature.append(T_DOUBLE); + } + CallingConvention* cc = frame_map()->c_calling_convention(&signature); + + const LIR_Opr result_reg = result_register_for(x->type()); + left.load_item_force(cc->at(1)); + right.load_item(); + + __ move(right.result(), cc->at(0)); + + address entry; + if (x->op() == Bytecodes::_frem) { + entry = CAST_FROM_FN_PTR(address, SharedRuntime::frem); + } else { + entry = CAST_FROM_FN_PTR(address, SharedRuntime::drem); + } + + LIR_Opr result = rlock_result(x); + __ call_runtime_leaf(entry, getThreadTemp(), result_reg, cc->args()); + __ move(result_reg, result); + + return; + } + + LIRItem left(x->x(), this); + LIRItem right(x->y(), this); + LIRItem* left_arg = &left; + LIRItem* right_arg = &right; + + // Always load right hand side. + right.load_item(); + + if (!left.is_register()) + left.load_item(); + + LIR_Opr reg = rlock(x); + LIR_Opr tmp = LIR_OprFact::illegalOpr; + if (x->is_strictfp() && (x->op() == Bytecodes::_dmul || x->op() == Bytecodes::_ddiv)) { + tmp = new_register(T_DOUBLE); + } + + arithmetic_op_fpu(x->op(), reg, left.result(), right.result(), NULL); + + set_result(x, round_item(reg)); +} + +// for _ladd, _lmul, _lsub, _ldiv, _lrem +void LIRGenerator::do_ArithmeticOp_Long(ArithmeticOp* x) { + + // missing test if instr is commutative and if we should swap + LIRItem left(x->x(), this); + LIRItem right(x->y(), this); + + if (x->op() == Bytecodes::_ldiv || x->op() == Bytecodes::_lrem) { + + BasicTypeList signature(2); + signature.append(T_LONG); + signature.append(T_LONG); + CallingConvention* cc = frame_map()->c_calling_convention(&signature); + + // check for division by zero (destroys registers of right operand!) + CodeEmitInfo* info = state_for(x); + + right.load_item(); + + __ cmp(lir_cond_equal, right.result(), LIR_OprFact::longConst(0)); + __ branch(lir_cond_equal, T_LONG, new DivByZeroStub(info)); + + const LIR_Opr result_reg = result_register_for(x->type()); + left.load_item_force(cc->at(1)); + __ move(right.result(), cc->at(0)); + + address entry; + switch (x->op()) { + case Bytecodes::_lrem: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::lrem); + break; // check if dividend is 0 is done elsewhere + case Bytecodes::_ldiv: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::ldiv); + break; // check if dividend is 0 is done elsewhere + default: + ShouldNotReachHere(); + } + + LIR_Opr result = rlock_result(x); + __ call_runtime_leaf(entry, getThreadTemp(), result_reg, cc->args()); + __ move(result_reg, result); + } else { + assert (x->op() == Bytecodes::_lmul || x->op() == Bytecodes::_ladd || x->op() == Bytecodes::_lsub, + "expect lmul, ladd or lsub"); + // add, sub, mul + left.load_item(); + if (! right.is_register()) { + if (x->op() == Bytecodes::_lmul + || ! right.is_constant() + || ! 
Assembler::operand_valid_for_add_sub_immediate(right.get_jlong_constant())) { + right.load_item(); + } else { // add, sub + assert (x->op() == Bytecodes::_ladd || x->op() == Bytecodes::_lsub, "expect ladd or lsub"); + // don't load constants to save register + right.load_nonconstant(); + } + } + rlock_result(x); + arithmetic_op_long(x->op(), x->operand(), left.result(), right.result(), NULL); + } +} + +// for: _iadd, _imul, _isub, _idiv, _irem +void LIRGenerator::do_ArithmeticOp_Int(ArithmeticOp* x) { + + // Test if instr is commutative and if we should swap + LIRItem left(x->x(), this); + LIRItem right(x->y(), this); + LIRItem* left_arg = &left; + LIRItem* right_arg = &right; + if (x->is_commutative() && left.is_stack() && right.is_register()) { + // swap them if left is real stack (or cached) and right is real register(not cached) + left_arg = &right; + right_arg = &left; + } + + left_arg->load_item(); + + // do not need to load right, as we can handle stack and constants + if (x->op() == Bytecodes::_idiv || x->op() == Bytecodes::_irem) { + + right_arg->load_item(); + rlock_result(x); + + if (!(VM_Version::features() & FT_HW_DIVIDE)) { + // MacroAssembler::divide32 destroys both operand registers + left_arg->set_destroys_register(); + right_arg->set_destroys_register(); + } + + CodeEmitInfo* info = state_for(x); + LIR_Opr tmp = new_register(T_INT); + __ cmp(lir_cond_equal, right_arg->result(), LIR_OprFact::intConst(0)); + __ branch(lir_cond_equal, T_INT, new DivByZeroStub(info)); + info = state_for(x); + + if (x->op() == Bytecodes::_irem) { + __ irem(left_arg->result(), right_arg->result(), x->operand(), tmp, NULL); + } else if (x->op() == Bytecodes::_idiv) { + __ idiv(left_arg->result(), right_arg->result(), x->operand(), tmp, NULL); + } + + } else if (x->op() == Bytecodes::_iadd || x->op() == Bytecodes::_isub) { + if (right.is_constant() + && Assembler::operand_valid_for_add_sub_immediate(right.get_jint_constant())) { + right.load_nonconstant(); + } else { + right.load_item(); + } + rlock_result(x); + arithmetic_op_int(x->op(), x->operand(), left_arg->result(), right_arg->result(), LIR_OprFact::illegalOpr); + } else { + assert (x->op() == Bytecodes::_imul, "expect imul"); + if (right.is_constant()) { + int c = right.get_jint_constant(); + if (! is_power_of_2(c) && ! is_power_of_2(c + 1) && ! is_power_of_2(c - 1)) { + // Cannot use constant op. 
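+        // (only multipliers that reduce to shifts and adds may stay as immediates,
+        //  e.g. x*8 -> x<<3, x*9 -> (x<<3)+x, x*7 -> (x<<3)-x; anything else needs
+        //  the constant in a register for a real mul)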
+ right.load_item(); + } else { + right.dont_load_item(); + } + } else { + right.load_item(); + } + rlock_result(x); + arithmetic_op_int(x->op(), x->operand(), left_arg->result(), right_arg->result(), new_register(T_INT)); + } +} + +void LIRGenerator::do_ArithmeticOp(ArithmeticOp* x) { + // when an operand with use count 1 is the left operand, then it is + // likely that no move for 2-operand-LIR-form is necessary + if (x->is_commutative() && x->y()->as_Constant() == NULL && x->x()->use_count() > x->y()->use_count()) { + x->swap_operands(); + } + + ValueTag tag = x->type()->tag(); + assert(x->x()->type()->tag() == tag && x->y()->type()->tag() == tag, "wrong parameters"); + switch (tag) { + case floatTag: + case doubleTag: do_ArithmeticOp_FPU(x); return; + case longTag: do_ArithmeticOp_Long(x); return; + case intTag: do_ArithmeticOp_Int(x); return; + } + ShouldNotReachHere(); +} + +// _ishl, _lshl, _ishr, _lshr, _iushr, _lushr +void LIRGenerator::do_ShiftOp(ShiftOp* x) { + + LIRItem left(x->x(), this); + LIRItem right(x->y(), this); + + left.load_item(); + + rlock_result(x); + if (right.is_constant()) { + right.dont_load_item(); + + switch (x->op()) { + case Bytecodes::_ishl: { + int c = right.get_jint_constant() & 0x1f; + __ shift_left(left.result(), c, x->operand()); + break; + } + case Bytecodes::_ishr: { + int c = right.get_jint_constant() & 0x1f; + __ shift_right(left.result(), c, x->operand()); + break; + } + case Bytecodes::_iushr: { + int c = right.get_jint_constant() & 0x1f; + __ unsigned_shift_right(left.result(), c, x->operand()); + break; + } + case Bytecodes::_lshl: { + int c = right.get_jint_constant() & 0x3f; + __ shift_left(left.result(), c, x->operand()); + break; + } + case Bytecodes::_lshr: { + int c = right.get_jint_constant() & 0x3f; + __ shift_right(left.result(), c, x->operand()); + break; + } + case Bytecodes::_lushr: { + int c = right.get_jint_constant() & 0x3f; + __ unsigned_shift_right(left.result(), c, x->operand()); + break; + } + default: + ShouldNotReachHere(); + } + } else { + right.load_item(); + LIR_Opr tmp = LIR_OprFact::illegalOpr; + if (left.result()->type() == T_LONG) + left.set_destroys_register(); + switch (x->op()) { + case Bytecodes::_ishl: { + __ shift_left(left.result(), right.result(), x->operand(), tmp); + break; + } + case Bytecodes::_ishr: { + __ shift_right(left.result(), right.result(), x->operand(), tmp); + break; + } + case Bytecodes::_iushr: { + __ unsigned_shift_right(left.result(), right.result(), x->operand(), tmp); + break; + } + case Bytecodes::_lshl: { + __ shift_left(left.result(), right.result(), x->operand(), tmp); + break; + } + case Bytecodes::_lshr: { + __ shift_right(left.result(), right.result(), x->operand(), tmp); + break; + } + case Bytecodes::_lushr: { + __ unsigned_shift_right(left.result(), right.result(), x->operand(), tmp); + break; + } + default: + ShouldNotReachHere(); + } + } +} + +// _iand, _land, _ior, _lor, _ixor, _lxor +void LIRGenerator::do_LogicOp(LogicOp* x) { + + LIRItem left(x->x(), this); + LIRItem right(x->y(), this); + + left.load_item(); + + rlock_result(x); + if (right.is_constant() + && ((right.type()->tag() == intTag + && Assembler::operand_valid_for_logical_immediate(true, right.get_jint_constant())) + || (right.type()->tag() == longTag + && Assembler::operand_valid_for_logical_immediate(false, right.get_jlong_constant())))) { + right.dont_load_item(); + } else { + right.load_item(); + } + switch (x->op()) { + case Bytecodes::_iand: + case Bytecodes::_land: + __ logical_and(left.result(), 
right.result(), x->operand()); break; + case Bytecodes::_ior: + case Bytecodes::_lor: + __ logical_or (left.result(), right.result(), x->operand()); break; + case Bytecodes::_ixor: + case Bytecodes::_lxor: + __ logical_xor(left.result(), right.result(), x->operand()); break; + default: Unimplemented(); + } +} + +// _lcmp, _fcmpl, _fcmpg, _dcmpl, _dcmpg +void LIRGenerator::do_CompareOp(CompareOp* x) { + LIRItem left(x->x(), this); + LIRItem right(x->y(), this); + ValueTag tag = x->x()->type()->tag(); + left.load_item(); + right.load_item(); + LIR_Opr reg = rlock_result(x); + + if (x->x()->type()->is_float_kind()) { + Bytecodes::Code code = x->op(); + __ fcmp2int(left.result(), right.result(), reg, (code == Bytecodes::_fcmpl || code == Bytecodes::_dcmpl)); + } else if (x->x()->type()->tag() == longTag) { + __ lcmp2int(left.result(), right.result(), reg); + } else { + Unimplemented(); + } +} + +void LIRGenerator::do_CompareAndSwap(Intrinsic* x, ValueType* type) { + assert(x->number_of_arguments() == 4, "wrong type"); + LIRItem obj (x->argument_at(0), this); // object + LIRItem offset(x->argument_at(1), this); // offset of field + LIRItem cmp (x->argument_at(2), this); // value to compare with field + LIRItem val (x->argument_at(3), this); // replace field with val if matches cmp + + assert(obj.type()->tag() == objectTag, "invalid type"); + + // In 64bit the type can be long, sparc doesn't have this assert + // assert(offset.type()->tag() == intTag, "invalid type"); + + assert(cmp.type()->tag() == type->tag(), "invalid type"); + assert(val.type()->tag() == type->tag(), "invalid type"); + + // get address of field + obj.load_item(); + offset.load_nonconstant(); + if (type == longType) { + // not need if allocator reserves correct pairs + val.load_item_force(FrameMap::long0_opr); + } else { + val.load_item(); + } + cmp.load_item(); + + LIR_Address* a; + if(offset.result()->is_constant()) { + jint c = offset.result()->as_jint(); + a = new LIR_Address(obj.result(), + c, + as_BasicType(type)); + } else { + a = new LIR_Address(obj.result(), + offset.result(), + LIR_Address::times_1, + 0, + as_BasicType(type)); + } + LIR_Opr addr = new_pointer_register(); + __ leal(LIR_OprFact::address(a), addr); + + if (type == objectType) { // Write-barrier needed for Object fields. + // Do the pre-write barrier, if any. + pre_barrier(addr, LIR_OprFact::illegalOpr /* pre_val */, + true /* do_load */, false /* patch */, NULL); + } + + LIR_Opr result = rlock_result(x); + + LIR_Opr ill = LIR_OprFact::illegalOpr; // for convenience + if (type == objectType) + __ cas_obj(addr, cmp.result(), val.result(), ill, ill, result); + else if (type == intType) + __ cas_int(addr, cmp.result(), val.result(), ill, ill, result); + else if (type == longType) + __ cas_long(addr, cmp.result(), val.result(), FrameMap::long1_opr, ill, + result); + else { + ShouldNotReachHere(); + } + + __ logical_xor(result, LIR_OprFact::intConst(1), result); + + if (type == objectType) { // Write-barrier needed for Object fields. 
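+    // (a successful CAS has just published val into the field, so the card for this
+    //  exact address must be dirtied)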
+ // Seems to be precise + post_barrier(addr, val.result()); + } +} + +void LIRGenerator::do_MathIntrinsic(Intrinsic* x) { + switch (x->id()) { + case vmIntrinsics::_dabs: + case vmIntrinsics::_dsqrt: { + assert(x->number_of_arguments() == 1, "wrong type"); + LIRItem value(x->argument_at(0), this); + value.load_item(); + LIR_Opr dst = rlock_result(x); + + switch (x->id()) { + case vmIntrinsics::_dsqrt: { + __ sqrt(value.result(), dst, LIR_OprFact::illegalOpr); + break; + } + case vmIntrinsics::_dabs: { + __ abs(value.result(), dst, LIR_OprFact::illegalOpr); + break; + } + } + break; + } + case vmIntrinsics::_dlog10: // fall through + case vmIntrinsics::_dlog: // fall through + case vmIntrinsics::_dsin: // fall through + case vmIntrinsics::_dtan: // fall through + case vmIntrinsics::_dcos: // fall through + case vmIntrinsics::_dexp: { + assert(x->number_of_arguments() == 1, "wrong type"); + + address runtime_entry = NULL; + switch (x->id()) { + case vmIntrinsics::_dsin: + runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dsin); + break; + case vmIntrinsics::_dcos: + runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dcos); + break; + case vmIntrinsics::_dtan: + runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dtan); + break; + case vmIntrinsics::_dlog: + runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dlog); + break; + case vmIntrinsics::_dlog10: + runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dlog10); + break; + case vmIntrinsics::_dexp: + runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dexp); + break; + default: + ShouldNotReachHere(); + } + + LIR_Opr result = call_runtime(x->argument_at(0), runtime_entry, x->type(), NULL); + set_result(x, result); + break; + } + case vmIntrinsics::_dpow: { + assert(x->number_of_arguments() == 2, "wrong type"); + address runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dpow); + LIR_Opr result = call_runtime(x->argument_at(0), x->argument_at(1), runtime_entry, x->type(), NULL); + set_result(x, result); + break; + } + } +} + + +void LIRGenerator::do_ArrayCopy(Intrinsic* x) { + assert(x->number_of_arguments() == 5, "wrong type"); + + // Make all state_for calls early since they can emit code + CodeEmitInfo* info = state_for(x, x->state()); + + LIRItem src(x->argument_at(0), this); + LIRItem src_pos(x->argument_at(1), this); + LIRItem dst(x->argument_at(2), this); + LIRItem dst_pos(x->argument_at(3), this); + LIRItem length(x->argument_at(4), this); + + // operands for arraycopy must use fixed registers, otherwise + // LinearScan will fail allocation (because arraycopy always needs a + // call) + + // The java calling convention does not give us enough registers + // so we occupy two more: r4 and r5. The fast path code will be able to + // make use of these registers for performance purpose. 
If going into + // slow path we'll spill extra data to the stack as necessary + + src.load_item_force (FrameMap::as_oop_opr(j_rarg0)); + src_pos.load_item_force (FrameMap::as_opr(j_rarg1)); + dst.load_item_force (FrameMap::as_oop_opr(j_rarg2)); + dst_pos.load_item_force (FrameMap::as_opr(j_rarg3)); + + length.load_item_force (FrameMap::as_opr(r4)); + LIR_Opr tmp = FrameMap::as_opr(r5); + + set_no_result(x); + + int flags; + ciArrayKlass* expected_type; + arraycopy_helper(x, &flags, &expected_type); + + __ arraycopy(src.result(), src_pos.result(), dst.result(), dst_pos.result(), length.result(), tmp, expected_type, flags, info); // does add_safepoint +} + +void LIRGenerator::do_update_CRC32(Intrinsic* x) { + assert(UseCRC32Intrinsics, "why are we here?"); + // Make all state_for calls early since they can emit code + LIR_Opr result = rlock_result(x); + switch (x->id()) { + case vmIntrinsics::_updateCRC32: { + LIRItem crc(x->argument_at(0), this); + LIRItem val(x->argument_at(1), this); + // val is destroyed by update_crc32 + val.set_destroys_register(); + crc.load_item(); + val.load_item(); + __ update_crc32(crc.result(), val.result(), result); + break; + } + case vmIntrinsics::_updateBytesCRC32: + case vmIntrinsics::_updateByteBufferCRC32: { + bool is_updateBytes = (x->id() == vmIntrinsics::_updateBytesCRC32); + + LIRItem crc(x->argument_at(0), this); + LIRItem buf(x->argument_at(1), this); + LIRItem off(x->argument_at(2), this); + LIRItem len(x->argument_at(3), this); + buf.load_item(); + off.load_nonconstant(); + + LIR_Opr index = off.result(); + int offset = is_updateBytes ? arrayOopDesc::base_offset_in_bytes(T_BYTE) : 0; + if(off.result()->is_constant()) { + index = LIR_OprFact::illegalOpr; + offset += off.result()->as_jint(); + } + LIR_Opr base_op = buf.result(); + + if (offset) { + LIR_Opr tmp = new_pointer_register(); + __ add(base_op, LIR_OprFact::intConst(offset), tmp); + base_op = tmp; + offset = 0; + } + + LIR_Address* a = new LIR_Address(base_op, + index, + LIR_Address::times_1, + offset, + T_BYTE); + BasicTypeList signature(3); + signature.append(T_INT); + signature.append(T_ADDRESS); + signature.append(T_INT); + CallingConvention* cc = frame_map()->c_calling_convention(&signature); + const LIR_Opr result_reg = result_register_for(x->type()); + + LIR_Opr addr = new_pointer_register(); + __ leal(LIR_OprFact::address(a), addr); + + crc.load_item_force(cc->at(0)); + __ move(addr, cc->at(1)); + len.load_item_force(cc->at(2)); + + __ call_runtime_leaf(StubRoutines::updateBytesCRC32(), getThreadTemp(), result_reg, cc->args()); + __ move(result_reg, result); + + break; + } + default: { + ShouldNotReachHere(); + } + } +} + +// _i2l, _i2f, _i2d, _l2i, _l2f, _l2d, _f2i, _f2l, _f2d, _d2i, _d2l, _d2f +// _i2b, _i2c, _i2s +void LIRGenerator::do_Convert(Convert* x) { + // insired by sparc port + switch (x->op()) { + case Bytecodes::_d2l: + case Bytecodes::_f2l: + case Bytecodes::_l2d: + case Bytecodes::_l2f: { + address entry; + + switch (x->op()) { + case Bytecodes::_d2l: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::d2l); + break; + case Bytecodes::_f2l: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::f2l); + break; + case Bytecodes::_l2d: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::l2d); + break; + case Bytecodes::_l2f: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::l2f); + break; + default: + ShouldNotReachHere(); + } + + LIR_Opr result = call_runtime(x->value(), entry, x->type(), NULL); + set_result(x, result); + } + break; + + default: + LIRItem value(x->value(), 
this); + value.load_item(); + + if (x->op() == Bytecodes::_f2i || x->op() == Bytecodes::_d2i) { + value.set_destroys_register(); + } + + LIR_Opr input = value.result(); + LIR_Opr result = rlock(x); + + __ convert(x->op(), input, result); + + assert(result->is_virtual(), "result must be virtual register"); + set_result(x, result); + } +} + +void LIRGenerator::do_NewInstance(NewInstance* x) { +#ifndef PRODUCT + if (PrintNotLoaded && !x->klass()->is_loaded()) { + tty->print_cr(" ###class not loaded at new bci %d", x->printable_bci()); + } +#endif + CodeEmitInfo* info = state_for(x, x->state()); + LIR_Opr reg = result_register_for(x->type()); + new_instance(reg, x->klass(), x->is_unresolved(), + FrameMap::r2_oop_opr, + FrameMap::r5_oop_opr, + FrameMap::r4_oop_opr, + LIR_OprFact::illegalOpr, + FrameMap::r3_metadata_opr, info); + LIR_Opr result = rlock_result(x); + __ move(reg, result); +} + +void LIRGenerator::do_NewTypeArray(NewTypeArray* x) { + CodeEmitInfo* info = state_for(x, x->state()); + + LIRItem length(x->length(), this); + length.load_item_force(FrameMap::r6_opr); + + LIR_Opr reg = result_register_for(x->type()); + LIR_Opr tmp1 = FrameMap::r2_oop_opr; + LIR_Opr tmp2 = FrameMap::r4_oop_opr; + LIR_Opr tmp3 = FrameMap::r5_oop_opr; + LIR_Opr tmp4 = reg; + LIR_Opr klass_reg = FrameMap::r3_metadata_opr; + LIR_Opr len = length.result(); + BasicType elem_type = x->elt_type(); + + __ metadata2reg(ciTypeArrayKlass::make(elem_type)->constant_encoding(), klass_reg); + + CodeStub* slow_path = new NewTypeArrayStub(klass_reg, len, reg, info); + __ allocate_array(reg, len, tmp1, tmp2, tmp3, tmp4, elem_type, klass_reg, slow_path); + + LIR_Opr result = rlock_result(x); + __ move(reg, result); +} + +void LIRGenerator::do_NewObjectArray(NewObjectArray* x) { + LIRItem length(x->length(), this); + // in case of patching (i.e., object class is not yet loaded), we need to reexecute the instruction + // and therefore provide the state before the parameters have been consumed + CodeEmitInfo* patching_info = NULL; + if (!x->klass()->is_loaded() || PatchALot) { + patching_info = state_for(x, x->state_before()); + } + + CodeEmitInfo* info = state_for(x, x->state()); + + LIR_Opr reg = result_register_for(x->type()); + LIR_Opr tmp1 = FrameMap::r2_oop_opr; + LIR_Opr tmp2 = FrameMap::r4_oop_opr; + LIR_Opr tmp3 = FrameMap::r5_oop_opr; + LIR_Opr tmp4 = reg; + LIR_Opr klass_reg = FrameMap::r3_metadata_opr; + + length.load_item_force(FrameMap::r6_opr); + LIR_Opr len = length.result(); + + CodeStub* slow_path = new NewObjectArrayStub(klass_reg, len, reg, info); + ciKlass* obj = (ciKlass*) ciObjArrayKlass::make(x->klass()); + if (obj == ciEnv::unloaded_ciobjarrayklass()) { + BAILOUT("encountered unloaded_ciobjarrayklass due to out of memory error"); + } + klass2reg_with_patching(klass_reg, obj, patching_info); + __ allocate_array(reg, len, tmp1, tmp2, tmp3, tmp4, T_OBJECT, klass_reg, slow_path); + + LIR_Opr result = rlock_result(x); + __ move(reg, result); +} + + +void LIRGenerator::do_NewMultiArray(NewMultiArray* x) { + Values* dims = x->dims(); + int i = dims->length(); + LIRItemList* items = new LIRItemList(dims->length(), NULL); + while (i-- > 0) { + LIRItem* size = new LIRItem(dims->at(i), this); + items->at_put(i, size); + } + + // Evaluate state_for early since it may emit code. 
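+  // (as in do_NewObjectArray, patching_info must capture the state before the
+  //  dimension values are consumed so the bytecode can be re-executed after patching)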
+ CodeEmitInfo* patching_info = NULL; + if (!x->klass()->is_loaded() || PatchALot) { + patching_info = state_for(x, x->state_before()); + + // Cannot re-use same xhandlers for multiple CodeEmitInfos, so + // clone all handlers (NOTE: Usually this is handled transparently + // by the CodeEmitInfo cloning logic in CodeStub constructors but + // is done explicitly here because a stub isn't being used). + x->set_exception_handlers(new XHandlers(x->exception_handlers())); + } + CodeEmitInfo* info = state_for(x, x->state()); + + i = dims->length(); + while (i-- > 0) { + LIRItem* size = items->at(i); + size->load_item(); + + store_stack_parameter(size->result(), in_ByteSize(i*4)); + } + + LIR_Opr klass_reg = FrameMap::r1_metadata_opr; + klass2reg_with_patching(klass_reg, x->klass(), patching_info); + + LIR_Opr rank = FrameMap::r2_opr; + __ move(LIR_OprFact::intConst(x->rank()), rank); + LIR_Opr varargs = FrameMap::r3_opr; + __ move(FrameMap::sp_opr, varargs); + LIR_OprList* args = new LIR_OprList(3); + args->append(klass_reg); + args->append(rank); + args->append(varargs); + LIR_Opr reg = result_register_for(x->type()); + __ call_runtime(Runtime1::entry_for(Runtime1::new_multi_array_id), + LIR_OprFact::illegalOpr, + reg, args, info); + + LIR_Opr result = rlock_result(x); + __ move(reg, result); +} + +void LIRGenerator::do_BlockBegin(BlockBegin* x) { + // nothing to do for now +} + +void LIRGenerator::do_CheckCast(CheckCast* x) { + LIRItem obj(x->obj(), this); + + CodeEmitInfo* patching_info = NULL; + if (!x->klass()->is_loaded() || (PatchALot && !x->is_incompatible_class_change_check())) { + // must do this before locking the destination register as an oop register, + // and before the obj is loaded (the latter is for deoptimization) + patching_info = state_for(x, x->state_before()); + } + obj.load_item(); + + // info for exceptions + CodeEmitInfo* info_for_exception = state_for(x); + + CodeStub* stub; + if (x->is_incompatible_class_change_check()) { + assert(patching_info == NULL, "can't patch this"); + stub = new SimpleExceptionStub(Runtime1::throw_incompatible_class_change_error_id, LIR_OprFact::illegalOpr, info_for_exception); + } else { + stub = new SimpleExceptionStub(Runtime1::throw_class_cast_exception_id, obj.result(), info_for_exception); + } + LIR_Opr reg = rlock_result(x); + LIR_Opr tmp3 = LIR_OprFact::illegalOpr; + if (!x->klass()->is_loaded()) { + tmp3 = new_register(objectType); + } + __ checkcast(reg, obj.result(), x->klass(), + new_register(objectType), new_register(objectType), tmp3, + x->direct_compare(), info_for_exception, patching_info, stub, + x->profiled_method(), x->profiled_bci()); +} + +void LIRGenerator::do_InstanceOf(InstanceOf* x) { + LIRItem obj(x->obj(), this); + + // result and test object may not be in same register + LIR_Opr reg = rlock_result(x); + CodeEmitInfo* patching_info = NULL; + if ((!x->klass()->is_loaded() || PatchALot)) { + // must do this before locking the destination register as an oop register + patching_info = state_for(x, x->state_before()); + } + obj.load_item(); + LIR_Opr tmp3 = LIR_OprFact::illegalOpr; + if (!x->klass()->is_loaded()) { + tmp3 = new_register(objectType); + } + __ instanceof(reg, obj.result(), x->klass(), + new_register(objectType), new_register(objectType), tmp3, + x->direct_compare(), patching_info, x->profiled_method(), x->profiled_bci()); +} + +void LIRGenerator::do_If(If* x) { + assert(x->number_of_sux() == 2, "inconsistency"); + ValueTag tag = x->x()->type()->tag(); + bool is_safepoint = x->is_safepoint(); + + 
If::Condition cond = x->cond(); + + LIRItem xitem(x->x(), this); + LIRItem yitem(x->y(), this); + LIRItem* xin = &xitem; + LIRItem* yin = &yitem; + + xin->load_item(); + + if (yin->is_constant()) { + if (tag == longTag + && Assembler::operand_valid_for_add_sub_immediate(yin->get_jlong_constant())) { + yin->dont_load_item(); + } else if (tag == intTag + && Assembler::operand_valid_for_add_sub_immediate(yin->get_jint_constant())) { + yin->dont_load_item(); + } else if (tag == addressTag + && Assembler::operand_valid_for_add_sub_immediate(yin->get_address_constant())) { + yin->dont_load_item(); + } else if (tag == objectTag && yin->get_jobject_constant()->is_null_object()) { + yin->dont_load_item(); + } else { + yin->load_item(); + } + } else { + yin->load_item(); + } + + // add safepoint before generating condition code so it can be recomputed + if (x->is_safepoint()) { + // increment backedge counter if needed + increment_backedge_counter(state_for(x, x->state_before()), x->profiled_bci()); + __ safepoint(LIR_OprFact::illegalOpr, state_for(x, x->state_before())); + } + set_no_result(x); + + LIR_Opr left = xin->result(); + LIR_Opr right = yin->result(); + + __ cmp(lir_cond(cond), left, right); + // Generate branch profiling. Profiling code doesn't kill flags. + profile_branch(x, cond); + move_to_phi(x->state()); + if (x->x()->type()->is_float_kind()) { + __ branch(lir_cond(cond), right->type(), x->tsux(), x->usux()); + } else { + __ branch(lir_cond(cond), right->type(), x->tsux()); + } + assert(x->default_sux() == x->fsux(), "wrong destination above"); + __ jump(x->default_sux()); +} + +LIR_Opr LIRGenerator::getThreadPointer() { + return FrameMap::as_pointer_opr(rthread); +} + +void LIRGenerator::trace_block_entry(BlockBegin* block) { + __ move(LIR_OprFact::intConst(block->block_id()), FrameMap::r0_opr); + LIR_OprList* args = new LIR_OprList(1); + args->append(FrameMap::r0_opr); + address func = CAST_FROM_FN_PTR(address, Runtime1::trace_block_entry); + __ call_runtime_leaf(func, LIR_OprFact::illegalOpr, LIR_OprFact::illegalOpr, args); +} + +void LIRGenerator::volatile_field_store(LIR_Opr value, LIR_Address* address, + CodeEmitInfo* info) { + if (value->is_double_cpu()) { + __ move(value, FrameMap::long0_opr); + __ volatile_store_mem_reg(FrameMap::long0_opr, address, info); + } else { + __ volatile_store_mem_reg(value, address, info); + } +} + +void LIRGenerator::volatile_field_load(LIR_Address* address, LIR_Opr result, + CodeEmitInfo* info) { + if (result->is_double_cpu()) { + __ volatile_load_mem_reg(address, FrameMap::long0_opr, info); + __ move(FrameMap::long0_opr, result); + } else { + __ volatile_load_mem_reg(address, result, info); + } +} + +void LIRGenerator::get_Object_unsafe(LIR_Opr dst, LIR_Opr src, LIR_Opr offset, + BasicType type, bool is_volatile) { + LIR_Address* addr = new LIR_Address(src, offset, type); + __ load(addr, dst); +} + + +void LIRGenerator::put_Object_unsafe(LIR_Opr src, LIR_Opr offset, LIR_Opr data, + BasicType type, bool is_volatile) { + LIR_Address* addr = new LIR_Address(src, offset, type); + bool is_obj = (type == T_ARRAY || type == T_OBJECT); + if (is_obj) { + // Do the pre-write barrier, if any. 
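+    // (with G1 this records the previous value of the field for SATB marking;
+    //  with the other collectors the pre barrier expands to nothing)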
+ pre_barrier(LIR_OprFact::address(addr), LIR_OprFact::illegalOpr /* pre_val */, + true /* do_load */, false /* patch */, NULL); + __ move(data, addr); + assert(src->is_register(), "must be register"); + // Seems to be a precise address + post_barrier(LIR_OprFact::address(addr), data); + } else { + __ move(data, addr); + } +} + +void LIRGenerator::do_UnsafeGetAndSetObject(UnsafeGetAndSetObject* x) { + BasicType type = x->basic_type(); + LIRItem src(x->object(), this); + LIRItem off(x->offset(), this); + LIRItem value(x->value(), this); + + src.load_item(); + off.load_nonconstant(); + if (type == T_LONG && !x->is_add()) { + // not need if allocator reserves correct pairs + value.load_item_force(FrameMap::long1_opr); + } else { + // We can cope with a constant increment in an xadd + if (! (x->is_add() + && value.is_constant() + && can_inline_as_constant(x->value()))) { + value.load_item(); + } + } + + bool is_long = (type == T_LONG); + LIR_Opr dst = is_long ? FrameMap::long0_opr : rlock_result(x, type); + LIR_Opr data = value.result(); + bool is_obj = (type == T_ARRAY || type == T_OBJECT); + LIR_Opr offset = off.result(); + + if (data == dst) { + LIR_Opr tmp = new_register(data->type()); + __ move(data, tmp); + data = tmp; + } + + LIR_Address* addr; + if (offset->is_constant()) { + addr = new LIR_Address(src.result(), offset->as_jint(), type); + } else { + addr = new LIR_Address(src.result(), offset, type); + } + + LIR_Opr tmp = new_register(T_INT); + LIR_Opr ptr = LIR_OprFact::illegalOpr; + + if (x->is_add()) { + __ xadd(LIR_OprFact::address(addr), data, dst, tmp); + } else { + if (is_obj) { + // Do the pre-write barrier, if any. + ptr = new_pointer_register(); + __ add(src.result(), off.result(), ptr); + pre_barrier(ptr, LIR_OprFact::illegalOpr /* pre_val */, + true /* do_load */, false /* patch */, NULL); + } + __ xchg(LIR_OprFact::address(addr), data, dst, tmp); + if (is_obj) { + post_barrier(ptr, data); + } + } + + if (is_long) { + dst = rlock_result(x, type); + __ move(FrameMap::long0_opr, dst); + } +} --- /dev/null 2016-08-26 13:07:56.000000000 +0300 +++ new/src/cpu/aarch32/vm/c1_LinearScan_aarch32.cpp 2016-08-26 13:07:56.000000000 +0300 @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#include "precompiled.hpp" +#include "c1/c1_LinearScan.hpp" + +void LinearScan::allocate_fpu_stack() { + // No FPU stack on AArch32 +} --- /dev/null 2016-08-26 13:07:58.000000000 +0300 +++ new/src/cpu/aarch32/vm/c1_LinearScan_aarch32.hpp 2016-08-26 13:07:58.000000000 +0300 @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ +// This file is a derivative work resulting from (and including) modifications +// made by Azul Systems, Inc. The dates of such changes are 2013-2016. +// Copyright 2013-2016 Azul Systems, Inc. All Rights Reserved. +// +// Please contact Azul Systems, 385 Moffett Park Drive, Suite 115, Sunnyvale, +// CA 94089 USA or visit www.azul.com if you need additional information or +// have any questions. + +#ifndef CPU_AARCH32_VM_C1_LINEARSCAN_AARCH32_HPP +#define CPU_AARCH32_VM_C1_LINEARSCAN_AARCH32_HPP + +inline bool LinearScan::is_processed_reg_num(int reg_num) { + return reg_num <= pd_last_cpu_reg || reg_num >= pd_nof_cpu_regs_frame_map; +} + +inline int LinearScan::num_physical_regs(BasicType type) { + if (type == T_LONG || type == T_DOUBLE) { + return 2; + } + return 1; +} + +inline bool LinearScan::requires_adjacent_regs(BasicType type) { + if (type == T_DOUBLE) { + return true; + } + return false; +} + +inline bool LinearScan::is_caller_save(int assigned_reg) { + assert(assigned_reg >= 0 && assigned_reg < nof_regs, + "should call this only for registers"); + // TODO: Remove the following line when support for callee-saved registers + // is added + return true; + if (assigned_reg < pd_first_callee_saved_cpu_reg) { + return true; + } + if (assigned_reg > pd_last_callee_saved_cpu_reg && + assigned_reg < pd_first_callee_saved_fpu_reg) { + return true; + } + if (assigned_reg > pd_last_callee_saved_fpu_reg && + assigned_reg <= pd_last_fpu_reg) { + return true; + } + return false; +} + +// If there are special cases when some particular LIR operations kill some +// specific registers, this behavior should be described here. An example +// can be found in x86 port. 
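+// On AArch32 the case handled here is volatile long/double moves, which go through
+// the fixed register pairs long0/long1 (cf. volatile_field_store/volatile_field_load
+// in the LIR generator), so those pairs are recorded as temps for such ops.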
+inline void LinearScan::pd_add_temps(LIR_Op* op) { + if (op->code() == lir_move) { + LIR_Op1* move_op = op->as_Op1(); + if (move_op->move_kind() == lir_move_volatile) { + bool is_long = move_op->type() == T_LONG; + bool is_double = move_op->type() == T_DOUBLE; + bool is_store = move_op->in_opr()->is_register(); + if (is_double) { + add_temp(reg_num(FrameMap::long0_opr), op->id(), noUse, T_ILLEGAL); + add_temp(reg_numHi(FrameMap::long0_opr), op->id(), noUse, T_ILLEGAL); + } + if (is_store && (is_long || is_double)) { + add_temp(reg_num(FrameMap::long1_opr), op->id(), noUse, T_ILLEGAL); + add_temp(reg_numHi(FrameMap::long1_opr), op->id(), noUse, T_ILLEGAL); + } + } + } +} + +inline bool LinearScanWalker::pd_init_regs_for_alloc(Interval* cur) { + // The default logic is good enough for AArch32. + return false; +} + +#endif // CPU_AARCH32_VM_C1_LINEARSCAN_AARCH32_HPP --- /dev/null 2016-08-26 13:07:59.000000000 +0300 +++ new/src/cpu/aarch32/vm/c1_MacroAssembler_aarch32.cpp 2016-08-26 13:07:59.000000000 +0300 @@ -0,0 +1,476 @@ +/* + * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ +// This file is a derivative work resulting from (and including) modifications +// made by Azul Systems, Inc. The dates of such changes are 2013-2016. +// Copyright 2013-2016 Azul Systems, Inc. All Rights Reserved. +// +// Please contact Azul Systems, 385 Moffett Park Drive, Suite 115, Sunnyvale, +// CA 94089 USA or visit www.azul.com if you need additional information or +// have any questions. + +#include "precompiled.hpp" +#include "c1/c1_MacroAssembler.hpp" +#include "c1/c1_Runtime1.hpp" +#include "classfile/systemDictionary.hpp" +#include "gc_interface/collectedHeap.hpp" +#include "interpreter/interpreter.hpp" +#include "oops/arrayOop.hpp" +#include "oops/markOop.hpp" +#include "runtime/basicLock.hpp" +#include "runtime/biasedLocking.hpp" +#include "runtime/os.hpp" +#include "runtime/sharedRuntime.hpp" +#include "runtime/stubRoutines.hpp" + +void C1_MacroAssembler::float_cmp(bool is_float, int unordered_result, + FloatRegister f0, FloatRegister f1, + Register result) +{ + Label done; + if (is_float) { + vcmp_f32(f0, f1); + } else { + vcmp_f64(f0, f1); + } + + get_fpsr(); + + mov(result, 0); + if (unordered_result < 0) { + // we want -1 for unordered or less than, 0 for equal and 1 for + // greater than. 
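+    // (after get_fpsr() the APSR holds the VFP compare flags: an unordered compare
+    //  sets C and V, so LT (N != V) below includes the unordered case while LO
+    //  (C clear) in the other branch excludes it)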
+ mov(result, 1, NE); // Not equal or unordered + neg(result, result, LT); // Less than or unordered + } else { + // we want -1 for less than, 0 for equal and 1 for unordered or + // greater than. + mov(result, 1, NE); // Not equal or unordered + neg(result, result, LO); // Less than + } +} + +int C1_MacroAssembler::lock_object(Register hdr, Register obj, Register disp_hdr, Register scratch, Label& slow_case) { + const int aligned_mask = BytesPerWord -1; + const int hdr_offset = oopDesc::mark_offset_in_bytes(); + assert(hdr != obj && hdr != disp_hdr && obj != disp_hdr, "registers must be different"); + Label done, fail; + int null_check_offset = -1; + + verify_oop(obj); + + // save object being locked into the BasicObjectLock + str(obj, Address(disp_hdr, BasicObjectLock::obj_offset_in_bytes())); + + if (UseBiasedLocking) { + assert(scratch != noreg, "should have scratch register at this point"); + null_check_offset = biased_locking_enter(disp_hdr, obj, hdr, scratch, false, done, &slow_case); + } else { + null_check_offset = offset(); + } + + // Load object header + ldr(hdr, Address(obj, hdr_offset)); + // and mark it as unlocked + orr(hdr, hdr, markOopDesc::unlocked_value); + // save unlocked object header into the displaced header location on the stack + str(hdr, Address(disp_hdr, 0)); + // test if object header is still the same (i.e. unlocked), and if so, store the + // displaced header address in the object header - if it is not the same, get the + // object header instead + lea(rscratch2, Address(obj, hdr_offset)); + cmpxchgptr(hdr, disp_hdr, rscratch2, rscratch1, done, /*fallthough*/NULL); + // if the object header was the same, we're done + // if the object header was not the same, it is now in the hdr register + // => test if it is a stack pointer into the same stack (recursive locking), i.e.: + // + // 1) (hdr & aligned_mask) == 0 + // 2) sp <= hdr + // 3) hdr <= sp + page_size + // + // these 3 tests can be done by evaluating the following expression: + // + // (hdr - sp) & (aligned_mask - page_size) + // + // assuming both the stack pointer and page_size have their least + // significant 2 bits cleared and page_size is a power of 2 + mov(rscratch1, sp); + sub(hdr, hdr, rscratch1); + mov(rscratch2, aligned_mask - os::vm_page_size()); + ands(hdr, hdr, rscratch2); + // for recursive locking, the result is zero => save it in the displaced header + // location (NULL in the displaced hdr location indicates recursive locking) + str(hdr, Address(disp_hdr, 0)); + // otherwise we don't care about the result and handle locking via runtime call + cbnz(hdr, slow_case); + // done + bind(done); + if (PrintBiasedLockingStatistics) { + lea(rscratch2, ExternalAddress((address)BiasedLocking::fast_path_entry_count_addr())); + addmw(Address(rscratch2, 0), 1, rscratch1); + } + return null_check_offset; +} + + +void C1_MacroAssembler::unlock_object(Register hdr, Register obj, Register disp_hdr, Label& slow_case) { + const int aligned_mask = BytesPerWord -1; + const int hdr_offset = oopDesc::mark_offset_in_bytes(); + assert(hdr != obj && hdr != disp_hdr && obj != disp_hdr, "registers must be different"); + Label done; + + if (UseBiasedLocking) { + // load object + ldr(obj, Address(disp_hdr, BasicObjectLock::obj_offset_in_bytes())); + biased_locking_exit(obj, hdr, done); + } + + // load displaced header + ldr(hdr, Address(disp_hdr, 0)); + // if the loaded hdr is NULL we had recursive locking + // if we had recursive locking, we are done + cbz(hdr, done); + if (!UseBiasedLocking) { + // load object + 
ldr(obj, Address(disp_hdr, BasicObjectLock::obj_offset_in_bytes())); + } + verify_oop(obj); + // test if object header is pointing to the displaced header, and if so, restore + // the displaced header in the object - if the object header is not pointing to + // the displaced header, get the object header instead + // if the object header was not pointing to the displaced header, + // we do unlocking via runtime call + if (hdr_offset) { + lea(rscratch1, Address(obj, hdr_offset)); + cmpxchgptr(disp_hdr, hdr, rscratch1, rscratch2, done, &slow_case); + } else { + cmpxchgptr(disp_hdr, hdr, obj, rscratch2, done, &slow_case); + } + // done + bind(done); +} + + +// Defines obj, preserves var_size_in_bytes +void C1_MacroAssembler::try_allocate(Register obj, Register var_size_in_bytes, int con_size_in_bytes, Register t1, Register t2, Label& slow_case) { + if (UseTLAB) { + tlab_allocate(obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case); + } else { + eden_allocate(obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case); + incr_allocated_bytes(noreg, var_size_in_bytes, con_size_in_bytes, t1); + } +} + +void C1_MacroAssembler::initialize_header(Register obj, Register klass, Register len, Register t1, Register t2) { + assert_different_registers(obj, klass, len); + if (UseBiasedLocking && !len->is_valid()) { + assert_different_registers(obj, klass, len, t1, t2); + ldr(t1, Address(klass, Klass::prototype_header_offset())); + } else { + // This assumes that all prototype bits fit in an int32_t + mov(t1, (int32_t)(intptr_t)markOopDesc::prototype()); + } + str(t1, Address(obj, oopDesc::mark_offset_in_bytes())); + str(klass, Address(obj, oopDesc::klass_offset_in_bytes())); + + if (len->is_valid()) { + str(len, Address(obj, arrayOopDesc::length_offset_in_bytes())); + } +} + +// Zero words; len is in bytes +// Destroys all registers except addr +// len must be a nonzero multiple of wordSize +void C1_MacroAssembler::zero_memory(Register addr, Register len, Register t1) { + assert_different_registers(addr, len, t1, rscratch1, rscratch2); + +#ifdef ASSERT + { Label L; + tst(len, BytesPerWord - 1); + b(L, Assembler::EQ); + stop("len is not a multiple of BytesPerWord"); + bind(L); + } +#endif + +#ifndef PRODUCT + block_comment("zero memory"); +#endif + + Label loop; + Label entry; + +// Algorithm: +// +// scratch1 = cnt & 7; +// cnt -= scratch1; +// p += scratch1; +// switch (scratch1) { +// do { +// cnt -= 8; +// p[-8] = 0; +// case 7: +// p[-7] = 0; +// case 6: +// p[-6] = 0; +// // ... 
+// case 1: +// p[-1] = 0; +// case 0: +// p += 8; +// } while (cnt); +// } + + const int unroll = 8; // Number of str instructions we'll unroll + + lsr(len, len, LogBytesPerWord); + andr(rscratch1, len, unroll - 1); // tmp1 = cnt % unroll + sub(len, len, rscratch1); // cnt -= unroll + // t1 always points to the end of the region we're about to zero + add(t1, addr, rscratch1, lsl(LogBytesPerWord)); + adr(rscratch2, entry); + sub(rscratch2, rscratch2, rscratch1, lsl(2)); + mov(rscratch1, 0); + b(rscratch2); + bind(loop); + sub(len, len, unroll); + for (int i = -unroll; i < 0; i++) + str(rscratch1, Address(t1, i * wordSize)); + bind(entry); + add(t1, t1, unroll * wordSize); + cbnz(len, loop); +} + +// preserves obj, destroys len_in_bytes +void C1_MacroAssembler::initialize_body(Register obj, Register len_in_bytes, int hdr_size_in_bytes, Register t1) { + Label done; + assert(obj != len_in_bytes && obj != t1 && t1 != len_in_bytes, "registers must be different"); + assert((hdr_size_in_bytes & (BytesPerWord - 1)) == 0, "header size is not a multiple of BytesPerWord"); + Register index = len_in_bytes; + // index is positive and ptr sized + subs(index, index, hdr_size_in_bytes); + b(done, Assembler::EQ); + // note: for the remaining code to work, index must be a multiple of BytesPerWord +#ifdef ASSERT + { Label L; + tst(index, BytesPerWord - 1); + b(L, Assembler::EQ); + stop("index is not a multiple of BytesPerWord"); + bind(L); + } +#endif + + // Preserve obj + if (hdr_size_in_bytes) + add(obj, obj, hdr_size_in_bytes); + zero_memory(obj, index, t1); + if (hdr_size_in_bytes) + sub(obj, obj, hdr_size_in_bytes); + + // done + bind(done); +} + + +void C1_MacroAssembler::allocate_object(Register obj, Register t1, Register t2, int header_size, int object_size, Register klass, Label& slow_case) { + assert_different_registers(obj, t1, t2); // XXX really? + assert(header_size >= 0 && object_size >= header_size, "illegal sizes"); + + try_allocate(obj, noreg, object_size * BytesPerWord, t1, t2, slow_case); + + initialize_object(obj, klass, noreg, object_size * HeapWordSize, t1, t2); +} + +// This method clobbers t1, t2, and rscratch1 registers. 
+void C1_MacroAssembler::initialize_object(Register obj, Register klass, + Register var_size_in_bytes, + int con_size_in_bytes, + Register t1, Register t2) { + assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, + "con_size_in_bytes is not multiple of alignment"); + + const int hdr_size_in_bytes = instanceOopDesc::header_size() * HeapWordSize; + + initialize_header(obj, klass, noreg, t1, t2); + + // Null out rest of allocated space + const Register index = t2; + const int threshold = 8 * BytesPerWord; + if (var_size_in_bytes != noreg) { + mov(index, var_size_in_bytes); + initialize_body(obj, index, hdr_size_in_bytes, t1); + } else if (con_size_in_bytes <= threshold) { + // Emit required number of str instructions (unroll loop completely) + mov(t1, 0); + for (int i = hdr_size_in_bytes; i < con_size_in_bytes; i += BytesPerWord) { + str(t1, Address(obj, i)); + } + } else if (con_size_in_bytes > hdr_size_in_bytes) { + // Use loop to null out fields + int words = (con_size_in_bytes - hdr_size_in_bytes) / BytesPerWord; + mov(t1, 0); + + const int unroll = 4; // Number of str instructions we'll unroll + mov(index, words / unroll); + int remainder = words % unroll; + lea(rscratch1, Address(obj, hdr_size_in_bytes + remainder * BytesPerWord)); + + Label entry_point, loop; + b(entry_point); + bind(loop); + sub(index, index, 1); + for (int i = -unroll; i < 0; i++) { + if (-i == remainder) { + bind(entry_point); + } + str(t1, Address(rscratch1, i * BytesPerWord)); + } + if (remainder == 0) { + bind(entry_point); + } + add(rscratch1, rscratch1, unroll * BytesPerWord); + cbnz(index, loop); + } + + membar(StoreStore); + + if (CURRENT_ENV->dtrace_alloc_probes()) { + assert(obj == r0, "must be"); + far_call(RuntimeAddress(Runtime1::entry_for( + Runtime1::dtrace_object_alloc_id))); + } + + verify_oop(obj); +} + +void C1_MacroAssembler::allocate_array(Register obj, Register len, Register t1, Register t2, int header_size, int f, Register klass, Label& slow_case) { + assert_different_registers(obj, len, t1, t2, klass); + + // determine alignment mask + assert(!(BytesPerWord & 1), "must be a multiple of 2 for masking code to work"); + + // check for negative or excessive length + mov(rscratch1, (int32_t)max_array_allocation_length); + cmp(len, rscratch1); + b(slow_case, Assembler::HS); + + const Register arr_size = t2; // okay to be the same + // align object end + mov(arr_size, (int32_t)header_size * BytesPerWord + MinObjAlignmentInBytesMask); + add(arr_size, arr_size, len, Assembler::lsl(f)); + mov(t1, ~MinObjAlignmentInBytesMask); + andr(arr_size, arr_size, t1); + + try_allocate(obj, arr_size, 0, t1, t2, slow_case); + + initialize_header(obj, klass, len, t1, t2); + + // clear rest of allocated space + const Register len_zero = len; + initialize_body(obj, arr_size, header_size * BytesPerWord, len_zero); + + membar(StoreStore); + + if (CURRENT_ENV->dtrace_alloc_probes()) { + assert(obj == r0, "must be"); + far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::dtrace_object_alloc_id))); + } + + verify_oop(obj); +} + + +void C1_MacroAssembler::inline_cache_check(Register receiver, Register iCache) { + verify_oop(receiver); + // explicit NULL check not needed since load from [klass_offset] causes a trap + // check against inline cache + assert(!MacroAssembler::needs_explicit_null_check(oopDesc::klass_offset_in_bytes()), "must add explicit null check"); + + cmp_klass(receiver, iCache, rscratch1); +} + +void C1_MacroAssembler::build_frame(int frame_size_in_bytes, + int bang_size_in_bytes) { + 
assert(bang_size_in_bytes >= frame_size_in_bytes, + "stack bang size incorrect"); + + // If we have to make this method not-entrant, we'll overwrite its first + // instruction with a jump. For this action to be legal we must ensure that + // this first instruction is a B, BL, NOP, BKPT, or SVC. Make it a NOP + nop(); + + // Make sure there is enough stack space for this method's activation + generate_stack_overflow_check(bang_size_in_bytes); + + // Push lr, rfp, and optionally update rfp. rfp points to the first stack + // word used by the new frame. + stmdb(sp, RegSet::of(rfp, lr).bits()); + if (PreserveFramePointer) { + add(rfp, sp, BytesPerWord); + } + + // Create frame. frame_size_in_bytes always comes from + // LIR_Assembler::initial_frame_size_in_bytes() method, and it already + // takes into account two stack words spent on saving lr and rfp. + decrement(sp, frame_size_in_bytes); +} + +void C1_MacroAssembler::remove_frame(int frame_size_in_bytes) { + // Remove frame. frame_size_in_bytes always comes from + // LIR_Assembler::initial_frame_size_in_bytes() method, and it already + // takes into account two stack words spent on saving lr and rfp. + increment(sp, frame_size_in_bytes); + + // Pop rfp and lr + ldmia(sp, RegSet::of(rfp, lr).bits()); +} + +void C1_MacroAssembler::verified_entry() { +} + +#ifndef PRODUCT + +void C1_MacroAssembler::verify_stack_oop(int stack_offset) { + if (!VerifyOops) return; + verify_oop_addr(Address(sp, stack_offset), "oop"); +} + +void C1_MacroAssembler::verify_not_null_oop(Register r) { + if (!VerifyOops) return; + Label not_null; + cbnz(r, not_null); + stop("non-null oop required"); + bind(not_null); + verify_oop(r); +} + +void C1_MacroAssembler::invalidate_registers(bool inv_r0, bool inv_r2, bool inv_r3) { +#ifdef ASSERT + static int nn; + if (inv_r0) mov(r0, 0xDEAD); + if (inv_r2) mov(r2, nn++); + if (inv_r3) mov(r3, 0xDEAD); +#endif +} +#endif // ifndef PRODUCT --- /dev/null 2016-08-26 13:08:01.000000000 +0300 +++ new/src/cpu/aarch32/vm/c1_MacroAssembler_aarch32.hpp 2016-08-26 13:08:01.000000000 +0300 @@ -0,0 +1,113 @@ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ +// This file is a derivative work resulting from (and including) modifications +// made by Azul Systems, Inc. The dates of such changes are 2013-2016. +// Copyright 2013-2016 Azul Systems, Inc. All Rights Reserved. 
+// +// Please contact Azul Systems, 385 Moffett Park Drive, Suite 115, Sunnyvale, +// CA 94089 USA or visit www.azul.com if you need additional information or +// have any questions. + +#ifndef CPU_AARCH32_VM_C1_MACROASSEMBLER_AARCH32_HPP +#define CPU_AARCH32_VM_C1_MACROASSEMBLER_AARCH32_HPP + +// C1_MacroAssembler contains high-level macros for C1 + + private: + int _rsp_offset; // track rsp changes + // initialization + void pd_init() { _rsp_offset = 0; } + +void zero_memory(Register addr, Register len, Register t1); + + public: + void try_allocate( + Register obj, // result: pointer to object after successful allocation + Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise + int con_size_in_bytes, // object size in bytes if known at compile time + Register t1, // temp register + Register t2, // temp register + Label& slow_case // continuation point if fast allocation fails + ); + + void initialize_header(Register obj, Register klass, Register len, Register t1, Register t2); + void initialize_body(Register obj, Register len_in_bytes, int hdr_size_in_bytes, Register t1); + + void float_cmp(bool is_float, int unordered_result, + FloatRegister f0, FloatRegister f1, + Register result); + + // locking + // hdr : must be r0, contents destroyed + // obj : must point to the object to lock, contents preserved + // disp_hdr: must point to the displaced header location, contents preserved + // scratch : scratch register, contents destroyed + // returns code offset at which to add null check debug information + int lock_object (Register swap, Register obj, Register disp_hdr, Register scratch, Label& slow_case); + + // unlocking + // hdr : contents destroyed + // obj : must point to the object to lock, contents preserved + // disp_hdr: must be r0 & must point to the displaced header location, contents destroyed + void unlock_object(Register swap, Register obj, Register lock, Label& slow_case); + + void initialize_object( + Register obj, // result: pointer to object after successful allocation + Register klass, // object klass + Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise + int con_size_in_bytes, // object size in bytes if known at compile time + Register t1, // temp register + Register t2 // temp register + ); + + // allocation of fixed-size objects + // (can also be used to allocate fixed-size arrays, by setting + // hdr_size correctly and storing the array length afterwards) + // obj : will contain pointer to allocated object + // t1, t2 : scratch registers - contents destroyed + // header_size: size of object header in words + // object_size: total size of object in words + // slow_case : exit to slow case implementation if fast allocation fails + void allocate_object(Register obj, Register t1, Register t2, int header_size, int object_size, Register klass, Label& slow_case); + + enum { + max_array_allocation_length = 0x00FFFFFF + }; + + // allocation of arrays + // obj : will contain pointer to allocated object + // len : array length in number of elements + // t : scratch register - contents destroyed + // header_size: size of object header in words + // f : element scale factor + // slow_case : exit to slow case implementation if fast allocation fails + void allocate_array(Register obj, Register len, Register t, Register t2, int header_size, int f, Register klass, Label& slow_case); + + int rsp_offset() const { return _rsp_offset; } + void set_rsp_offset(int n) { _rsp_offset = n; } + + void 
invalidate_registers(bool inv_r0, bool inv_r2, bool inv_r3) PRODUCT_RETURN; + +#endif // CPU_AARCH32_VM_C1_MACROASSEMBLER_AARCH32_HPP --- /dev/null 2016-08-26 13:08:03.000000000 +0300 +++ new/src/cpu/aarch32/vm/c1_Runtime1_aarch32.cpp 2016-08-26 13:08:02.000000000 +0300 @@ -0,0 +1,1316 @@ +/* + * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ +// This file is a derivative work resulting from (and including) modifications +// made by Azul Systems, Inc. The dates of such changes are 2013-2016. +// Copyright 2013-2016 Azul Systems, Inc. All Rights Reserved. +// +// Please contact Azul Systems, 385 Moffett Park Drive, Suite 115, Sunnyvale, +// CA 94089 USA or visit www.azul.com if you need additional information or +// have any questions. 
+ +#include "precompiled.hpp" +#include "asm/assembler.hpp" +#include "c1/c1_CodeStubs.hpp" +#include "c1/c1_Defs.hpp" +#include "c1/c1_MacroAssembler.hpp" +#include "c1/c1_Runtime1.hpp" +#include "compiler/disassembler.hpp" +#include "interpreter/interpreter.hpp" +#include "nativeInst_aarch32.hpp" +#include "oops/compiledICHolder.hpp" +#include "oops/oop.inline.hpp" +#include "prims/jvmtiExport.hpp" +#include "register_aarch32.hpp" +#include "runtime/sharedRuntime.hpp" +#include "runtime/signature.hpp" +#include "runtime/vframe.hpp" +#include "runtime/vframeArray.hpp" +#include "vmreg_aarch32.inline.hpp" +#if INCLUDE_ALL_GCS +#include "gc_implementation/g1/g1SATBCardTableModRefBS.hpp" +#endif + +// Implementation of StubAssembler + +int StubAssembler::call_RT(Register oop_result1, Register metadata_result, address entry, int args_size) { + // setup registers + assert(!(oop_result1->is_valid() || metadata_result->is_valid()) || oop_result1 != metadata_result, "registers must be different"); + assert(oop_result1 != rthread && metadata_result != rthread, "registers must be different"); + assert(args_size >= 0, "illegal args_size"); + + mov(c_rarg0, rthread); + set_num_rt_args(0); // Nothing on stack + + Label retaddr; + set_last_Java_frame(sp, rfp, retaddr, rscratch1); + + // do the call + lea(rscratch1, RuntimeAddress(entry)); + bl(rscratch1); + bind(retaddr); + int call_offset = offset(); + // verify callee-saved register +#ifdef ASSERT + push(r0, sp); + { Label L; + get_thread(r0); + cmp(rthread, r0); + b(L, Assembler::EQ); + stop("StubAssembler::call_RT: rthread not callee saved?"); + bind(L); + } + pop(r0, sp); +#endif + reset_last_Java_frame(true, true); + maybe_isb(); + + // check for pending exceptions + { Label L; + // check for pending exceptions (java_thread is set upon return) + ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); + cbz(rscratch1, L); + mov(rscratch1, 0); + // exception pending => remove activation and forward to exception handler + // make sure that the vm_results are cleared + if (oop_result1->is_valid()) { + str(rscratch1, Address(rthread, JavaThread::vm_result_offset())); + } + if (metadata_result->is_valid()) { + str(rscratch1, Address(rthread, JavaThread::vm_result_2_offset())); + } + if (frame_size() == no_frame_size) { + leave(); + far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); + } else if (_stub_id == Runtime1::forward_exception_id) { + should_not_reach_here(); + } else { + far_jump(RuntimeAddress(Runtime1::entry_for(Runtime1::forward_exception_id))); + } + bind(L); + } + // get oop results if there are any and reset the values in the thread + if (oop_result1->is_valid()) { + get_vm_result(oop_result1, rthread); + } + if (metadata_result->is_valid()) { + get_vm_result_2(metadata_result, rthread); + } + return call_offset; +} + + +int StubAssembler::call_RT(Register oop_result1, Register metadata_result, address entry, Register arg1) { + mov(c_rarg1, arg1); + return call_RT(oop_result1, metadata_result, entry, 1); +} + + +int StubAssembler::call_RT(Register oop_result1, Register metadata_result, address entry, Register arg1, Register arg2) { + if (c_rarg1 == arg2) { + if (c_rarg2 == arg1) { + mov(rscratch1, arg1); + mov(arg1, arg2); + mov(arg2, rscratch1); + } else { + mov(c_rarg2, arg2); + mov(c_rarg1, arg1); + } + } else { + mov(c_rarg1, arg1); + mov(c_rarg2, arg2); + } + return call_RT(oop_result1, metadata_result, entry, 2); +} + + +int StubAssembler::call_RT(Register oop_result1, Register 
metadata_result, address entry, Register arg1, Register arg2, Register arg3) { + // if there is any conflict use the stack + if (arg1 == c_rarg2 || arg1 == c_rarg3 || + arg2 == c_rarg1 || arg2 == c_rarg3 || + arg3 == c_rarg1 || arg3 == c_rarg2) { + push(arg2); + push(arg3); + push(arg1); + pop(c_rarg1); + pop(c_rarg3); + pop(c_rarg2); + } else { + mov(c_rarg1, arg1); + mov(c_rarg2, arg2); + mov(c_rarg3, arg3); + } + return call_RT(oop_result1, metadata_result, entry, 3); +} + +// Implementation of StubFrame + +class StubFrame: public StackObj { + private: + StubAssembler* _sasm; + + public: + StubFrame(StubAssembler* sasm, const char* name, bool must_gc_arguments); + void load_argument(int offset_in_words, Register reg); + + ~StubFrame(); +};; + + +#define __ _sasm-> + +StubFrame::StubFrame(StubAssembler* sasm, const char* name, bool must_gc_arguments) { + _sasm = sasm; + __ set_info(name, must_gc_arguments); + __ enter(); +} + +// load parameters that were stored with LIR_Assembler::store_parameter +// Note: offsets for store_parameter and load_argument must match +void StubFrame::load_argument(int offset_in_words, Register reg) { + // - 1: link + // fp 0: return address + // + 1: argument with offset 0 + // + 2: argument with offset 1 + // + 3: ... + + __ ldr(reg, Address(rfp, (offset_in_words + 1) * BytesPerWord)); +} + + +StubFrame::~StubFrame() { + __ leave(); + __ ret(lr); +} + +#undef __ + + +// Implementation of Runtime1 + +#define __ sasm-> + +const int float_regs_as_doubles_size_in_slots = pd_nof_fpu_regs_frame_map * 2; + +// Stack layout for saving/restoring all the registers needed during a runtime +// call (this includes deoptimization) +// Note: note that users of this frame may well have arguments to some runtime +// while these values are on the stack. These positions neglect those arguments +// but the code in save_live_registers will take the argument count into +// account. +// + +enum reg_save_layout { + reg_save_s0, + reg_save_s31 = reg_save_s0 + 31, + reg_save_pad, // to align to doubleword to simplify conformance to APCS + reg_save_r0, + reg_save_r1, + reg_save_r2, + reg_save_r3, + reg_save_r4, + reg_save_r5, + reg_save_r6, + reg_save_r7, + reg_save_r8, + reg_save_r9, + reg_save_r10, + reg_save_r11, + reg_save_r12, + // pushed by enter + rfp_off, + return_off, + reg_save_frame_size +}; + +// Save off registers which might be killed by calls into the runtime. +// Tries to smart of about FP registers. In particular we separate +// saving and describing the FPU registers for deoptimization since we +// have to save the FPU registers twice if we describe them. The +// deopt blob is the only thing which needs to describe FPU registers. +// In all other cases it should be sufficient to simply save their +// current value. 
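For orientation, the save area described by reg_save_layout works out as follows, assuming BytesPerWord == 4 and FrameMap::nof_fpu_regs == 32 (i.e. s0..s31); offsets are from sp once save_live_registers() below has completed.

    // sp +   0 .. +127 : s0..s31        (32 words, stored by vstmdb_f64 or reserved)
    // sp + 128         : alignment pad  (reg_save_pad, keeps the frame doubleword aligned)
    // sp + 132 .. +180 : r0..r12        (13 words, pushed by save_live_registers)
    // sp + 184         : saved rfp      (pushed by enter())
    // sp + 188         : return address
    //
    // reg_save_frame_size == 48 words == 192 bytes, which is the frame size that
    // generate_oop_map() reports to the GC.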
+ +static int cpu_reg_save_offsets[FrameMap::nof_cpu_regs]; +static int fpu_reg_save_offsets[FrameMap::nof_fpu_regs]; +static int reg_save_size_in_words; +static int frame_size_in_bytes = -1; + +static OopMap* generate_oop_map(StubAssembler* sasm, bool save_fpu_registers) { + int frame_size_in_bytes = reg_save_frame_size * BytesPerWord; + sasm->set_frame_size(frame_size_in_bytes / BytesPerWord); + int frame_size_in_slots = frame_size_in_bytes / sizeof(jint); + OopMap* oop_map = new OopMap(frame_size_in_slots, 0); + + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r0), r0->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r1), r1->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r2), r2->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r3), r3->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r4), r4->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r5), r5->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r6), r6->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r7), r7->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r8), r8->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r9), r9->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r10), r10->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r11), r11->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r12), r12->as_VMReg()); + + for (int i = 0; i < 32; ++i) { + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_s0 + i), as_FloatRegister(i)->as_VMReg()); + } + + return oop_map; +} + +static OopMap* save_live_registers(StubAssembler* sasm, + bool save_fpu_registers = true) { + __ block_comment("save_live_registers"); + + __ push(RegSet::range(r0, r12), sp); // integer registers except lr & sp + __ sub(sp, sp, 4); // align to 8 bytes + + if (save_fpu_registers) { + __ vstmdb_f64(sp, (1 << FrameMap::nof_fpu_regs / 2) - 1); + } else { + __ sub(sp, sp, FrameMap::nof_fpu_regs * 4); + } + + return generate_oop_map(sasm, save_fpu_registers); +} + +static void restore_live_registers(StubAssembler* sasm, bool restore_fpu_registers = true) { + if (restore_fpu_registers) { + __ vldmia_f64(sp, (1 << FrameMap::nof_fpu_regs / 2) - 1); + } else { + __ add(sp, sp, FrameMap::nof_fpu_regs * 4); + } + + __ add(sp, sp, 4); + __ pop(RegSet::range(r0, r12), sp); +} + +static void restore_live_registers_except_r0(StubAssembler* sasm, bool restore_fpu_registers = true) { + + if (restore_fpu_registers) { + __ vldmia_f64(sp, (1 << FrameMap::nof_fpu_regs / 2) - 1); + } else { + __ add(sp, sp, FrameMap::nof_fpu_regs * 4); + } + + __ add(sp, sp, 8); + __ pop(RegSet::range(r1, r12), sp); +} + +void Runtime1::initialize_pd() { +} + +// target: the entry point of the method that creates and posts the exception oop +// has_argument: true if the exception needs an argument (passed in rscratch1) + +OopMapSet* Runtime1::generate_exception_throw(StubAssembler* sasm, address target, bool has_argument) { + // make a frame and preserve the caller's caller-save registers + OopMap* oop_map = save_live_registers(sasm); + int call_offset; + if (!has_argument) { + call_offset = __ call_RT(noreg, noreg, target); + } else { + call_offset = __ call_RT(noreg, noreg, target, rscratch1); + } + OopMapSet* oop_maps = new OopMapSet(); + oop_maps->add_gc_map(call_offset, oop_map); + + __ should_not_reach_here(); + return 
oop_maps; +} + + +OopMapSet* Runtime1::generate_handle_exception(StubID id, StubAssembler *sasm) { + __ block_comment("generate_handle_exception"); + + // incoming parameters + const Register exception_oop = r0; + const Register exception_pc = r3; + // other registers used in this stub + + // Save registers, if required. + OopMapSet* oop_maps = new OopMapSet(); + OopMap* oop_map = NULL; + switch (id) { + case forward_exception_id: + // We're handling an exception in the context of a compiled frame. + // The registers have been saved in the standard places. Perform + // an exception lookup in the caller and dispatch to the handler + // if found. Otherwise unwind and dispatch to the callers + // exception handler. + oop_map = generate_oop_map(sasm, 1 /*thread*/); + __ mov(rscratch1, 0); + + // load and clear pending exception oop into r0 + __ ldr(exception_oop, Address(rthread, Thread::pending_exception_offset())); + __ str(rscratch1, Address(rthread, Thread::pending_exception_offset())); + + // load issuing PC (the return address for this stub) into r3 + __ ldr(exception_pc, Address(rfp)); + + // make sure that the vm_results are cleared (may be unnecessary) + __ str(rscratch1, Address(rthread, JavaThread::vm_result_offset())); + __ str(rscratch1, Address(rthread, JavaThread::vm_result_2_offset())); + break; + case handle_exception_nofpu_id: + case handle_exception_id: + // At this point all registers MAY be live. + oop_map = save_live_registers(sasm, id != handle_exception_nofpu_id); + break; + case handle_exception_from_callee_id: { + // At this point all registers except exception oop (r0) and + // exception pc (lr) are dead. + const int frame_size = 2 /*fp, return address*/; + assert(frame_size*wordSize % StackAlignmentInBytes == 0, "must be"); + oop_map = new OopMap(frame_size * VMRegImpl::slots_per_word, 0); + sasm->set_frame_size(frame_size); + break; + } + default: + __ should_not_reach_here(); + break; + } + + // verify that only r0 and r3 are valid at this time + __ invalidate_registers(false, true, false); + // verify that r0 contains a valid exception + __ verify_not_null_oop(exception_oop); + +#ifdef ASSERT + // check that fields in JavaThread for exception oop and issuing pc are + // empty before writing to them + Label oop_empty; + __ ldr(rscratch1, Address(rthread, JavaThread::exception_oop_offset())); + __ cbz(rscratch1, oop_empty); + __ stop("exception oop already set"); + __ bind(oop_empty); + + Label pc_empty; + __ ldr(rscratch1, Address(rthread, JavaThread::exception_pc_offset())); + __ cbz(rscratch1, pc_empty); + __ stop("exception pc already set"); + __ bind(pc_empty); +#endif + + // save exception oop and issuing pc into JavaThread + // (exception handler will load it from here) + __ str(exception_oop, Address(rthread, JavaThread::exception_oop_offset())); + __ str(exception_pc, Address(rthread, JavaThread::exception_pc_offset())); + + // patch throwing pc into return address (has bci & oop map) + __ str(exception_pc, Address(rfp)); + + // compute the exception handler. + // the exception oop and the throwing pc are read from the fields in JavaThread + int call_offset = __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, exception_handler_for_pc)); + oop_maps->add_gc_map(call_offset, oop_map); + + // r0: handler address + // will be the deopt blob if nmethod was deoptimized while we looked up + // handler regardless of whether handler existed in the nmethod. 
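Read as an outline, the rest of this stub does the following (a paraphrase of the surrounding code, not new behaviour):

    // [rfp] = exception_pc;                  // throwing pc patched into the return slot (above)
    // r0    = exception_handler_for_pc(thread);
    // [rfp] = r0;                            // return slot now points at the handler (or deopt blob)
    // restore the saved registers (or just leave(), for handle_exception_from_callee_id)
    // and return -- the "return" transfers control straight into the exception handler.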
+ + // only r0 is valid at this time, all other registers have been destroyed by the runtime call + __ invalidate_registers(false, true, true); + + // patch the return address, this stub will directly return to the exception handler + __ str(r0, Address(rfp)); + + switch (id) { + case forward_exception_id: + case handle_exception_nofpu_id: + case handle_exception_id: + // Restore the registers that were saved at the beginning. + restore_live_registers(sasm, id != handle_exception_nofpu_id); + break; + case handle_exception_from_callee_id: + // Pop the return address. + __ leave(); + __ ret(lr); // jump to exception handler + break; + default: ShouldNotReachHere(); + } + + return oop_maps; +} + + +void Runtime1::generate_unwind_exception(StubAssembler *sasm) { + // incoming parameters + const Register exception_oop = r0; + // other registers used in this stub + const Register exception_pc = r3; + const Register handler_addr = r1; + + // verify that only r0, is valid at this time + __ invalidate_registers(false, true, true); + +#ifdef ASSERT + // check that fields in JavaThread for exception oop and issuing pc are empty + Label oop_empty; + __ ldr(rscratch1, Address(rthread, JavaThread::exception_oop_offset())); + __ cbz(rscratch1, oop_empty); + __ stop("exception oop must be empty"); + __ bind(oop_empty); + + Label pc_empty; + __ ldr(rscratch1, Address(rthread, JavaThread::exception_pc_offset())); + __ cbz(rscratch1, pc_empty); + __ stop("exception pc must be empty"); + __ bind(pc_empty); +#endif + + // Save our return address because + // exception_handler_for_return_address will destroy it. We also + // save exception_oop + __ push(exception_oop); + __ push(lr); + + // search the exception handler address of the caller (using the return address) + __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, lr); + // r0: exception handler address of the caller + + // Only R0 is valid at this time; all other registers have been + // destroyed by the call. + __ invalidate_registers(false, true, true); + + // move result of call into correct register + __ mov(handler_addr, r0); + + // get throwing pc (= return address). + // lr has been destroyed by the call + __ pop(lr); + __ pop(exception_oop); + __ mov(r3, lr); + + __ verify_not_null_oop(exception_oop); + + // continue at exception handler (return address removed) + // note: do *not* remove arguments when unwinding the + // activation since the caller assumes having + // all arguments on the stack when entering the + // runtime to determine the exception handler + // (GC happens at call site with arguments!) + // r0: exception oop + // r3: throwing pc + // r1: exception handler + __ b(handler_addr); +} + + + +OopMapSet* Runtime1::generate_patching(StubAssembler* sasm, address target) { + // use the maximum number of runtime-arguments here because it is difficult to + // distinguish each RT-Call. + // Note: This number affects also the RT-Call in generate_handle_exception because + // the oop-map is shared for all calls. 
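The code generated below amounts to the following contract between this stub and the patching runtime entry, sketched in pseudo-C (target is one of access_field_patching / move_klass_patching / move_mirror_patching / move_appendix_patching):

    // bool deoptimized = target(thread);
    // if (thread->has_pending_exception()) {
    //   if (!deoptimized) goto Runtime1::forward_exception_id;
    //   // otherwise move the exception into the JavaThread fields and let the
    //   // deopt blob pick it up from there:
    //   goto deopt_blob->unpack_with_exception_in_tls();
    // }
    // if (deoptimized) goto deopt_blob->unpack_with_reexecution();
    // return;   // caller simply re-executes the now-patched code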
+ DeoptimizationBlob* deopt_blob = SharedRuntime::deopt_blob(); + assert(deopt_blob != NULL, "deoptimization blob must have been created"); + + OopMap* oop_map = save_live_registers(sasm); + + __ mov(c_rarg0, rthread); + Label retaddr; + __ set_last_Java_frame(sp, rfp, retaddr, rscratch1); + // do the call + __ lea(rscratch1, RuntimeAddress(target)); + __ bl(rscratch1); + __ bind(retaddr); + OopMapSet* oop_maps = new OopMapSet(); + oop_maps->add_gc_map(__ offset(), oop_map); + // verify callee-saved register +#ifdef ASSERT + { Label L; + __ get_thread(rscratch1); + __ cmp(rthread, rscratch1); + __ b(L, Assembler::EQ); + __ stop("StubAssembler::call_RT: rthread not callee saved?"); + __ bind(L); + } +#endif + __ reset_last_Java_frame(true, false); + __ maybe_isb(); + + // check for pending exceptions + { Label L; + __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); + __ cbz(rscratch1, L); + // exception pending => remove activation and forward to exception handler + + { Label L1; + __ cbnz(r0, L1); // have we deoptimized? + __ far_jump(RuntimeAddress(Runtime1::entry_for(Runtime1::forward_exception_id))); + __ bind(L1); + } + + // the deopt blob expects exceptions in the special fields of + // JavaThread, so copy and clear pending exception. + + // load and clear pending exception + __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); + __ mov(rscratch1, 0); + __ str(rscratch1, Address(rthread, Thread::pending_exception_offset())); + + // check that there is really a valid exception + __ verify_not_null_oop(r0); + + // load throwing pc: this is the return address of the stub + __ ldr(r3, Address(rfp)); + +#ifdef ASSERT + // check that fields in JavaThread for exception oop and issuing pc are empty + Label oop_empty; + __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); + __ cbz(rscratch1, oop_empty); + __ stop("exception oop must be empty"); + __ bind(oop_empty); + + Label pc_empty; + __ ldr(rscratch1, Address(rthread, JavaThread::exception_pc_offset())); + __ cbz(rscratch1, pc_empty); + __ stop("exception pc must be empty"); + __ bind(pc_empty); +#endif + + // store exception oop and throwing pc to JavaThread + __ str(r0, Address(rthread, JavaThread::exception_oop_offset())); + __ str(r3, Address(rthread, JavaThread::exception_pc_offset())); + + restore_live_registers(sasm); + + __ leave(); + + // Forward the exception directly to deopt blob. We can blow no + // registers and must leave throwing pc on the stack. A patch may + // have values live in registers so the entry point with the + // exception in tls. + __ far_jump(RuntimeAddress(deopt_blob->unpack_with_exception_in_tls())); + + __ bind(L); + } + + + // Runtime will return true if the nmethod has been deoptimized during + // the patching process. In that case we must do a deopt reexecute instead. + + Label reexecuteEntry, cont; + + __ cbz(r0, cont); // have we deoptimized? + + // Will reexecute. 
Proper return address is already on the stack we just restore + // registers, pop all of our frame but the return address and jump to the deopt blob + restore_live_registers(sasm); + __ leave(); + __ far_jump(RuntimeAddress(deopt_blob->unpack_with_reexecution())); + + __ bind(cont); + restore_live_registers(sasm); + __ leave(); + __ ret(lr); + + return oop_maps; +} + + +OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) { + + const Register exception_oop = r0; + const Register exception_pc = r3; + + // for better readability + const bool must_gc_arguments = true; + const bool dont_gc_arguments = false; + + // default value; overwritten for some optimized stubs that are called from methods that do not use the fpu + bool save_fpu_registers = true; + + // stub code & info for the different stubs + OopMapSet* oop_maps = NULL; + OopMap* oop_map = NULL; + switch (id) { + { + case forward_exception_id: + { + oop_maps = generate_handle_exception(id, sasm); + __ leave(); + __ ret(lr); + } + break; + + case throw_div0_exception_id: + { StubFrame f(sasm, "throw_div0_exception", dont_gc_arguments); + oop_maps = generate_exception_throw(sasm, CAST_FROM_FN_PTR(address, throw_div0_exception), false); + } + break; + + case throw_null_pointer_exception_id: + { StubFrame f(sasm, "throw_null_pointer_exception", dont_gc_arguments); + oop_maps = generate_exception_throw(sasm, CAST_FROM_FN_PTR(address, throw_null_pointer_exception), false); + } + break; + + case new_instance_id: + case fast_new_instance_id: + case fast_new_instance_init_check_id: + { + Register klass = r3; // Incoming + Register obj = r0; // Result + + if (id == new_instance_id) { + __ set_info("new_instance", dont_gc_arguments); + } else if (id == fast_new_instance_id) { + __ set_info("fast new_instance", dont_gc_arguments); + } else { + assert(id == fast_new_instance_init_check_id, "bad StubID"); + __ set_info("fast new_instance init check", dont_gc_arguments); + } + + if ((id == fast_new_instance_id || id == fast_new_instance_init_check_id) && + UseTLAB && FastTLABRefill) { + Label slow_path; + Register obj_size = r2; + Register t1 = r5; + Register t2 = r4; + assert_different_registers(klass, obj, obj_size, t1, t2); + + __ push(t1); + __ push(r5); + + if (id == fast_new_instance_init_check_id) { + // make sure the klass is initialized + __ ldrb(rscratch1, Address(klass, InstanceKlass::init_state_offset())); + __ cmp(rscratch1, InstanceKlass::fully_initialized); + __ b(slow_path, Assembler::NE); + } + +#ifdef ASSERT + // assert object can be fast path allocated + { + Label ok, not_ok; + __ ldr(obj_size, Address(klass, Klass::layout_helper_offset())); + __ cmp(obj_size, 0u); + __ b(not_ok, Assembler::LE); // Make sure it's an instance (layout helper is positive) + __ tst(obj_size, Klass::_lh_instance_slow_path_bit); + __ b(ok, Assembler::EQ); + __ bind(not_ok); + __ stop("assert(can be fast path allocated)"); + __ should_not_reach_here(); + __ bind(ok); + } +#endif // ASSERT + + // if we got here then the TLAB allocation failed, so try + // refilling the TLAB or allocating directly from eden. 
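For instance klasses the layout helper is simply the object size in bytes (the _lh_instance_slow_path_bit case never reaches this fast path, as the assert above checks), so both the retry_tlab and try_eden paths below boil down to this sketch:

    // obj_size = klass->layout_helper();          // instance size in bytes, > 0
    // obj      = tlab_allocate(obj_size);         // or eden_allocate + incr_allocated_bytes
    // initialize_object(obj, klass, obj_size);    // header plus zeroed body
    // return obj;                                 // result in r0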
+ Label retry_tlab, try_eden; + __ tlab_refill(retry_tlab, try_eden, slow_path); // does not destroy r3 (klass), returns r5 + + __ bind(retry_tlab); + + // get the instance size (size is postive so movl is fine for 64bit) + __ ldr(obj_size, Address(klass, Klass::layout_helper_offset())); + + __ tlab_allocate(obj, obj_size, 0, t1, t2, slow_path); + + __ initialize_object(obj, klass, obj_size, 0, t1, t2); + __ verify_oop(obj); + __ pop(r5); + __ pop(t1); + __ ret(lr); + + __ bind(try_eden); + // get the instance size (size is postive so movl is fine for 64bit) + __ ldr(obj_size, Address(klass, Klass::layout_helper_offset())); + + __ eden_allocate(obj, obj_size, 0, t1, slow_path); + __ incr_allocated_bytes(rthread, obj_size, 0, rscratch1); + + __ initialize_object(obj, klass, obj_size, 0, t1, t2); + __ verify_oop(obj); + __ pop(r5); + __ pop(t1); + __ ret(lr); + + __ bind(slow_path); + __ pop(r5); + __ pop(t1); + } + + __ enter(); + OopMap* map = save_live_registers(sasm); + int call_offset = __ call_RT(obj, noreg, CAST_FROM_FN_PTR(address, new_instance), klass); + oop_maps = new OopMapSet(); + oop_maps->add_gc_map(call_offset, map); + restore_live_registers_except_r0(sasm); + __ verify_oop(obj); + __ leave(); + __ ret(lr); + + // r0,: new instance + } + + break; + + case counter_overflow_id: + { + Register bci = r0, method = r1; + __ enter(); + OopMap* map = save_live_registers(sasm); + // Retrieve bci + __ ldr(bci, Address(rfp, 2*BytesPerWord)); + // And a pointer to the Method* + __ ldr(method, Address(rfp, 3*BytesPerWord)); + int call_offset = __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, counter_overflow), bci, method); + oop_maps = new OopMapSet(); + oop_maps->add_gc_map(call_offset, map); + restore_live_registers(sasm); + __ leave(); + __ ret(lr); + } + break; + + case new_type_array_id: + case new_object_array_id: + { + Register length = r6; // Incoming + Register klass = r3; // Incoming + Register obj = r0; // Result + + if (id == new_type_array_id) { + __ set_info("new_type_array", dont_gc_arguments); + } else { + __ set_info("new_object_array", dont_gc_arguments); + } + +#ifdef ASSERT + // assert object type is really an array of the proper kind + { + Label ok; + Register t0 = obj; + __ ldr(t0, Address(klass, Klass::layout_helper_offset())); + __ asr(t0, t0, Klass::_lh_array_tag_shift); + int tag = ((id == new_type_array_id) + ? Klass::_lh_array_tag_type_value + : Klass::_lh_array_tag_obj_value); + __ mov(rscratch1, tag); + __ cmp(t0, rscratch1); + __ b(ok, Assembler::EQ); + __ stop("assert(is an array klass)"); + __ should_not_reach_here(); + __ bind(ok); + } +#endif // ASSERT + + if (UseTLAB && FastTLABRefill) { + Register arr_size = r4; + Register t1 = r2; + Register t2 = r5; + Label slow_path; + assert_different_registers(length, klass, obj, arr_size, t1, t2); + + // check that array length is small enough for fast path. + __ mov(rscratch1, C1_MacroAssembler::max_array_allocation_length); + __ cmp(length, rscratch1); + __ b(slow_path, Assembler::HI); + + // if we got here then the TLAB allocation failed, so try + // refilling the TLAB or allocating directly from eden. 
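Both array allocation paths below decode Klass::layout_helper the same way; in plain C the size computation is roughly:

    // log2_esize = layout_helper & 0x1F;                        // element size shift
    // hdr_size   = (layout_helper >> _lh_header_size_shift)
    //              & _lh_header_size_mask;                      // header size in bytes
    // arr_size   = (hdr_size + (length << log2_esize)
    //               + MinObjAlignmentInBytesMask) & ~MinObjAlignmentInBytesMask;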
+ Label retry_tlab, try_eden; + const Register thread = + __ tlab_refill(retry_tlab, try_eden, slow_path); // preserves r6 & r3, returns rthread + + __ bind(retry_tlab); + + // get the allocation size: round_up(hdr + length << (layout_helper & 0x1F)) + // since size is positive ldrw does right thing on 64bit + __ ldr(t1, Address(klass, Klass::layout_helper_offset())); + __ andr(rscratch1, t1, 0x1f); + __ lsl(arr_size, length, rscratch1); + __ extract_bits(t1, t1, Klass::_lh_header_size_shift, + exact_log2(Klass::_lh_header_size_mask + 1)); + __ add(arr_size, arr_size, t1); + __ add(arr_size, arr_size, MinObjAlignmentInBytesMask); // align up + __ mov(rscratch1, ~MinObjAlignmentInBytesMask); + __ andr(arr_size, arr_size, rscratch1); + + __ tlab_allocate(obj, arr_size, 0, t1, t2, slow_path); // preserves arr_size + + __ initialize_header(obj, klass, length, t1, t2); + // Assume Little-Endian + __ ldrb(t1, Address(klass, in_bytes(Klass::layout_helper_offset()) + (Klass::_lh_header_size_shift / BitsPerByte))); + assert(Klass::_lh_header_size_shift % BitsPerByte == 0, "bytewise"); + assert(Klass::_lh_header_size_mask <= 0xFF, "bytewise"); + __ andr(t1, t1, Klass::_lh_header_size_mask); + __ sub(arr_size, arr_size, t1); // body length + __ add(t1, t1, obj); // body start + __ initialize_body(t1, arr_size, 0, t2); + __ verify_oop(obj); + + __ ret(lr); + + __ bind(try_eden); + // get the allocation size: round_up(hdr + length << (layout_helper & 0x1F)) + // since size is positive ldrw does right thing on 64bit + __ ldr(t1, Address(klass, Klass::layout_helper_offset())); + __ andr(rscratch1, t1, 0x1f); + __ lsl(arr_size, length, rscratch1); + __ extract_bits(t1, t1, Klass::_lh_header_size_shift, + exact_log2(Klass::_lh_header_size_mask + 1)); + __ add(arr_size, arr_size, t1); + __ add(arr_size, arr_size, MinObjAlignmentInBytesMask); // align up + __ mov(rscratch1, ~MinObjAlignmentInBytesMask); + __ andr(arr_size, arr_size, rscratch1); + + __ eden_allocate(obj, arr_size, 0, t1, slow_path); // preserves arr_size + __ incr_allocated_bytes(thread, arr_size, 0, rscratch1); + + __ initialize_header(obj, klass, length, t1, t2); + // Assume Little-Endian + __ ldrb(t1, Address(klass, in_bytes(Klass::layout_helper_offset()) + (Klass::_lh_header_size_shift / BitsPerByte))); + assert(Klass::_lh_header_size_shift % BitsPerByte == 0, "bytewise"); + assert(Klass::_lh_header_size_mask <= 0xFF, "bytewise"); + __ andr(t1, t1, Klass::_lh_header_size_mask); + __ sub(arr_size, arr_size, t1); // body length + __ add(t1, t1, obj); // body start + __ initialize_body(t1, arr_size, 0, t2); + __ verify_oop(obj); + + __ ret(lr); + + __ bind(slow_path); + } + + __ enter(); + OopMap* map = save_live_registers(sasm); + int call_offset; + if (id == new_type_array_id) { + call_offset = __ call_RT(obj, noreg, CAST_FROM_FN_PTR(address, new_type_array), klass, length); + } else { + call_offset = __ call_RT(obj, noreg, CAST_FROM_FN_PTR(address, new_object_array), klass, length); + } + + oop_maps = new OopMapSet(); + oop_maps->add_gc_map(call_offset, map); + restore_live_registers_except_r0(sasm); + + __ verify_oop(obj); + __ leave(); + __ ret(lr); + + // r0: new array + } + break; + + case new_multi_array_id: + { StubFrame f(sasm, "new_multi_array", dont_gc_arguments); + // r1: klass + // r2: rank + // r3: address of 1st dimension + OopMap* map = save_live_registers(sasm); + int call_offset = __ call_RT(r0, noreg, CAST_FROM_FN_PTR(address, new_multi_array), r1, r2, r3); + + oop_maps = new OopMapSet(); + 
oop_maps->add_gc_map(call_offset, map); + restore_live_registers_except_r0(sasm); + + // r0,: new multi array + __ verify_oop(r0); + } + break; + + case register_finalizer_id: + { + __ set_info("register_finalizer", dont_gc_arguments); + + // This is called via call_runtime so the arguments + // will be place in C abi locations + + __ verify_oop(c_rarg0); + + // load the klass and check the has finalizer flag + Label register_finalizer; + Register t = r5; + __ load_klass(t, r0); + __ ldr(t, Address(t, Klass::access_flags_offset())); + __ tst(t, JVM_ACC_HAS_FINALIZER); + __ b(register_finalizer, Assembler::NE); + __ ret(lr); + + __ bind(register_finalizer); + __ enter(); + OopMap* oop_map = save_live_registers(sasm); + int call_offset = __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, SharedRuntime::register_finalizer), r0); + oop_maps = new OopMapSet(); + oop_maps->add_gc_map(call_offset, oop_map); + + // Now restore all the live registers + restore_live_registers(sasm); + + __ leave(); + __ ret(lr); + } + break; + + case throw_class_cast_exception_id: + { StubFrame f(sasm, "throw_class_cast_exception", dont_gc_arguments); + oop_maps = generate_exception_throw(sasm, CAST_FROM_FN_PTR(address, throw_class_cast_exception), true); + } + break; + + case throw_incompatible_class_change_error_id: + { StubFrame f(sasm, "throw_incompatible_class_cast_exception", dont_gc_arguments); + oop_maps = generate_exception_throw(sasm, CAST_FROM_FN_PTR(address, throw_incompatible_class_change_error), false); + } + break; + + case slow_subtype_check_id: + { + // Typical calling sequence: + // __ push(klass_RInfo); // object klass or other subclass + // __ push(sup_k_RInfo); // array element klass or other superclass + // __ bl(slow_subtype_check); + // Note that the subclass is pushed first, and is therefore deepest. 
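Continuing that calling sequence: because result_off aliases sup_k_off in the layout enum that follows, the caller reads the answer back from the slot it pushed the superclass into and then drops both arguments. An illustrative continuation (not copied from a real call site) would be:

    // __ bl(slow_subtype_check);
    // __ ldr(result, Address(sp, 0));   // 1 = is a subtype, 0 = miss
    // __ add(sp, sp, 2 * wordSize);     // pop sup_k_RInfo and klass_RInfo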
+ enum layout { + r0_off, + r2_off, + r4_off, + r5_off, + sup_k_off, + klass_off, + framesize, + result_off = sup_k_off + }; + + __ set_info("slow_subtype_check", dont_gc_arguments); + __ push(RegSet::of(r0, r2, r4, r5), sp); + + // This is called by pushing args and not with C abi + __ ldr(r4, Address(sp, (klass_off) * VMRegImpl::stack_slot_size)); // subclass + __ ldr(r0, Address(sp, (sup_k_off) * VMRegImpl::stack_slot_size)); // superclass + + + Label miss; + __ check_klass_subtype_slow_path(r4, r0, r2, r5, NULL, &miss); + + // fallthrough on success: + __ mov(rscratch1, 1); + __ str(rscratch1, Address(sp, (result_off) * VMRegImpl::stack_slot_size)); // result + __ pop(RegSet::of(r0, r2, r4, r5), sp); + __ ret(lr); + + __ bind(miss); + __ mov(rscratch1, 0); + __ str(rscratch1, Address(sp, (result_off) * VMRegImpl::stack_slot_size)); // result + __ pop(RegSet::of(r0, r2, r4, r5), sp); + __ ret(lr); + } + break; + + case monitorenter_nofpu_id: + save_fpu_registers = false; + // fall through + case monitorenter_id: + { + StubFrame f(sasm, "monitorenter", dont_gc_arguments); + OopMap* map = save_live_registers(sasm, save_fpu_registers); + + // Called with store_parameter and not C abi + + f.load_argument(1, r0); // r0,: object + f.load_argument(0, r1); // r1,: lock address + + int call_offset = __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, monitorenter), r0, r1); + + oop_maps = new OopMapSet(); + oop_maps->add_gc_map(call_offset, map); + restore_live_registers(sasm, save_fpu_registers); + } + break; + + case monitorexit_nofpu_id: + save_fpu_registers = false; + // fall through + case monitorexit_id: + { + StubFrame f(sasm, "monitorexit", dont_gc_arguments); + OopMap* map = save_live_registers(sasm, save_fpu_registers); + + // Called with store_parameter and not C abi + + f.load_argument(0, r0); // r0,: lock address + + // note: really a leaf routine but must setup last java sp + // => use call_RT for now (speed can be improved by + // doing last java sp setup manually) + int call_offset = __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, monitorexit), r0); + + oop_maps = new OopMapSet(); + oop_maps->add_gc_map(call_offset, map); + restore_live_registers(sasm, save_fpu_registers); + } + break; + + case deoptimize_id: + { + StubFrame f(sasm, "deoptimize", dont_gc_arguments); + OopMap* oop_map = save_live_registers(sasm); + int call_offset = __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, deoptimize)); + oop_maps = new OopMapSet(); + oop_maps->add_gc_map(call_offset, oop_map); + restore_live_registers(sasm); + DeoptimizationBlob* deopt_blob = SharedRuntime::deopt_blob(); + assert(deopt_blob != NULL, "deoptimization blob must have been created"); + __ leave(); + __ far_jump(RuntimeAddress(deopt_blob->unpack_with_reexecution())); + } + break; + + case throw_range_check_failed_id: + { StubFrame f(sasm, "range_check_failed", dont_gc_arguments); + oop_maps = generate_exception_throw(sasm, CAST_FROM_FN_PTR(address, throw_range_check_exception), true); + } + break; + + case unwind_exception_id: + { __ set_info("unwind_exception", dont_gc_arguments); + // note: no stubframe since we are about to leave the current + // activation and we are calling a leaf VM function only. 
+ generate_unwind_exception(sasm); + } + break; + + case access_field_patching_id: + { StubFrame f(sasm, "access_field_patching", dont_gc_arguments); + // we should set up register map + oop_maps = generate_patching(sasm, CAST_FROM_FN_PTR(address, access_field_patching)); + } + break; + + case load_klass_patching_id: + { StubFrame f(sasm, "load_klass_patching", dont_gc_arguments); + // we should set up register map + oop_maps = generate_patching(sasm, CAST_FROM_FN_PTR(address, move_klass_patching)); + } + break; + + case load_mirror_patching_id: + { StubFrame f(sasm, "load_mirror_patching", dont_gc_arguments); + // we should set up register map + oop_maps = generate_patching(sasm, CAST_FROM_FN_PTR(address, move_mirror_patching)); + } + break; + + case load_appendix_patching_id: + { StubFrame f(sasm, "load_appendix_patching", dont_gc_arguments); + // we should set up register map + oop_maps = generate_patching(sasm, CAST_FROM_FN_PTR(address, move_appendix_patching)); + } + break; + + case handle_exception_nofpu_id: + case handle_exception_id: + { StubFrame f(sasm, "handle_exception", dont_gc_arguments); + oop_maps = generate_handle_exception(id, sasm); + } + break; + + case handle_exception_from_callee_id: + { StubFrame f(sasm, "handle_exception_from_callee", dont_gc_arguments); + oop_maps = generate_handle_exception(id, sasm); + } + break; + + case throw_index_exception_id: + { StubFrame f(sasm, "index_range_check_failed", dont_gc_arguments); + oop_maps = generate_exception_throw(sasm, CAST_FROM_FN_PTR(address, throw_index_exception), true); + } + break; + + case throw_array_store_exception_id: + { StubFrame f(sasm, "throw_array_store_exception", dont_gc_arguments); + // tos + 0: link + // + 1: return address + oop_maps = generate_exception_throw(sasm, CAST_FROM_FN_PTR(address, throw_array_store_exception), true); + } + break; + +#if INCLUDE_ALL_GCS + +// Registers to be saved around calls to g1_wb_pre or g1_wb_post +#define G1_SAVE_REGS (RegSet::range(r0, r12) - RegSet::of(rscratch1, rscratch2)) + + case g1_pre_barrier_slow_id: + { + StubFrame f(sasm, "g1_pre_barrier", dont_gc_arguments); + // arg0 : previous value of memory + + BarrierSet* bs = Universe::heap()->barrier_set(); + if (bs->kind() != BarrierSet::G1SATBCTLogging) { + __ mov(r0, (int)id); + __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, unimplemented_entry), r0); + __ should_not_reach_here(); + break; + } + + const Register pre_val = r0; + const Register thread = rthread; + const Register tmp = rscratch1; + + Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() + + PtrQueue::byte_offset_of_active())); + + Address queue_index(thread, in_bytes(JavaThread::satb_mark_queue_offset() + + PtrQueue::byte_offset_of_index())); + Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() + + PtrQueue::byte_offset_of_buf())); + + Label done; + Label runtime; + + // Can we store original value in the thread's buffer? 
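The fast path that follows is the usual SATB enqueue; roughly equivalent pseudo-C (field names are illustrative, the real code goes through JavaThread::satb_mark_queue_offset() and the PtrQueue byte offsets, with the index counting down in bytes):

    // intptr_t index = thread->satb_queue_index;
    // if (index == 0) goto runtime;                   // buffer full -> g1_wb_pre(pre_val, thread)
    // index -= wordSize;
    // thread->satb_queue_index = index;
    // *(thread->satb_queue_buf + index) = pre_val;    // record the previous value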
+ __ ldr(tmp, queue_index); + __ cbz(tmp, runtime); + + __ sub(tmp, tmp, wordSize); + __ str(tmp, queue_index); + __ ldr(rscratch2, buffer); + __ add(tmp, tmp, rscratch2); + f.load_argument(0, rscratch2); + __ str(rscratch2, Address(tmp, 0)); + __ b(done); + + __ bind(runtime); + __ push(G1_SAVE_REGS, sp); + f.load_argument(0, pre_val); + __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread); + __ pop(G1_SAVE_REGS, sp); + __ bind(done); + } + break; + case g1_post_barrier_slow_id: + { + StubFrame f(sasm, "g1_post_barrier", dont_gc_arguments); + + // arg0: store_address + Address store_addr(rfp, 2*BytesPerWord); + + BarrierSet* bs = Universe::heap()->barrier_set(); + CardTableModRefBS* ct = (CardTableModRefBS*)bs; + assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code"); + + Label done; + Label runtime; + + // At this point we know new_value is non-NULL and the new_value crosses regions. + // Must check to see if card is already dirty + + const Register thread = rthread; + + Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() + + PtrQueue::byte_offset_of_index())); + Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() + + PtrQueue::byte_offset_of_buf())); + + const Register card_addr = rscratch2; + ExternalAddress cardtable((address) ct->byte_map_base); + + f.load_argument(0, card_addr); + __ lsr(card_addr, card_addr, CardTableModRefBS::card_shift); + __ mov(rscratch1, cardtable); + __ add(card_addr, card_addr, rscratch1); + __ ldrb(rscratch1, Address(card_addr)); + __ cmp(rscratch1, (int)G1SATBCardTableModRefBS::g1_young_card_val()); + __ b(done, Assembler::EQ); + + assert((int)CardTableModRefBS::dirty_card_val() == 0, "must be 0"); + + __ membar(Assembler::StoreLoad); + __ ldrb(rscratch1, Address(card_addr)); + __ cbz(rscratch1, done); + + // storing region crossing non-NULL, card is clean. + // dirty card and log. 
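Putting the pieces of this stub together, the card handling is essentially the following (pseudo-C; dirty_card_val() is 0, as asserted above):

    // jbyte* card = byte_map_base + (store_addr >> card_shift);
    // if (*card == g1_young_card_val) return;    // young regions need no post barrier
    // StoreLoad fence;                           // order the field store against the re-read
    // if (*card == 0 /* already dirty */) return;
    // *card = 0;                                 // dirty it (the strb below) and enqueue the card
    //                                            // in the thread's dirty-card queue, falling back
    //                                            // to g1_wb_post(card, thread) when the queue is full.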
+ __ mov(rscratch1, 0); + __ strb(rscratch1, Address(card_addr)); + + __ ldr(rscratch1, queue_index); + __ cbz(rscratch1, runtime); + __ sub(rscratch1, rscratch1, wordSize); + __ str(rscratch1, queue_index); + + const Register buffer_addr = r0; + + __ push(RegSet::of(r0, r1), sp); + __ ldr(buffer_addr, buffer); + __ str(card_addr, Address(buffer_addr, rscratch1)); + __ pop(RegSet::of(r0, r1), sp); + __ b(done); + + __ bind(runtime); + __ push(G1_SAVE_REGS, sp); + __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread); + __ pop(G1_SAVE_REGS, sp); + __ bind(done); + + } + break; +#endif + + case predicate_failed_trap_id: + { + StubFrame f(sasm, "predicate_failed_trap", dont_gc_arguments); + + OopMap* map = save_live_registers(sasm); + + int call_offset = __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, predicate_failed_trap)); + oop_maps = new OopMapSet(); + oop_maps->add_gc_map(call_offset, map); + restore_live_registers(sasm); + __ leave(); + DeoptimizationBlob* deopt_blob = SharedRuntime::deopt_blob(); + assert(deopt_blob != NULL, "deoptimization blob must have been created"); + + __ far_jump(RuntimeAddress(deopt_blob->unpack_with_reexecution())); + } + break; + + + default: + { StubFrame f(sasm, "unimplemented entry", dont_gc_arguments); + __ mov(r0, (int)id); + __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, unimplemented_entry), r0); + __ should_not_reach_here(); + } + break; + } + } + return oop_maps; +} + +#undef __ + +const char *Runtime1::pd_name_for_address(address entry) { Unimplemented(); return 0; } --- /dev/null 2016-08-26 13:08:04.000000000 +0300 +++ new/src/cpu/aarch32/vm/c1_globals_aarch32.hpp 2016-08-26 13:08:04.000000000 +0300 @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ +// This file is a derivative work resulting from (and including) modifications +// made by Azul Systems, Inc. The dates of such changes are 2013-2016. +// Copyright 2013-2016 Azul Systems, Inc. All Rights Reserved. +// +// Please contact Azul Systems, 385 Moffett Park Drive, Suite 115, Sunnyvale, +// CA 94089 USA or visit www.azul.com if you need additional information or +// have any questions. + +#ifndef CPU_AARCH32_VM_C1_GLOBALS_AARCH32_HPP +#define CPU_AARCH32_VM_C1_GLOBALS_AARCH32_HPP + +#include "utilities/globalDefinitions.hpp" +#include "utilities/macros.hpp" + +// Sets the default values for platform dependent flags used by the client compiler. 
+// (see c1_globals.hpp) + +#ifndef TIERED +define_pd_global(bool, BackgroundCompilation, true ); +define_pd_global(bool, UseTLAB, true ); +define_pd_global(bool, ResizeTLAB, true ); +define_pd_global(bool, InlineIntrinsics, true ); +define_pd_global(bool, PreferInterpreterNativeStubs, false); +define_pd_global(bool, ProfileTraps, false); +define_pd_global(bool, UseOnStackReplacement, true); +define_pd_global(bool, TieredCompilation, false); +define_pd_global(intx, CompileThreshold, 1500 ); +define_pd_global(intx, BackEdgeThreshold, 100000); + +define_pd_global(intx, OnStackReplacePercentage, 933 ); +define_pd_global(intx, FreqInlineSize, 325 ); +define_pd_global(intx, NewSizeThreadIncrease, 4*K ); +define_pd_global(intx, InitialCodeCacheSize, 160*K); +define_pd_global(intx, ReservedCodeCacheSize, 32*M ); +define_pd_global(intx, NonProfiledCodeHeapSize, 13*M ); +define_pd_global(intx, ProfiledCodeHeapSize, 14*M ); +define_pd_global(intx, NonNMethodCodeHeapSize, 5*M ); +define_pd_global(bool, ProfileInterpreter, false); +define_pd_global(intx, CodeCacheExpansionSize, 32*K ); +define_pd_global(uintx, CodeCacheMinBlockLength, 1); +define_pd_global(uintx, CodeCacheMinimumUseSpace, 400*K); +define_pd_global(uintx, MetaspaceSize, 12*M ); +define_pd_global(bool, NeverActAsServerClassMachine, true ); +define_pd_global(uint64_t,MaxRAM, 1ULL*G); +define_pd_global(bool, CICompileOSR, true ); +#endif // !TIERED +define_pd_global(bool, UseTypeProfile, false); +define_pd_global(bool, RoundFPResults, true ); + +define_pd_global(bool, LIRFillDelaySlots, false); +define_pd_global(bool, OptimizeSinglePrecision, true ); +define_pd_global(bool, CSEArrayLength, true ); +define_pd_global(bool, TwoOperandLIRForm, false); + +define_pd_global(intx, SafepointPollOffset, 0 ); + +#endif // CPU_AARCH32_VM_C1_GLOBALS_AARCH32_HPP
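As a rough cross-check of the thresholds above, using the usual non-profiled interpreter counter formula (the exact scaling lives in invocationCounter.cpp and also folds in the counter's non-count bits):

    // OSR back-branch limit ~= CompileThreshold * OnStackReplacePercentage / 100
    //                        = 1500 * 933 / 100 ~= 13995 taken back-edges,
    // while ordinary invocation-triggered compilation fires after roughly
    // CompileThreshold (1500) invocations.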