--- /dev/null 2018-09-25 19:25:10.000000000 +0300
+++ new/src/hotspot/cpu/aarch32/macroAssembler_aarch32.cpp 2018-09-25 19:25:10.000000000 +0300
@@ -0,0 +1,4941 @@
+/*
+ * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
+ * Copyright (c) 2015, Linaro Ltd. All rights reserved.
+ * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#include <sys/types.h>
+
+#include "precompiled.hpp"
+#include "jvm.h"
+#include "asm/assembler.hpp"
+#include "asm/assembler.inline.hpp"
+#include "gc/shared/barrierSet.hpp"
+#include "gc/shared/cardTable.hpp"
+#include "gc/shared/barrierSetAssembler.hpp"
+#include "gc/shared/cardTableBarrierSet.hpp"
+#include "interpreter/interpreter.hpp"
+#include "compiler/disassembler.hpp"
+#include "memory/resourceArea.hpp"
+#include "nativeInst_aarch32.hpp"
+#include "oops/accessDecorators.hpp"
+// This ifdef was introduced so that a core build can be built
+#ifdef COMPILER2
+#include "opto/compile.hpp"
+#include "opto/node.hpp"
+#endif
+
+#include "runtime/biasedLocking.hpp"
+#include "runtime/icache.hpp"
+#include "runtime/interfaceSupport.inline.hpp"
+#include "runtime/jniHandles.inline.hpp"
+#include "runtime/sharedRuntime.hpp"
+
+#ifdef PRODUCT
+#define BLOCK_COMMENT(str) /* nothing */
+#define STOP(error) stop(error)
+#else
+#define BLOCK_COMMENT(str) block_comment(str)
+#define STOP(error) block_comment(error); stop(error)
+#endif
+
+#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
+
+// FIXME This is not a nice fix, this constant was in a compiler2 header
+#define MAX_stubs_size_div2 (128 / 2)
+// FIXME END
+
+// Note the PC corrections in the following three routines.
+// All literal modes that use the PC need to have the offset adjusted.
+// Patch any kind of instruction; there may be several instructions.
+// Return the total length (in bytes) of the instructions.
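+// A worked example (illustrative only): patching a B/BL at 0x1000 to
+// reach a target at 0x2000. Reading the PC yields the address of the
+// current instruction plus 8, so:
+//   long offset = 0x2000 - (0x1000 + 8);                   // = 0xff8 bytes
+//   Instruction_aarch32::spatch(branch, 23, 0, 0xff8 / 4); // imm24 = 0x3fe
+// The same +8 correction applies to every PC-relative form handled below.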
+
+int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
+  int instructions = 1;
+  long offset = target - (branch + 8); // correct for the PC: it reads as the address of the current instruction plus 8 (two instructions ahead)
+  bool add = offset >= 0;
+  unsigned insn = *(unsigned*)branch;
+  int opc = Instruction_aarch32::extract(insn, 27, 24);
+
+  if(0b1010 == opc || 0b1011 == opc) {
+    // Branch or branch with link
+    assert(0 == (offset & 3), "not aligned correctly");
+    Instruction_aarch32::spatch(branch, 23, 0, offset / 4);
+  } else if (0b0011 == opc) {
+    // Movw, Movt or mov, orr, orr, orr
+    // patch up address load to registers (absolute address).
+    instructions = patch_oop(branch, target) / NativeInstruction::arm_insn_sz;
+  } else if (0b010 == (opc >> 1)) {
+    // LDR, LDRB, STR, STRB
+    Instruction_aarch32::patch(branch, 11, 0, uabs(offset));
+    Instruction_aarch32::patch(branch, 23, 23, add);
+  } else if (0b000 == (opc >> 1)) {
+    // LDRH, LDRSH, LDRSB, LDRD, STRH, STRD
+    offset = uabs(offset);
+    Instruction_aarch32::patch(branch, 3, 0, offset & 0xf);
+    Instruction_aarch32::patch(branch, 11, 8, offset >> 4);
+    Instruction_aarch32::patch(branch, 23, 23, add);
+  } else if (0b1101 == opc) {
+    // VLDR, VSTR - NOTE VSTR(lit) is deprecated
+    offset = uabs(offset);
+    assert(0 == (offset & 3), "vldr, vstr can't do unaligned access");
+    Instruction_aarch32::patch(branch, 7, 0, offset >> 2);
+    Instruction_aarch32::patch(branch, 23, 23, add);
+  } else if (0b0010 == opc) {
+    // ADR
+    Instruction_aarch32::patch(branch, 11, 0, encode_imm12(uabs(offset)));
+    Instruction_aarch32::patch(branch, 23, 22, add ? 0b10 : 0b01 );
+  } else {
+    ShouldNotReachHere();
+  }
+  // aarch64 had something for polling page load?
+  return instructions * NativeInstruction::arm_insn_sz;
+}
+
+int MacroAssembler::patch_oop(address insn_addr, address o) {
+  unsigned insn = *(unsigned*)insn_addr;
+  int opc = Instruction_aarch32::extract(insn, 27, 21);
+  if(0b0011000 == opc) {
+    // 32-bit pointer, formed of a movw and a movt
+    assert(nativeInstruction_at(insn_addr+4)->is_movt(), "wrong insns in patch");
+
+    uint32_t btm = (uint32_t)o & 0xffff;
+    Instruction_aarch32::patch(insn_addr, 19, 16, btm >> 12);
+    Instruction_aarch32::patch(insn_addr, 11, 0, btm & 0xfff);
+    uint32_t top = (uint32_t)o >> 16;
+    Instruction_aarch32::patch(insn_addr + 4, 19, 16, top >> 12);
+    Instruction_aarch32::patch(insn_addr + 4, 11, 0, top & 0xfff);
+    return 2 * NativeInstruction::arm_insn_sz;
+  } else if(0b0011101 == opc) {
+    // Alternative 32-bit load sequence using mov, orr, orr, orr
+    assert(nativeInstruction_at(insn_addr+4 )->is_orr(), "wrong insns in patch");
+    assert(nativeInstruction_at(insn_addr+8 )->is_orr(), "wrong insns in patch");
+    assert(nativeInstruction_at(insn_addr+12)->is_orr(), "wrong insns in patch");
+    // FIXME this could carry us outside valid memory
+
+    uint32_t addr = (uint32_t)o;
+    Instruction_aarch32::patch(insn_addr + 0, 11, 0, (0b0000 << 8) | ((addr >> 0) & 0xff));
+    Instruction_aarch32::patch(insn_addr + 4, 11, 0, (0b1100 << 8) | ((addr >> 8) & 0xff));
+    Instruction_aarch32::patch(insn_addr + 8, 11, 0, (0b1000 << 8) | ((addr >> 16) & 0xff));
+    Instruction_aarch32::patch(insn_addr + 12, 11, 0, (0b0100 << 8) | ((addr >> 24) & 0xff));
+    return 4 * NativeInstruction::arm_insn_sz;
+  } else {
+    ShouldNotReachHere();
+  }
+  return 0; // not reached
+}
+
+address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
+  long offset = 0;
+  int opc = Instruction_aarch32::extract(insn, 27, 24);
+
+  if(0b1010 == opc || 0b1011 ==
opc) { + // Branch or branch with link + offset = Instruction_aarch32::sextract(insn, 23, 0) * 4; + } else if (0b0011 == opc) { + unsigned *insn_buf = (unsigned*)insn_addr; + int opc2 = Instruction_aarch32::extract(insn, 23, 21); + if(0b000 == opc2) { + // movw, movt (only on newer ARMs) + assert(nativeInstruction_at(&insn_buf[1])->is_movt(), "wrong insns in patch"); + uint32_t addr; + addr = Instruction_aarch32::extract(insn_buf[1], 19, 16) << 28; + addr |= Instruction_aarch32::extract(insn_buf[1], 11, 0) << 16; + addr |= Instruction_aarch32::extract(insn_buf[0], 19, 16) << 12; + addr |= Instruction_aarch32::extract(insn_buf[0], 11, 0); + return address(addr); + } else if(0b101 == opc2) { + // mov, orr, orr, orr + assert(nativeInstruction_at(&insn_buf[1])->is_orr(), "wrong insns in patch"); + assert(nativeInstruction_at(&insn_buf[2])->is_orr(), "wrong insns in patch"); + assert(nativeInstruction_at(&insn_buf[3])->is_orr(), "wrong insns in patch"); + uint32_t addr; + // TODO Check that the rotations are in the expected order. + addr = Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[0], 11, 0)); + addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[1], 11, 0)); + addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[2], 11, 0)); + addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[3], 11, 0)); + return address(addr); + } else { + ShouldNotReachHere(); + } + } else if (0b010 == (opc >> 1)) { + // LDR, LDRB, STR, STRB + offset = Instruction_aarch32::extract(insn, 11, 0); + bool add = Instruction_aarch32::extract(insn, 23, 23); + offset = add ? offset : -offset; + } else if (0b000 == (opc >> 1)) { + // LDRH, LDRSH, LDRSB, LDRD, STRH, STRD + offset = Instruction_aarch32::extract(insn, 3, 0); + offset |= Instruction_aarch32::extract(insn, 11, 8) << 4; + bool add = Instruction_aarch32::extract(insn, 23, 23); + offset = add ? offset : -offset; + } else if (0b1101 == opc) { + // VLDR, VSTR - NOTE VSTR(lit) is deprecated + offset = Instruction_aarch32::extract(insn, 7, 0) << 2; + bool add = Instruction_aarch32::extract(insn, 23, 23); + offset = add ? offset : -offset; + } else if (0b0010 == opc) { + // ADR + offset = decode_imm12(Instruction_aarch32::extract(insn, 11, 0)); + int code = Instruction_aarch32::extract(insn, 23, 22); + switch(code) { + case 0b01: offset = -offset; break; + case 0b10: break; + default: ShouldNotReachHere(); + } + } else { + ShouldNotReachHere(); + } + //Correct offset for PC + offset += 8; + return address(((uint32_t)insn_addr + offset)); +} + + +void MacroAssembler::serialize_memory(Register thread, Register tmp) { + dmb(Assembler::ISH); +} + +void MacroAssembler::safepoint_poll(Label& slow_path) { + if (SafepointMechanism::uses_thread_local_poll()) { + ldr(rscratch1, Address(rthread, Thread::polling_page_offset())); + tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path); + } else { + mov(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state())); + ldr(rscratch1, Address(rscratch1)); + cmp(rscratch1, SafepointSynchronize::_not_synchronized); + b(slow_path, Assembler::NE); + } +} + +// Just like safepoint_poll, but use an acquiring load for thread- +// local polling. +// +// We need an acquire here to ensure that any subsequent load of the +// global SafepointSynchronize::_state flag is ordered after this load +// of the local Thread::_polling page. We don't want this poll to +// return false (i.e. 
not safepointing) and a later poll of the global +// SafepointSynchronize::_state spuriously to return true. +// +// This is to avoid a race when we're in a native->Java transition +// racing the code which wakes up from a safepoint. +// +void MacroAssembler::safepoint_poll_acquire(Label& slow_path) { + if (SafepointMechanism::uses_thread_local_poll()) { + lea(rscratch1, Address(rthread, Thread::polling_page_offset())); + ldr(rscratch1, rscratch1); + dmb(Assembler::ISH); + tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path); + } else { + safepoint_poll(slow_path); + } +} + +void MacroAssembler::reset_last_Java_frame(bool clear_fp) { + mov(rscratch1, 0); + // we must set sp to zero to clear frame + str(rscratch1, Address(rthread, JavaThread::last_Java_sp_offset())); + // must clear fp, so that compiled frames are not confused; it is + // possible that we need it only for debugging + if (clear_fp) { + str(rscratch1, Address(rthread, JavaThread::last_Java_fp_offset())); + } + + // Always clear the pc because it could have been set by make_walkable() + str(rscratch1, Address(rthread, JavaThread::last_Java_pc_offset())); +} + +// Calls to C land +// +// When entering C land, the rfp & sp of the last Java frame have to be recorded +// in the (thread-local) JavaThread object. When leaving C land, the last Java fp +// has to be reset to 0. This is required to allow proper stack traversal. +void MacroAssembler::set_last_Java_frame(Register last_java_sp, + Register last_java_fp, + Register last_java_pc, + Register scratch) { + + if (last_java_pc->is_valid()) { + str(last_java_pc, Address(rthread, + JavaThread::frame_anchor_offset() + + JavaFrameAnchor::last_Java_pc_offset())); + } + + // determine last_java_sp register + if (last_java_sp == sp) { + mov(scratch, sp); + last_java_sp = scratch; + } else if (!last_java_sp->is_valid()) { + last_java_sp = sp; + } + + str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset())); + + // last_java_fp is optional + if (last_java_fp->is_valid()) { + str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset())); + } +} + +void MacroAssembler::set_last_Java_frame(Register last_java_sp, + Register last_java_fp, + address last_java_pc, + Register scratch) { + if (last_java_pc != NULL) { + adr(scratch, last_java_pc); + } else { + // FIXME: This is almost never correct. We should delete all + // cases of set_last_Java_frame with last_java_pc=NULL and use the + // correct return address instead. 
+ adr(scratch, pc()); + } + + str(scratch, Address(rthread, + JavaThread::frame_anchor_offset() + + JavaFrameAnchor::last_Java_pc_offset())); + + set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch); +} + +void MacroAssembler::set_last_Java_frame(Register last_java_sp, + Register last_java_fp, + Label &L, + Register scratch) { + if (L.is_bound()) { + set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch); + } else { + InstructionMark im(this); + L.add_patch_at(code(), locator()); + set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch); + } +} + +void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf) { + assert(CodeCache::find_blob(entry.target()) != NULL, + "destination of far call not found in code cache"); + if (far_branches()) { + lea(lr, entry); + if (cbuf) cbuf->set_insts_mark(); + bl(lr); + } else { + if (cbuf) cbuf->set_insts_mark(); + bl(entry); + } +} + +void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) { + assert(CodeCache::find_blob(entry.target()) != NULL, + "destination of far call not found in code cache"); + if (far_branches()) { + lea(tmp, entry); + if (cbuf) cbuf->set_insts_mark(); + b(tmp); + } else { + if (cbuf) cbuf->set_insts_mark(); + b(entry); + } +} + +void MacroAssembler::reserved_stack_check() { + // testing if reserved zone needs to be enabled + Label no_reserved_zone_enabling; + + ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset())); + cmp(sp, rscratch1); + b(no_reserved_zone_enabling, Assembler::LO); + + enter(); // LR and FP are live. + lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone)); + mov(c_rarg0, rthread); + bl(rscratch1); + leave(); + + // We have already removed our own frame. + // throw_delayed_StackOverflowError will think that it's been + // called by our caller. + lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry())); + b(rscratch1); + should_not_reach_here(); + + bind(no_reserved_zone_enabling); +} + +int MacroAssembler::biased_locking_enter(Register obj_reg, + Register swap_reg, + Register tmp_reg, + Register tmp_reg2, + bool swap_reg_contains_mark, + Label& done, + Label* slow_case, + BiasedLockingCounters* counters) { + assert(UseBiasedLocking, "why call this otherwise?"); + + if (PrintBiasedLockingStatistics && counters == NULL) + counters = BiasedLocking::counters(); + + assert(tmp_reg != noreg, "must be real register"); + assert_different_registers(obj_reg, swap_reg, tmp_reg, tmp_reg2); + assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout"); + Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes()); + Address klass_addr (obj_reg, oopDesc::klass_offset_in_bytes()); + + // Biased locking + // See whether the lock is currently biased toward our thread and + // whether the epoch is still valid + // Note that the runtime guarantees sufficient alignment of JavaThread + // pointers to allow age to be placed into low bits + // First check to see whether biasing is even enabled for this object + Label cas_label; + int null_check_offset = -1; + if (!swap_reg_contains_mark) { + null_check_offset = offset(); + ldr(swap_reg, mark_addr); + } + andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place); + cmp(tmp_reg, markOopDesc::biased_lock_pattern); + b(cas_label, Assembler::NE); + // The bias pattern is present in the object's header. 
Need to check + // whether the bias owner and the epoch are both still current. + load_prototype_header(tmp_reg, obj_reg); + orr(tmp_reg, tmp_reg, rthread); + eor(tmp_reg, swap_reg, tmp_reg); +// andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place)); + bic(tmp_reg, tmp_reg, markOopDesc::age_mask_in_place); + if (counters != NULL) { + Label around; + cbnz(tmp_reg, around); + atomic_inc(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, tmp_reg2); + b(done); + bind(around); + } else { + cbz(tmp_reg, done); + } + + Label try_revoke_bias; + Label try_rebias; + + // At this point we know that the header has the bias pattern and + // that we are not the bias owner in the current epoch. We need to + // figure out more details about the state of the header in order to + // know what operations can be legally performed on the object's + // header. + + // If the low three bits in the xor result aren't clear, that means + // the prototype header is no longer biased and we have to revoke + // the bias on this object. + andr(tmp_reg2, tmp_reg, markOopDesc::biased_lock_mask_in_place); + cbnz(tmp_reg2, try_revoke_bias); + + // Biasing is still enabled for this data type. See whether the + // epoch of the current bias is still valid, meaning that the epoch + // bits of the mark word are equal to the epoch bits of the + // prototype header. (Note that the prototype header's epoch bits + // only change at a safepoint.) If not, attempt to rebias the object + // toward the current thread. Note that we must be absolutely sure + // that the current epoch is invalid in order to do this because + // otherwise the manipulations it performs on the mark word are + // illegal. + andr(tmp_reg2, tmp_reg, markOopDesc::epoch_mask_in_place); + cbnz(tmp_reg2, try_rebias); + + // The epoch of the current bias is still valid but we know nothing + // about the owner; it might be set or it might be clear. Try to + // acquire the bias of the object using an atomic operation. If this + // fails we will go in to the runtime to revoke the object's bias. + // Note that we first construct the presumed unbiased header so we + // don't accidentally blow away another thread's valid bias. + { + Label here; + mov(tmp_reg2, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place); + andr(swap_reg, swap_reg, tmp_reg2); + orr(tmp_reg, swap_reg, rthread); + cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, tmp_reg2, here, slow_case); + // If the biasing toward our thread failed, this means that + // another thread succeeded in biasing it toward itself and we + // need to revoke that bias. The revocation will occur in the + // interpreter runtime in the slow case. + bind(here); + if (counters != NULL) { + atomic_inc(Address((address)counters->anonymously_biased_lock_entry_count_addr()), + tmp_reg, tmp_reg2); + } + } + b(done); + + bind(try_rebias); + // At this point we know the epoch has expired, meaning that the + // current "bias owner", if any, is actually invalid. Under these + // circumstances _only_, we are allowed to use the current header's + // value as the comparison value when doing the cas to acquire the + // bias in the current epoch. In other words, we allow transfer of + // the bias from one thread to another directly in this situation. + // + // FIXME: due to a lack of registers we currently blow away the age + // bits in this situation. Should attempt to preserve them. 
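+  // Conceptually (a sketch): CAS the mark word from its current value to
+  //   prototype_header | (uint32_t)rthread
+  // i.e. bias the object to this thread under the new epoch.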
+ { + Label here; + load_prototype_header(tmp_reg, obj_reg); + orr(tmp_reg, rthread, tmp_reg); + cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, tmp_reg2, here, slow_case); + // If the biasing toward our thread failed, then another thread + // succeeded in biasing it toward itself and we need to revoke that + // bias. The revocation will occur in the runtime in the slow case. + bind(here); + if (counters != NULL) { + atomic_inc(Address((address)counters->rebiased_lock_entry_count_addr()), + tmp_reg, tmp_reg2); + } + } + b(done); + + bind(try_revoke_bias); + // The prototype mark in the klass doesn't have the bias bit set any + // more, indicating that objects of this data type are not supposed + // to be biased any more. We are going to try to reset the mark of + // this object to the prototype value and fall through to the + // CAS-based locking scheme. Note that if our CAS fails, it means + // that another thread raced us for the privilege of revoking the + // bias of this particular object, so it's okay to continue in the + // normal locking code. + // + // FIXME: due to a lack of registers we currently blow away the age + // bits in this situation. Should attempt to preserve them. + { + Label here, nope; + load_prototype_header(tmp_reg, obj_reg); + cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, tmp_reg2, here, &nope); + bind(here); + + // Fall through to the normal CAS-based lock, because no matter what + // the result of the above CAS, some thread must have succeeded in + // removing the bias bit from the object's header. + if (counters != NULL) { + atomic_inc(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg, + tmp_reg2); + } + bind(nope); + } + + bind(cas_label); + + return null_check_offset; +} + +void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) { + assert(UseBiasedLocking, "why call this otherwise?"); + + // Check for biased locking unlock case, which is a no-op + // Note: we do not have to check the thread ID for two reasons. + // First, the interpreter checks for IllegalMonitorStateException at + // a higher level. Second, if the bias was revoked while we held the + // lock, the object could not be rebiased toward another thread, so + // the bias bit would be clear. 
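+  // For reference, the low bits of the 32-bit mark word tested here are
+  // laid out as [ thread/hash | epoch:2 | age:4 | biased_lock:1 | lock:2 ],
+  // so (mark & biased_lock_mask_in_place) == biased_lock_pattern checks the
+  // low three bits for 0b101 (biased/biasable); 0b001 would mean unlocked.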
+  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
+  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
+  cmp(temp_reg, markOopDesc::biased_lock_pattern);
+  b(done, Assembler::EQ);
+}
+
+
+static void pass_arg0(MacroAssembler* masm, Register arg) {
+  if (c_rarg0 != arg) {
+    masm->mov(c_rarg0, arg);
+  }
+}
+
+static void pass_arg1(MacroAssembler* masm, Register arg) {
+  if (c_rarg1 != arg) {
+    masm->mov(c_rarg1, arg);
+  }
+}
+
+static void pass_arg2(MacroAssembler* masm, Register arg) {
+  if (c_rarg2 != arg) {
+    masm->mov(c_rarg2, arg);
+  }
+}
+
+static void pass_arg3(MacroAssembler* masm, Register arg) {
+  if (c_rarg3 != arg) {
+    masm->mov(c_rarg3, arg);
+  }
+}
+
+void MacroAssembler::call_VM_base(Register oop_result,
+                                  Register java_thread,
+                                  Register last_java_sp,
+                                  address entry_point,
+                                  int number_of_arguments,
+                                  bool check_exceptions) {
+  // determine java_thread register
+  if (!java_thread->is_valid()) {
+    java_thread = rthread;
+  }
+
+  // determine last_java_sp register
+  if (!last_java_sp->is_valid()) {
+    last_java_sp = sp;
+  }
+
+  // debugging support
+  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
+  assert(java_thread == rthread, "unexpected register");
+
+  assert(java_thread != oop_result, "cannot use the same register for java_thread & oop_result");
+  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");
+
+  // push java thread (becomes first argument of C function)
+
+  mov(c_rarg0, java_thread);
+
+  // set last Java frame before call
+  assert(last_java_sp != rfp, "can't use rfp");
+
+  Label l;
+  set_last_Java_frame(last_java_sp, rfp, l, rscratch2);
+
+
+  // FIXME: can we save LR in a more elegant way?
+  //str(lr, pre(sp, -wordSize));
+
+  // do the call, remove parameters
+  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
+
+  //ldr(lr, post(sp, wordSize));
+
+  // reset last Java frame
+  // Only interpreter should have to clear fp
+  reset_last_Java_frame(true);
+
+  // C++ interp handles this in the interpreter
+  check_and_handle_popframe(java_thread);
+  check_and_handle_earlyret(java_thread);
+
+  if (check_exceptions) {
+    // check for pending exceptions (java_thread is set upon return)
+    ldr(rscratch2, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
+    Label ok;
+    cbz(rscratch2, ok);
+
+    lea(rscratch2, RuntimeAddress(StubRoutines::forward_exception_entry()));
+    // forward_exception uses LR to choose the exception handler, but LR has been trashed by the code above.
+    // Since we used to get here from interpreted code, BL is an acceptable way to acquire the correct LR
+    // (see StubGenerator::generate_forward_exception).
+    bl(rscratch2);
+    bind(ok);
+  }
+
+  // get oop result if there is one and reset the value in the thread
+  if (oop_result->is_valid()) {
+    get_vm_result(oop_result, java_thread);
+  }
+}
+
+void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
+  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
+}
+
+// Maybe emit a call via a trampoline. If the code cache is small,
+// trampolines won't be emitted.
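+// The far-branch trampoline emitted below has, conceptually, this shape
+// (a sketch; the exact PC-relative encodings are handled by the assembler,
+// and NativeCall::instruction_size is assumed to cover three words):
+//   add  lr, pc, #<delta>   ; lr = first byte after the trampoline
+//   ldr  pc, <literal>      ; jump via the 4-byte literal that follows
+//   .word <destination>     ; raw target address, patchable atomically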
+
+void MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
+  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
+  assert(entry.rspec().type() == relocInfo::runtime_call_type
+         || entry.rspec().type() == relocInfo::opt_virtual_call_type
+         || entry.rspec().type() == relocInfo::static_call_type
+         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
+
+  if (cbuf) {
+    cbuf->set_insts_mark();
+  }
+
+  if (far_branches()) {
+    // Build the trampoline so that the destination address is a raw
+    // 4-byte value, which can then be patched atomically.
+    relocate(entry.rspec());
+    address start = pc();
+    add(lr, r15_pc, NativeCall::instruction_size - 2 * NativeInstruction::arm_insn_sz);
+    ldr(r15_pc, Address(r15_pc, 4));
+    emit_int32((uintptr_t) entry.target());
+    // possibly pad the call to the NativeCall size to make patching happy
+    while (pc() - start < NativeCall::instruction_size) {
+      nop();
+    }
+    assert(pc() - start == NativeCall::instruction_size, "fix NativeTrampolineCall::instruction_size!");
+  } else {
+    bl(entry);
+  }
+}
+
+void MacroAssembler::c2bool(Register x) {
+  ands(r0, r0, 0xff);
+  mov(r0, 1, Assembler::NE);
+}
+
+void MacroAssembler::ic_call(address entry, jint method_index) {
+  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
+  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
+  // unsigned long offset;
+  // ldr_constant(rscratch2, const_ptr);
+  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
+  trampoline_call(Address(entry, rh));
+}
+
+// Implementation of call_VM versions
+
+void MacroAssembler::call_VM(Register oop_result,
+                             address entry_point,
+                             bool check_exceptions) {
+  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+                             address entry_point,
+                             Register arg_1,
+                             bool check_exceptions) {
+  pass_arg1(this, arg_1);
+  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+                             address entry_point,
+                             Register arg_1,
+                             Register arg_2,
+                             bool check_exceptions) {
+  assert(arg_1 != c_rarg2, "smashed arg");
+  pass_arg2(this, arg_2);
+  pass_arg1(this, arg_1);
+  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+                             address entry_point,
+                             Register arg_1,
+                             Register arg_2,
+                             Register arg_3,
+                             bool check_exceptions) {
+  assert(arg_1 != c_rarg3, "smashed arg");
+  assert(arg_2 != c_rarg3, "smashed arg");
+  pass_arg3(this, arg_3);
+
+  assert(arg_1 != c_rarg2, "smashed arg");
+  pass_arg2(this, arg_2);
+
+  pass_arg1(this, arg_1);
+  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+                             Register last_java_sp,
+                             address entry_point,
+                             int number_of_arguments,
+                             bool check_exceptions) {
+  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+                             Register last_java_sp,
+                             address entry_point,
+                             Register arg_1,
+                             bool check_exceptions) {
+  pass_arg1(this, arg_1);
+  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+                             Register last_java_sp,
+                             address entry_point,
+                             Register arg_1,
+                             Register arg_2,
+                             bool check_exceptions) {
+
+  assert(arg_1 != c_rarg2, "smashed arg");
+  pass_arg2(this, arg_2);
+  pass_arg1(this, arg_1);
+  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
+}
+
+void MacroAssembler::call_VM(Register oop_result,
+                             Register last_java_sp,
+                             address entry_point,
+                             Register arg_1,
+                             Register arg_2,
+                             Register arg_3,
+                             bool check_exceptions) {
+  assert(arg_1 != c_rarg3, "smashed arg");
+  assert(arg_2 != c_rarg3, "smashed arg");
+  pass_arg3(this, arg_3);
+  assert(arg_1 != c_rarg2, "smashed arg");
+  pass_arg2(this, arg_2);
+  pass_arg1(this, arg_1);
+  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
+}
+
+
+void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
+  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
+  assert(oop_result != rscratch2, "can't be");
+  mov(rscratch2, 0);
+  str(rscratch2, Address(java_thread, JavaThread::vm_result_offset()));
+  verify_oop(oop_result, "broken oop in call_VM_base");
+}
+
+void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
+  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
+  assert(metadata_result != rscratch2 &&
+         java_thread != rscratch2, "can't be");
+  mov(rscratch2, 0);
+  str(rscratch2, Address(java_thread, JavaThread::vm_result_2_offset()));
+}
+
+void MacroAssembler::align(int modulus) {
+  while (offset() % modulus != 0) nop();
+}
+
+// these are no-ops overridden by InterpreterMacroAssembler
+
+void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }
+
+void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
+
+
+RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
+                                                      Register tmp,
+                                                      int offset) {
+  intptr_t value = *delayed_value_addr;
+  if (value != 0)
+    return RegisterOrConstant(value + offset);
+
+  // load indirectly to solve generation ordering problem
+  ldr(tmp, ExternalAddress((address) delayed_value_addr));
+
+  if (offset != 0)
+    add(tmp, tmp, offset);
+
+  return RegisterOrConstant(tmp);
+}
+
+
+// Look up the method for a megamorphic invokeinterface call.
+// The target method is determined by <intf_klass, itable_index>.
+// The receiver klass is in recv_klass.
+// On success, the result will be in method_result, and execution falls through.
+// On failure, execution transfers to the given label.
+void MacroAssembler::lookup_interface_method(Register recv_klass,
+                                             Register intf_klass,
+                                             RegisterOrConstant itable_index,
+                                             Register method_result,
+                                             Register scan_temp,
+                                             Label& L_no_such_interface,
+                                             bool return_method) {
+  assert_different_registers(recv_klass, intf_klass, scan_temp);
+  assert_different_registers(method_result, intf_klass, scan_temp);
+  assert(recv_klass != method_result || !return_method,
+         "recv_klass can be destroyed when method isn't needed");
+
+  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
+  int vtable_base = in_bytes(InstanceKlass::vtable_start_offset());
+  int itentry_off = itableMethodEntry::method_offset_in_bytes();
+  int scan_step = itableOffsetEntry::size() * wordSize;
+  int vte_size = vtableEntry::size_in_bytes();
+  assert(vte_size == wordSize, "else adjust times_vte_scale");
+
+  ldr(scan_temp, Address(recv_klass, in_bytes(InstanceKlass::vtable_length_offset())));
+
+  // %%% Could store the aligned, prescaled offset in the klassoop.
+  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
+  lea(scan_temp, Address(recv_klass, scan_temp, lsl(2)));
+  add(scan_temp, scan_temp, vtable_base);
+
+  if (return_method) {
+    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
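+    // e.g. with itable_index = 3 this computes (illustrative)
+    //   recv_klass += 3 * wordSize + itentry_off
+    // so that later [recv_klass + scan->offset()] addresses the method entry.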
+ assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below"); + // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off)); + lea(recv_klass, itable_index.is_register() ? + Address(recv_klass, itable_index, lsl(2)) : + Address(recv_klass, itable_index.as_constant() << 2)); + if (itentry_off) + add(recv_klass, recv_klass, itentry_off); + } + + // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { + // if (scan->interface() == intf) { + // result = (klass + scan->offset() + itable_index); + // } + // } + Label search, found_method; + + for (int peel = 1; peel >= 0; peel--) { + ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes())); + cmp(intf_klass, method_result); + + if (peel) { + b(found_method, Assembler::EQ); + } else { + b(search, Assembler::NE); + // (invert the test to fall through to found_method...) + } + + if (!peel) break; + + bind(search); + + // Check that the previous entry is non-null. A null entry means that + // the receiver class doesn't implement the interface, and wasn't the + // same as when the caller was compiled. + cbz(method_result, L_no_such_interface); + add(scan_temp, scan_temp, scan_step); + } + + bind(found_method); + + if (return_method) { + // Got a hit. + ldr(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes())); + ldr(method_result, Address(recv_klass, scan_temp)); + } +} + +// virtual method calling +void MacroAssembler::lookup_virtual_method(Register recv_klass, + RegisterOrConstant vtable_index, + Register method_result) { + const int base = in_bytes(InstanceKlass::vtable_start_offset()); + int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes(); + if (vtable_index.is_register()) { + lea(method_result, Address(recv_klass, + vtable_index.as_register(), + lsl(LogBytesPerWord))); + ldr(method_result, Address(method_result, vtable_offset_in_bytes)); + } else { + vtable_offset_in_bytes += vtable_index.as_constant() * wordSize; + if(is_valid_for_offset_imm(vtable_offset_in_bytes, 12)) { + ldr(method_result, Address(recv_klass, vtable_offset_in_bytes)); + } else { + mov(method_result, vtable_offset_in_bytes); + ldr(method_result, Address(recv_klass, method_result)); + } + } +} + +void MacroAssembler::check_klass_subtype(Register sub_klass, + Register super_klass, + Register temp_reg, + Label& L_success) { + Label L_failure; + check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL); + check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL); + bind(L_failure); +} + + +void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, + Register super_klass, + Register temp_reg, + Label* L_success, + Label* L_failure, + Label* L_slow_path, + RegisterOrConstant super_check_offset) { + assert_different_registers(sub_klass, super_klass, temp_reg); + bool must_load_sco = (super_check_offset.constant_or_zero() == -1); + if (super_check_offset.is_register()) { + assert_different_registers(sub_klass, super_klass, + super_check_offset.as_register()); + } else if (must_load_sco) { + assert(temp_reg != noreg, "supply either a temp or a register offset"); + } + + Label L_fallthrough; + int label_nulls = 0; + if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } + if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } + if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } + 
assert(label_nulls <= 1, "at most one NULL in the batch");
+
+  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
+  int sco_offset = in_bytes(Klass::super_check_offset_offset());
+  Address super_check_offset_addr(super_klass, sco_offset);
+
+  // Hacked jmp, which may only be used just before L_fallthrough.
+#define final_jmp(label)                                  \
+  if (&(label) == &L_fallthrough) { /*do nothing*/ }      \
+  else b(label)                /*omit semi*/
+
+  // If the pointers are equal, we are done (e.g., String[] elements).
+  // This self-check enables sharing of secondary supertype arrays among
+  // non-primary types such as array-of-interface. Otherwise, each such
+  // type would need its own customized SSA.
+  // We move this check to the front of the fast path because many
+  // type checks are in fact trivially successful in this manner,
+  // so we get a nicely predicted branch right at the start of the check.
+  cmp(sub_klass, super_klass);
+  b(*L_success, Assembler::EQ);
+
+  // Check the supertype display:
+  if (must_load_sco) {
+    ldr(temp_reg, super_check_offset_addr);
+    super_check_offset = RegisterOrConstant(temp_reg);
+  }
+  Address super_check_addr(sub_klass, super_check_offset);
+  ldr(rscratch1, super_check_addr);
+  cmp(super_klass, rscratch1); // load displayed supertype
+
+  // This check has worked decisively for primary supers.
+  // Secondary supers are sought in the super_cache ('super_cache_addr').
+  // (Secondary supers are interfaces and very deeply nested subtypes.)
+  // This works in the same check above because of a tricky aliasing
+  // between the super_cache and the primary super display elements.
+  // (The 'super_check_addr' can address either, as the case requires.)
+  // Note that the cache is updated below if it does not help us find
+  // what we need immediately.
+  // So if it was a primary super, we can just fail immediately.
+  // Otherwise, it's the slow path for us (no success at this point).
+
+  if (super_check_offset.is_register()) {
+    b(*L_success, Assembler::EQ);
+    cmp(super_check_offset.as_register(), sc_offset);
+    if (L_failure == &L_fallthrough) {
+      b(*L_slow_path, Assembler::EQ);
+    } else {
+      b(*L_failure, Assembler::NE);
+      final_jmp(*L_slow_path);
+    }
+  } else if (super_check_offset.as_constant() == sc_offset) {
+    // Need a slow path; fast failure is impossible.
+    if (L_slow_path == &L_fallthrough) {
+      b(*L_success, Assembler::EQ);
+    } else {
+      b(*L_slow_path, Assembler::NE);
+      final_jmp(*L_success);
+    }
+  } else {
+    // No slow path; it's a fast decision.
+    if (L_failure == &L_fallthrough) {
+      b(*L_success, Assembler::EQ);
+    } else {
+      b(*L_failure, Assembler::NE);
+      final_jmp(*L_success);
+    }
+  }
+
+  bind(L_fallthrough);
+
+#undef final_jmp
+}
+
+// These two are taken from x86, but they look generally useful
+
+// scans count pointer-sized words at [addr] for an occurrence of value,
+// generic
+void MacroAssembler::repne_scan(Register addr, Register value, Register count,
+                                Register scratch) {
+  Label loop, fail, found;
+  cmp(count, 0);
+  b(fail, EQ);
+
+  bind(loop);
+  ldr(scratch, post(addr, wordSize));
+  cmp(value, scratch);
+  b(found, EQ);
+  subs(count, count, 1);
+  b(loop, NE);
+
+  bind(fail);
+  cmp(sp, 0); // sp never zero
+  bind(found);
+}
+
+// Form an address from base + offset in Rd. Rd may or may
+// not actually be used: you must use the Address that is returned.
+// It is up to you to ensure that the shift provided matches the size
+// of your data.
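+// For example (illustrative): form_address(r3, rbase, 0x12345, 0) cannot
+// encode 0x12345 directly, so it may emit add(r3, rbase, 0x12000) and
+// return Address(r3, 0x345), both pieces then fitting their immediate
+// fields.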
+Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
+  // The form_address result should only be used together with ldr/str instructions;
+  // otherwise please provide the exact type instead of IDT_INT or apply safe_for()
+  if (Address::offset_ok_for_immed(byte_offset, Address::IDT_INT))
+    // It fits; no need for any heroics
+    return Address(base, byte_offset);
+
+  // See if we can do this with two 12-bit offsets
+  {
+    unsigned long masked_offset = byte_offset & ~0xfff;
+    if (Address::offset_ok_for_immed(byte_offset - masked_offset, Address::IDT_INT)
+        && Assembler::operand_valid_for_add_sub_immediate(masked_offset)) {
+      add(Rd, base, masked_offset);
+      byte_offset -= masked_offset;
+      return Address(Rd, byte_offset);
+    }
+  }
+
+  // Do it the hard way
+  mov(Rd, byte_offset);
+  add(Rd, base, Rd);
+  return Address(Rd);
+}
+
+// scans count 4 byte words at [addr] for an occurrence of value,
+// generic
+/*void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
+                                 Register scratch) {
+  Label Lloop, Lexit;
+  cbz(count, Lexit);
+  bind(Lloop);
+  ldr(scratch, post(addr, wordSize));
+  cmp(value, scratch);
+  b(Lexit, EQ);
+  sub(count, count, 1);
+  cbnz(count, Lloop);
+  bind(Lexit);
+}*/
+
+void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
+                                                   Register super_klass,
+                                                   Register temp_reg,
+                                                   Register temp2_reg,
+                                                   Label* L_success,
+                                                   Label* L_failure,
+                                                   bool set_cond_codes) {
+  assert_different_registers(sub_klass, super_klass, temp_reg);
+  if (temp2_reg != noreg)
+    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
+#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)
+
+  Label L_fallthrough;
+  int label_nulls = 0;
+  if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; }
+  if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; }
+  assert(label_nulls <= 1, "at most one NULL in the batch");
+
+  // a couple of useful fields in sub_klass:
+  int ss_offset = in_bytes(Klass::secondary_supers_offset());
+  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
+  Address secondary_supers_addr(sub_klass, ss_offset);
+  Address super_cache_addr(     sub_klass, sc_offset);
+
+  BLOCK_COMMENT("check_klass_subtype_slow_path");
+
+  // Do a linear scan of the secondary super-klass chain.
+  // This code is rarely used, so simplicity is a virtue here.
+  // The repne_scan routine uses fixed registers, which we must spill.
+  // Don't worry too much about pre-existing connections with the input regs.
+
+  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
+  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)
+
+  RegSet pushed_registers;
+  if (!IS_A_TEMP(r2)) pushed_registers += r2;
+  if (!IS_A_TEMP(r14)) pushed_registers += r14;
+
+  if (super_klass != r0) {
+    if (!IS_A_TEMP(r0)) pushed_registers += r0;
+  }
+
+  push(pushed_registers, sp);
+
+  // Get super_klass value into r0 (even if it was in r5 or r2).
+  if (super_klass != r0) {
+    mov(r0, super_klass);
+  }
+
+#ifndef PRODUCT
+  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
+  Address pst_counter_addr(rscratch2);
+  ldr(rscratch1, pst_counter_addr);
+  add(rscratch1, rscratch1, 1);
+  str(rscratch1, pst_counter_addr);
+#endif //PRODUCT
+
+  // We will consult the secondary-super array.
+  ldr(r14, secondary_supers_addr);
+  // Load the array length.
+  ldr(r2, Address(r14, Array<Klass*>::length_offset_in_bytes()));
+  // Skip to start of data.
+  add(r14, r14, Array<Klass*>::base_offset_in_bytes());
+
+  cmp(sp, 0); // Clear Z flag; SP is never zero
+  // Scan R2 words at [R14] for an occurrence of R0.
+  // Set NZ/Z based on last compare.
+  repne_scan(r14, r0, r2, rscratch1);
+
+  // Unspill the temp. registers:
+  pop(pushed_registers, sp);
+
+  b(*L_failure, Assembler::NE);
+
+  // Success. Cache the super we found and proceed in triumph.
+  str(super_klass, super_cache_addr);
+
+  if (L_success != &L_fallthrough) {
+    b(*L_success);
+  }
+
+#undef IS_A_TEMP
+
+  bind(L_fallthrough);
+}
+
+
+void MacroAssembler::verify_oop(Register reg, const char* s) {
+  if (!VerifyOops) return;
+
+  // Pass register number to verify_oop_subroutine
+  const char* b = NULL;
+  {
+    ResourceMark rm;
+    stringStream ss;
+    ss.print("verify_oop: %s: %s", reg->name(), s);
+    b = code_string(ss.as_string());
+  }
+  BLOCK_COMMENT("verify_oop {");
+
+  stmdb(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());
+
+  mov(r0, reg);
+  mov(rscratch1, (address)b);
+  mrs(r1);
+
+  // call indirectly to solve generation ordering problem
+  reg_printf("Verify oop entry, sp = %p, rfp = %p\n", sp, rfp);
+  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
+  ldr(rscratch2, Address(rscratch2));
+  bl(rscratch2);
+  reg_printf("Verify oop exit, sp = %p, rfp = %p\n", sp, rfp);
+
+  msr(r1);
+  ldmia(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());
+
+  BLOCK_COMMENT("} verify_oop");
+}
+
+void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
+  if (!VerifyOops) return;
+
+  const char* b = NULL;
+  {
+    ResourceMark rm;
+    stringStream ss;
+    ss.print("verify_oop_addr: %s", s);
+    b = code_string(ss.as_string());
+  }
+  BLOCK_COMMENT("verify_oop_addr {");
+
+  stmdb(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());
+  mrs(r1);
+
+  // addr may contain sp so we will have to adjust it based on the
+  // pushes that we just did.
+  if (addr.uses(sp)) {
+    lea(r0, addr);
+    ldr(r0, Address(r0, 5 * wordSize));
+  } else {
+    ldr(r0, addr);
+  }
+  mov(rscratch1, (address)b);
+
+  // call indirectly to solve generation ordering problem
+  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
+  ldr(rscratch2, Address(rscratch2));
+  bl(rscratch2);
+
+  msr(r1);
+  ldmia(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());
+
+  BLOCK_COMMENT("} verify_oop_addr");
+}
+
+Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
+                                         int extra_slot_offset) {
+  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
+  int stackElementSize = Interpreter::stackElementSize;
+  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
+#ifdef ASSERT
+  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
+  assert(offset1 - offset == stackElementSize, "correct arithmetic");
+#endif
+  if (arg_slot.is_constant()) {
+    return Address(sp, arg_slot.as_constant() * stackElementSize
+                   + offset);
+  } else {
+    add(rscratch1, sp, arg_slot.as_register(),
+        lsl(exact_log2(stackElementSize)));
+    return Address(rscratch1, offset);
+  }
+}
+
+void MacroAssembler::call_VM_leaf_base(address entry_point,
+                                       int number_of_arguments,
+                                       Label *retaddr) {
+  Label E, L;
+
+  // FIXME: do this alignment in a more elegant way
+  mov(rscratch2, sp);
+  sub(sp, sp, wordSize);
+  bic(sp, sp, 2 * wordSize - 1); // Align to eight bytes
+  str(rscratch2, Address(sp));
+
+  // FIXME: do we need to preserve rscratch2?
+  //str(rscratch2, Address(pre(sp, -wordSize)));
+
+  mov(rscratch2, entry_point);
+  reg_printf("\tJust about to call into the VM, rfp = %p\n", rfp);
+  bl(rscratch2);
+  if (retaddr)
+    bind(*retaddr);
+  reg_printf("\tReturned from call into the VM, rfp = %p\n", rfp);
+
+  //ldr(rscratch2, Address(post(sp, wordSize)));
+
+  // Undo alignment
+  ldr(sp, Address(sp));
+
+  maybe_isb();
+}
+
+void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
+  call_VM_leaf_base(entry_point, number_of_arguments);
+}
+
+void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
+  pass_arg0(this, arg_0);
+  call_VM_leaf_base(entry_point, 1);
+}
+
+void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
+  pass_arg0(this, arg_0);
+  pass_arg1(this, arg_1);
+  call_VM_leaf_base(entry_point, 2);
+}
+
+void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
+                                  Register arg_1, Register arg_2) {
+  pass_arg0(this, arg_0);
+  pass_arg1(this, arg_1);
+  pass_arg2(this, arg_2);
+  call_VM_leaf_base(entry_point, 3);
+}
+
+void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
+  pass_arg0(this, arg_0);
+  MacroAssembler::call_VM_leaf_base(entry_point, 1);
+}
+
+void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
+
+  assert(arg_0 != c_rarg1, "smashed arg");
+  pass_arg1(this, arg_1);
+  pass_arg0(this, arg_0);
+  MacroAssembler::call_VM_leaf_base(entry_point, 2);
+}
+
+void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
+  assert(arg_0 != c_rarg2, "smashed arg");
+  assert(arg_1 != c_rarg2, "smashed arg");
+  pass_arg2(this, arg_2);
+  assert(arg_0 != c_rarg1, "smashed arg");
+  pass_arg1(this, arg_1);
+  pass_arg0(this, arg_0);
+  MacroAssembler::call_VM_leaf_base(entry_point, 3);
+}
+
+void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
+  assert(arg_0 != c_rarg3, "smashed arg");
+  assert(arg_1 != c_rarg3, "smashed arg");
+  assert(arg_2 != c_rarg3, "smashed arg");
+  pass_arg3(this, arg_3);
+  assert(arg_0 != c_rarg2, "smashed arg");
+  assert(arg_1 != c_rarg2, "smashed arg");
+  pass_arg2(this, arg_2);
+  assert(arg_0 != c_rarg1, "smashed arg");
+  pass_arg1(this, arg_1);
+  pass_arg0(this, arg_0);
+  MacroAssembler::call_VM_leaf_base(entry_point, 4);
+}
+
+// Clobbers rscratch1
+void MacroAssembler::null_check(Register reg, int offset) {
+  if (needs_explicit_null_check(offset)) {
+    // provoke OS NULL exception if reg = NULL by
+    // accessing M[reg] w/o changing any registers
+    // NOTE: this is plenty to provoke a segv
+    reg_printf("Generating OS check null with ptr = %p\n", reg);
+    assert(reg != rscratch1, "can't be");
+    ldr(rscratch1, Address(reg));
+  } else {
+    // nothing to do, (later) access of M[reg + offset]
+    // will provoke OS NULL exception if reg = NULL
+  }
+}
+
+// MacroAssembler protected routines needed to implement
+// public methods
+
+void MacroAssembler::mov(Register r, Address dest, Condition cond) {
+  code_section()->relocate(pc(), dest.rspec());
+  uint32_t imm32 = (uint32_t)dest.target();
+  movptr(r, imm32, cond);
+}
+
+// Move a constant pointer into r. In aarch32 the address space
+// is 32 bits in size and so a pointer can be encoded in two mov
+// instructions.
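+// For example (illustrative, assuming mov_immediate32 selects the
+// movw/movt pair rather than the mov/orr fallback):
+//   movptr(r0, 0xcafebabe) emits roughly
+//     movw r0, #0xbabe    ; low halfword
+//     movt r0, #0xcafe    ; high halfword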
+void MacroAssembler::movptr(Register r, uintptr_t imm32, Condition cond) {
+#ifndef PRODUCT
+  {
+    char buffer[64];
+    snprintf(buffer, sizeof(buffer), "0x%" PRIX32, imm32);
+    block_comment(buffer);
+  }
+#endif
+  Assembler::mov_immediate32(r, imm32, cond, false);
+}
+
+void MacroAssembler::ret(Register reg) {
+  assert(reg == lr, "Can do return only to LR");
+  b(lr);
+}
+
+void MacroAssembler::atomic_inc(Register counter_addr, Register tmp) {
+  Label retry_load;
+  bind(retry_load);
+  // flush and load exclusive from the memory location
+  ldrex(tmp, counter_addr);
+  add(tmp, tmp, 1);
+  // if we store+flush with no intervening write tmp will be zero
+  strex(tmp, tmp, counter_addr);
+  cmp(tmp, 0);
+  b(retry_load, Assembler::NE);
+}
+
+
+// MacroAssembler routines actually found to be needed
+
+void MacroAssembler::push(Register src)
+{
+  str(src, Address(pre(sp, -1 * wordSize)));
+}
+
+void MacroAssembler::pop(Register dst)
+{
+  ldr(dst, Address(post(sp, 1 * wordSize)));
+}
+
+// Note: load_unsigned_short used to be called load_unsigned_word.
+int MacroAssembler::load_unsigned_short(Register dst, Address src) {
+  int off = offset();
+  ldrh(dst, src);
+  return off;
+}
+
+int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
+  int off = offset();
+  ldrb(dst, src);
+  return off;
+}
+
+int MacroAssembler::load_signed_short(Register dst, Address src) {
+  int off = offset();
+  ldrsh(dst, src);
+  return off;
+}
+
+int MacroAssembler::load_signed_byte(Register dst, Address src) {
+  int off = offset();
+  ldrsb(dst, src);
+  return off;
+}
+
+void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
+  switch (size_in_bytes) {
+  //case 8: ldr(dst, src); break;
+  case 4: ldr(dst, src); break;
+  case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
+  case 1: is_signed ?
load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break; + default: ShouldNotReachHere(); + } +} + +void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) { + switch (size_in_bytes) { + //case 8: str(src, dst); break; + case 4: str(src, dst); break; + case 2: strh(src, dst); break; + case 1: strb(src, dst); break; + default: ShouldNotReachHere(); + } +} + +void MacroAssembler::decrement(Register reg, int value) { + if (value < 0) { + increment(reg, -value); + return; + } + if (value == 0) { + return; + } + if (operand_valid_for_add_sub_immediate(value)) { + sub(reg, reg, value); + return; + } + assert(reg != rscratch2, "invalid register for decrement"); + mov(rscratch2, (unsigned int) value); + sub(reg, reg, rscratch2); +} + +void MacroAssembler::decrement(Address dst, int value) { + assert(!dst.uses(rscratch1), "invalid address for decrement"); + ldr(rscratch1, dst); + decrement(rscratch1, value); + str(rscratch1, dst); +} + +void MacroAssembler::increment(Register reg, int value) { + if (value < 0) { + decrement(reg, -value); + return; + } + if (value == 0) { + return; + } + if (operand_valid_for_add_sub_immediate(value)) { + add(reg, reg, value); + return; + } + assert(reg != rscratch2, "invalid register for increment"); + mov(rscratch2, (unsigned int) value); + add(reg, reg, rscratch2); +} + +void MacroAssembler::increment(Address dst, int value) { + assert(!dst.uses(rscratch1), "invalid address for increment"); + ldr(rscratch1, dst); + increment(rscratch1, value); + str(rscratch1, dst); +} + +// Loads and stores everything except the pc and sp +void MacroAssembler::pusha() { + unsigned regset = 0b0101111111111111; + stmdb(sp, regset); +} +void MacroAssembler::popa() { + unsigned regset = 0b0101111111111111; + ldmia(sp, regset); +} + +static void multiple_reg_check(unsigned int bitset, Register stack) { + const unsigned int pcbit = 1 << r15_pc->encoding(); + const unsigned int lrbit = 1 << lr->encoding(); + const unsigned int spbit = 1 << sp->encoding(); + const unsigned int stackbit = 1 << stack->encoding(); + assert(!(bitset & spbit), "The SP can be in the list. However, " + "ARM deprecates using these instructions with SP in the list."); + assert(!(bitset & pcbit) || !(bitset & lrbit), + "ARM deprecates using these instructions with both " + "the LR and the PC in the list."); + assert(!(bitset & stackbit), "Instructions with the base register " + "in the list and ! specified are only available before ARMv7, " + "and ARM deprecates the use of such instructions. " + "The value of the base register after such an instruction is UNKNOWN"); +} + +// Push lots of registers in the bit set supplied. Don't push sp. +// Return the number of words pushed +int MacroAssembler::push(unsigned int bitset, Register stack) { + multiple_reg_check(bitset, stack); + unsigned bc = bitset, count = 0, i; + for(i = 0; i <= 15; i++) { + if (1 & bc) count++; + bc >>= 1; + } + // TODO Also why did it only do even quantities before? + stmdb(stack, bitset); + return count; +} + +int MacroAssembler::pop(unsigned int bitset, Register stack) { + multiple_reg_check(bitset, stack); + unsigned bc = bitset, count = 0, i; + for(i = 0; i <= 15; i++) { + if (1 & bc) count++; + bc >>= 1; + } + // TODO Also why did it only do even quantities before? + ldmia(stack, bitset); + return count; +} + +void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) { + Label done, not_weak; + cbz(value, done); // Use NULL as-is. 
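+  // Weak handles are distinguished by a tag in bit 0 of the handle
+  // (JNIHandles::weak_tag_mask == 1). For example (illustrative),
+  // handle 0x1001 is a jweak referring to slot 0x1000, while an
+  // untagged 0x1000 is an ordinary, strong jobject.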
+ + STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u); + tbz(value, 0, not_weak); // Test for jweak tag. + + // Resolve jweak. + + access_load_word_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, + value, Address(value, -JNIHandles::weak_tag_value), tmp, noreg); + verify_oop(value); + b(done); + + + bind(not_weak); + // Resolve (untagged) jobject. + access_load_word_at(T_OBJECT, IN_NATIVE, value, Address(value), tmp, noreg); + verify_oop(value); + bind(done); +} + +void MacroAssembler::stop(const char* msg) { + pusha(); + // Save old sp value + add(rscratch2, sp, 14 * wordSize); + str(rscratch2, Address(pre(sp, -4))); + mov(c_rarg0, (address)msg); + mov(c_rarg1, r15_pc); + sub(c_rarg1, c_rarg1, 8); // Restore to actual value + mov(c_rarg2, sp); + mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug32)); + bl(c_rarg3); + hlt(0); +} + +void MacroAssembler::unimplemented(const char* what) { + const char* buf = NULL; + { + ResourceMark rm; + stringStream ss; + ss.print("unimplemented: %s", what); + buf = code_string(ss.as_string()); + } + stop(buf); +} + +// this simulates the behaviour of the x86 cmpxchg instruction using a +// load linked/store conditional pair. we use the acquire/release +// versions of these instructions so that we flush pending writes as +// per Java semantics. + +// n.b the x86 version assumes the old value to be compared against is +// in rax and updates rax with the value located in memory if the +// cmpxchg fails. we supply a register for the old value explicitly + +// the aarch32 load linked/store conditional instructions do not +// accept an offset. so, unlike x86, we must provide a plain register +// to identify the memory word to be compared/exchanged rather than a +// register+offset Address. + +void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp, + Label &succeed, Label *fail) { + // oldv holds comparison value + // newv holds value to write in exchange + // addr identifies memory word to compare against/update + // tmp returns 0/1 for success/failure + Label retry_load, nope; + + bind(retry_load); + // flush and load exclusive from the memory location + // and fail if it is not what we expect + ldrex(tmp, addr); + cmp(tmp, oldv); + b(nope, Assembler::NE); + // if we store+flush with no intervening write tmp wil be zero + strex(tmp, newv, addr); + cmp(tmp, 0); + b(succeed, Assembler::EQ); + // retry so we only ever return after a load fails to compare + // ensures we don't return a stale value after a failed write. 
+ b(retry_load); + // if the memory word differs we return it in oldv and signal a fail + bind(nope); + membar(AnyAny); + mov(oldv, tmp); + if (fail) + b(*fail); +} + +void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp, + Label &succeed, Label *fail) { + assert(oopDesc::mark_offset_in_bytes() == 0, "assumption"); + cmpxchgptr(oldv, newv, obj, tmp, succeed, fail); +} + +void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp, + Label &succeed, Label *fail) { + // oldv holds comparison value + // newv holds value to write in exchange + // addr identifies memory word to compare against/update + // tmp returns 0/1 for success/failure + Label retry_load, nope; + + bind(retry_load); + // flush and load exclusive from the memory location + // and fail if it is not what we expect + ldrex(tmp, addr); + cmp(tmp, oldv); + b(nope, Assembler::NE); + // if we store+flush with no intervening write tmp will be zero + strex(tmp, newv, addr); + cmp(tmp, 0); + b(succeed, Assembler::EQ); + // retry so we only ever return after a load fails to compare; this + // ensures we don't return a stale value after a failed write. + b(retry_load); + // if the memory word differs we return it in oldv and signal a fail + bind(nope); + membar(AnyAny); + mov(oldv, tmp); + if (fail) + b(*fail); +} + +#ifndef PRODUCT +extern "C" void findpc(intptr_t x); +#endif + +void MacroAssembler::debug32(char* msg, int32_t pc, int32_t regs[]) +{ + print_unseen_bytecodes(); + // In order to get locks to work, we need to fake an in_VM state + if (ShowMessageBoxOnError) { + JavaThread* thread = JavaThread::current(); + JavaThreadState saved_state = thread->thread_state(); + thread->set_thread_state(_thread_in_vm); +#ifndef PRODUCT + if (CountBytecodes || TraceBytecodes || StopInterpreterAt) { + ttyLocker ttyl; + BytecodeCounter::print(); + } +#endif + if (os::message_box(msg, "Execution stopped, print registers?")) { + ttyLocker ttyl; + tty->print_cr(" pc = 0x%016x", pc); +#ifndef PRODUCT + tty->cr(); + findpc(pc); + tty->cr(); +#endif + tty->print_cr("THIS IS WRONG!"); + tty->print_cr(" r0 = 0x%016x", regs[0]); + tty->print_cr(" r1 = 0x%016x", regs[1]); + tty->print_cr(" r2 = 0x%016x", regs[2]); + tty->print_cr(" r3 = 0x%016x", regs[3]); + tty->print_cr(" r4 = 0x%016x", regs[4]); + tty->print_cr(" r5 = 0x%016x", regs[5]); + tty->print_cr(" r6 = 0x%016x", regs[6]); + tty->print_cr(" r7 = 0x%016x", regs[7]); + tty->print_cr(" r8 = 0x%016x", regs[8]); + tty->print_cr(" r9 = 0x%016x", regs[9]); + tty->print_cr("r10 = 0x%016x", regs[10]); + tty->print_cr("r11 = 0x%016x", regs[11]); + tty->print_cr("r12 = 0x%016x", regs[12]); + tty->print_cr("r13 = 0x%016x", regs[13]); + tty->print_cr("r14 = 0x%016x", regs[14]); + tty->print_cr("r15 = 0x%016x", regs[15]); + BREAKPOINT; + } + ThreadStateTransition::transition(thread, _thread_in_vm, saved_state); + } else { + { + ttyLocker ttyl; + ::tty->print_cr("=============== DEBUG MESSAGE: %s ================", msg); + ::tty->print_cr(" r0 [ arg0 ] = 0x%08x", regs[1]); + ::tty->print_cr(" r1 [ arg1 ] = 0x%08x", regs[2]); + ::tty->print_cr(" r2 [ arg2 ] = 0x%08x", regs[3]); + ::tty->print_cr(" r3 [ arg3 ] = 0x%08x", regs[4]); + ::tty->print_cr(" r4 [ rdispatch ] = 0x%08x", regs[5]); + ::tty->print_cr(" r5 [ rbcp ] = 0x%08x", regs[6]); + ::tty->print_cr(" r6 [ rlocals ] = 0x%08x", regs[7]); + ::tty->print_cr(" r7 [ rcpool ] = 0x%08x", regs[8]); + ::tty->print_cr(" r8 [ rmethod ] = 0x%08x", regs[9]); + ::tty->print_cr(" r9 [ rscratch1 ] =
0x%08x", regs[10]); + ::tty->print_cr("r10 [ rthread ] = 0x%08x", regs[11]); + ::tty->print_cr("r11 [ rfp ] = 0x%08x", regs[12]); + ::tty->print_cr("r12 [ rscratch2 ] = 0x%08x", regs[13]); + ::tty->print_cr("r13 [ sp ] = 0x%08x", regs[0]); + ::tty->print_cr("r14 [ lr ] = 0x%08x", regs[14]); + ::tty->print_cr("r15 [ pc ] = 0x%08x", pc); + } + assert(false, "DEBUG MESSAGE: %s", msg); + } +} + +void MacroAssembler::push_call_clobbered_registers() { + push(RegSet::range(r0, r3), sp); + if(hasFPU()) { + const int nfloat = 16; // number of callee-saved 32-bit float registers + vstmdb_f64(sp, (1 << nfloat/2) - 1); + } +} + +void MacroAssembler::pop_call_clobbered_registers() { + if(hasFPU()) { + const int nfloat = 16; // number of callee-saved 32-bit float registers + vldmia_f64(sp, (1 << nfloat/2) - 1); + } + pop(RegSet::range(r0, r3), sp); +} + +void MacroAssembler::push_CPU_state() { + // if fix this, update also RegisterSaved::save_live_registers and it's map + push(0x5fff, sp); // integer registers except sp & (aarch32 pc) + + if(hasFPU()) { + const int nfloat = FPUStateSizeInWords / 2; // saved by pairs + vstmdb_f64(sp, (1 << nfloat) - 1); + } else { + sub(sp, sp, FPUStateSizeInWords * wordSize); + } +} + +void MacroAssembler::pop_CPU_state() { + if(hasFPU()) { + const int nfloat = FloatRegisterImpl::number_of_registers / 2; + vldmia_f64(sp, (1 << nfloat) - 1); + } else { + add(sp, sp, FPUStateSizeInWords * wordSize); + } + + pop(0x5fff, sp); // integer registers except sp & (aarch32 pc) +} + +// appears this needs to round up! +void MacroAssembler::round_to(Register reg, int modulus) { + // from x86 + add(reg, reg, modulus - 1); + bic(reg, reg, modulus - 1); // and( reg, -modulus) +} + +SkipIfEqual::SkipIfEqual( + MacroAssembler* masm, const bool* flag_addr, bool value) { + _masm = masm; + _masm->mov(rscratch1, ExternalAddress((address)flag_addr)); + _masm->ldrb(rscratch1, rscratch1); + _masm->cmp(rscratch1, 0); + _masm->b(_label, value ? Assembler::NE : Assembler::EQ); +} + +SkipIfEqual::~SkipIfEqual() { + _masm->bind(_label); +} + +void MacroAssembler::cmpptr(Register src1, Address src2) { + mov(rscratch1, src2); + ldr(rscratch1, Address(rscratch1)); + cmp(src1, rscratch1); +} + +void MacroAssembler::cmpoop(Register obj1, Register obj2) { + BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); + bs->obj_equals(this, obj1, obj2); +} + +void MacroAssembler::load_klass(Register dst, Register src) { + ldr(dst, Address(src, oopDesc::klass_offset_in_bytes())); +} + +// ((OopHandle)result).resolve(); +void MacroAssembler::resolve_oop_handle(Register result, Register tmp) { + // OopHandle::resolve is an indirection. 
+ access_load_word_at(T_OBJECT, IN_NATIVE, result, Address(result), tmp, noreg); +} + +void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) { + const int mirror_offset = in_bytes(Klass::java_mirror_offset()); + ldr(dst, Address(method, Method::const_offset())); + ldr(dst, Address(dst, ConstMethod::constants_offset())); + ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes())); + ldr(dst, Address(dst, mirror_offset)); + resolve_oop_handle(dst, tmp); +} + +void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) { + ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes())); + cmp(trial_klass, tmp); +} + +void MacroAssembler::load_prototype_header(Register dst, Register src) { + load_klass(dst, src); + ldr(dst, Address(dst, Klass::prototype_header_offset())); +} + +void MacroAssembler::store_klass(Register dst, Register src) { + str(src, Address(dst, oopDesc::klass_offset_in_bytes())); +} + +void MacroAssembler::store_klass_gap(Register dst, Register src) { } + +void MacroAssembler::access_load_word_at(BasicType type, DecoratorSet decorators, + Register dst, Address src, + Register tmp1, Register thread_tmp) { + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + decorators = AccessInternal::decorator_fixup(decorators); + bool as_raw = (decorators & AS_RAW) != 0; + if (as_raw) { + bs->BarrierSetAssembler::load_word_at(this, decorators, type, dst, src, tmp1, thread_tmp); + } else { + bs->load_word_at(this, decorators, type, dst, src, tmp1, thread_tmp); + } +} + +void MacroAssembler::access_store_word_at(BasicType type, DecoratorSet decorators, + Address dst, Register src, + Register tmp1, Register thread_tmp) { + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + decorators = AccessInternal::decorator_fixup(decorators); + bool as_raw = (decorators & AS_RAW) != 0; + if (as_raw) { + bs->BarrierSetAssembler::store_word_at(this, decorators, type, dst, src, tmp1, thread_tmp); + } else { + bs->store_word_at(this, decorators, type, dst, src, tmp1, thread_tmp); + } +} + +void MacroAssembler::access_load_tos_at(BasicType type, DecoratorSet decorators, + Address src, + Register tmp1, Register thread_tmp) { + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + decorators = AccessInternal::decorator_fixup(decorators); + bool as_raw = (decorators & AS_RAW) != 0; + if (as_raw) { + bs->BarrierSetAssembler::load_tos_at(this, decorators, type, src, tmp1, thread_tmp); + } else { + bs->load_tos_at(this, decorators, type, src, tmp1, thread_tmp); + } +} + +void MacroAssembler::access_store_tos_at(BasicType type, DecoratorSet decorators, + Address dst, + Register tmp1, Register thread_tmp) { + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + decorators = AccessInternal::decorator_fixup(decorators); + bool as_raw = (decorators & AS_RAW) != 0; + if (as_raw) { + bs->BarrierSetAssembler::store_tos_at(this, decorators, type, dst, tmp1, thread_tmp); + } else { + bs->store_tos_at(this, decorators, type, dst, tmp1, thread_tmp); + } +} + +void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1, + Register thread_tmp, DecoratorSet decorators) { + access_load_word_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); +} + +void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1, + Register thread_tmp, DecoratorSet decorators) { + access_load_word_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators,
dst, src, tmp1, thread_tmp); +} + +void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1, + Register thread_tmp, DecoratorSet decorators) { + access_store_word_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); +} + +// Used for storing NULLs. +void MacroAssembler::store_heap_oop_null(Address dst, Register tmp) { + access_store_word_at(T_OBJECT, IN_HEAP, dst, noreg, tmp, noreg); +} + +Address MacroAssembler::allocate_metadata_address(Metadata* obj) { + assert(oop_recorder() != NULL, "this assembler needs a Recorder"); + int index = oop_recorder()->allocate_metadata_index(obj); + RelocationHolder rspec = metadata_Relocation::spec(index); + return Address((address)obj, rspec); +} + +// Move an oop into a register. immediate is true if we want +// immediate instructions, i.e. we are not going to patch this +// instruction while the code is being executed by another thread. In +// that case we can use move immediates rather than the constant pool. +void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) { + int oop_index; + if (obj == NULL) { + oop_index = oop_recorder()->allocate_oop_index(obj); + } else { +#ifdef ASSERT + { + ThreadInVMfromUnknown tiv; + assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); + } +#endif + oop_index = oop_recorder()->find_index(obj); + } + if (! immediate) { + far_load_oop(dst, oop_index); + } else { + RelocationHolder rspec = oop_Relocation::spec(oop_index); + mov(dst, Address((address)obj, rspec)); + } +} + +// Move a metadata address into a register. +void MacroAssembler::mov_metadata(Register dst, Metadata* obj) { + int oop_index; + if (obj == NULL) { + oop_index = oop_recorder()->allocate_metadata_index(obj); + } else { + oop_index = oop_recorder()->find_index(obj); + } + RelocationHolder rspec = metadata_Relocation::spec(oop_index); + mov(dst, Address((address)obj, rspec)); +} + +void MacroAssembler::far_load(Register dst, address addr) { + address far_load_addr = pc(); + add(dst, r15_pc, 0); + ldr(dst, Address(dst)); + + NativeFarLdr* far_load = (NativeFarLdr*) far_load_addr; + far_load->set_data_addr((intptr_t*) addr); +} + +void MacroAssembler::far_load_oop(Register dst, int oop_index) { + relocate(oop_Relocation::spec(oop_index)); + // can't provide meaningful addr, give far_load addr itself + far_load(dst, pc()); +} + +void MacroAssembler::far_load_metadata(Register dst, int metadata_index) { + relocate(metadata_Relocation::spec(metadata_index)); + // can't provide meaningful addr, give far_load addr itself + far_load(dst, pc()); +} + +void MacroAssembler::far_load_const(Register dst, address const_addr) { + relocate(section_word_Relocation::spec(const_addr, CodeBuffer::SECT_CONSTS)); + far_load(dst, const_addr); +} + +Address MacroAssembler::constant_oop_address(jobject obj) { +#ifdef ASSERT + { + ThreadInVMfromUnknown tiv; + assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); + assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop"); + } +#endif + int oop_index = oop_recorder()->find_index(obj); + return Address((address)obj, oop_Relocation::spec(oop_index)); +} + +// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
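+ // A typical fast path looks like (a sketch; the label and size are illustrative): + // Label slow_case; + // tlab_allocate(obj, noreg, instance_size, t1, t2, slow_case); + // // falls through with obj set; branches to slow_case when the TLAB is exhausted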
+void MacroAssembler::tlab_allocate(Register obj, + Register var_size_in_bytes, + int con_size_in_bytes, + Register t1, + Register t2, + Label& slow_case) { + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case); +} + +// Defines obj, preserves var_size_in_bytes +void MacroAssembler::eden_allocate(Register obj, + Register var_size_in_bytes, + int con_size_in_bytes, + Register t1, + Label& slow_case) { + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case); +} + +// Zero words; len is in bytes +// Destroys all registers except addr +// len must be a nonzero multiple of wordSize +void MacroAssembler::zero_memory(Register addr, Register len, Register t1) { + assert_different_registers(addr, len, t1, rscratch1, rscratch2); + +#ifdef ASSERT + { Label L; + tst(len, BytesPerWord - 1); + b(L, Assembler::EQ); + stop("len is not a multiple of BytesPerWord"); + bind(L); + } +#endif + +#ifndef PRODUCT + block_comment("zero memory"); +#endif + + Label loop; + Label entry; + +// Algorithm: +// +// scratch1 = cnt & 7; +// cnt -= scratch1; +// p += scratch1; +// switch (scratch1) { +// do { +// cnt -= 8; +// p[-8] = 0; +// case 7: +// p[-7] = 0; +// case 6: +// p[-6] = 0; +// // ... +// case 1: +// p[-1] = 0; +// case 0: +// p += 8; +// } while (cnt); +// } + + const int unroll = 8; // Number of str instructions we'll unroll + + lsr(len, len, LogBytesPerWord); + andr(rscratch1, len, unroll - 1); // tmp1 = cnt % unroll + sub(len, len, rscratch1); // cnt -= cnt % unroll + // t1 always points to the end of the region we're about to zero + add(t1, addr, rscratch1, lsl(LogBytesPerWord)); + adr(rscratch2, entry); + sub(rscratch2, rscratch2, rscratch1, lsl(2)); + mov(rscratch1, 0); + b(rscratch2); + bind(loop); + sub(len, len, unroll); + for (int i = -unroll; i < 0; i++) + str(rscratch1, Address(t1, i * wordSize)); + bind(entry); + add(t1, t1, unroll * wordSize); + cbnz(len, loop); +} + +void MacroAssembler::verify_tlab() { +#ifdef ASSERT + if (UseTLAB && VerifyOops) { + Label next, ok; + + strd(rscratch2, rscratch1, Address(pre(sp, -16))); + + ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset()))); + ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset()))); + cmp(rscratch2, rscratch1); + b(next, Assembler::HS); + STOP("assert(top >= start)"); + should_not_reach_here(); + + bind(next); + ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset()))); + ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset()))); + cmp(rscratch2, rscratch1); + b(ok, Assembler::HS); + STOP("assert(top <= end)"); + should_not_reach_here(); + + bind(ok); + ldrd(rscratch2, rscratch1, Address(post(sp, 16))); + } +#endif +} + +// Writes to stack successive pages until offset reached to check for +// stack overflow + shadow pages. This clobbers tmp. +void MacroAssembler::bang_stack_size(Register size, Register tmp) { + assert_different_registers(tmp, size, rscratch1); + mov(tmp, sp); + // Bang stack for total size given plus shadow page size. + // Bang one page at a time because large size can bang beyond yellow and + // red zones. + Label loop; + mov(rscratch1, os::vm_page_size()); + bind(loop); + lea(tmp, Address(tmp, -os::vm_page_size())); + subs(size, size, rscratch1); + str(size, Address(tmp)); + b(loop, Assembler::GT); + + // Bang down shadow pages too.
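+ // Touching every page of the shadow zone now forces a stack overflow to + // surface here, at a well-defined point, rather than at some arbitrary + // instruction after a frame has been set up below the banged region.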
+ // At this point, (tmp-0) is the last address touched, so don't + // touch it again. (It was touched as (tmp-pagesize) but then tmp + // was post-decremented.) Skip this address by pre-decrementing tmp + // before each store, and touch a few more pages below. N.B. It is + // important to touch all the way down to the end of the shadow zone. + for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) { + // this could be any sized move but it can serve as a debugging crumb, + // so the bigger the better. + lea(tmp, Address(tmp, -os::vm_page_size())); + str(size, Address(tmp)); + } +} + + +// Move the address of the polling page into dest. +void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) { + if (SafepointMechanism::uses_thread_local_poll()) { + ldr(dest, Address(rthread, Thread::polling_page_offset())); + } else { + mov(dest, Address(page, rtype)); + } +} + +// Move the address of the polling page into r, then read the polling +// page. +address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) { + get_polling_page(r, page, rtype); + return read_polling_page(r, rtype); +} + +address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) { + InstructionMark im(this); + code_section()->relocate(inst_mark(), rtype); + // It's ok to load to reg from reg + off (without write-back) + ldr(r, Address(r, 0)); + return inst_mark(); +} + +// Helper functions for 64-bit multiplication, division and remainder +// does <Rd+1:Rd> = <Rn+1:Rn> * <Rm+1:Rm> +void MacroAssembler::mult_long(Register Rd, Register Rn, Register Rm) { + Register Rdh = (Register)(Rd->encoding_nocheck() + 1); + Register Rnh = (Register)(Rn->encoding_nocheck() + 1); + Register Rmh = (Register)(Rm->encoding_nocheck() + 1); + + mult_long(Rd, Rdh, Rn, Rnh, Rm, Rmh); +} + +// does <Rdh:Rd> = <Rnh:Rn> * <Rmh:Rm> +void MacroAssembler::mult_long(Register Rd, Register Rdh, Register Rn, Register Rnh, Register Rm, Register Rmh) { + assert_different_registers(Rn, Rnh); + assert_different_registers(Rm, Rmh); + assert_different_registers(Rd, Rdh); // umull restriction + const Register t = rscratch1; + + mul(t, Rm, Rnh); + mla(t, Rn, Rmh, t); + umull(Rd, Rdh, Rm, Rn); + add(Rdh, t, Rdh); +} + + +int64_t internal_ldiv(int64_t a, int64_t b) { + return a / b; +} + +int64_t internal_lmod(int64_t a, int64_t b) { + return a % b; +} + +void MacroAssembler::divide32(Register res, Register num, Register den, bool want_mod) { + Register cnt = rscratch1; + Register mod = rscratch2; + Register sign = r14; + assert_different_registers(num, den, rscratch1, rscratch2, r14); + + // FIXME This works by first converting any negative values to positive ones, however + // it is not possible to express |INT_MIN|. Need to fix this + + // Convert to positive values + mov(sign, 0); + + cmp(num, 0); + mov(sign, 1, MI); + rsb(num, num, 0, MI); + + cmp(den, 0); + if(!want_mod) eor(sign, sign, 1, MI); + rsb(den, den, 0, MI); + + // Algorithm from + // http://www.chiark.greenend.org.uk/~theom/riscos/docs/ultimate/a252div.txt + // Graeme Williams + mov(cnt, 28); + mov(mod, num, lsr(4)); + cmp(den, mod, lsr(12)); + sub(cnt, cnt, 16, Assembler::LE); + mov(mod, mod, lsr(16), Assembler::LE); + cmp(den, mod, lsr(4)); + sub(cnt, cnt, 8, Assembler::LE); + mov(mod, mod, lsr(8), Assembler::LE); + cmp(den, mod); + sub(cnt, cnt, 4, Assembler::LE); + mov(mod, mod, lsr(4), Assembler::LE); + mov(num, num, lsl(cnt)); + rsb(den, den, 0); + + adds(num, num, num); + // Now skip over cnt copies of the 3-instruction loop.
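+ // The two adds below form a computed branch: cnt is first scaled to the + // number of instructions to skip (3 per iteration, 4 bytes each), and since + // reading r15 yields the address of the current instruction plus 8, the + // mov(r0, r0) is the padding swallowed by that +8 before execution lands + // cnt iterations into the unrolled loop below.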
+ add(cnt, cnt, cnt, lsl(1)); + add(r15_pc, r15_pc, cnt, lsl(2)); + mov(r0, r0); + + for(int i = 0; i < 32; i++) { + adcs(mod, den, mod, lsl(1)); + sub(mod, mod, den, Assembler::LO); + adcs(num, num, num); + } + + cmp(sign, 0); + rsb(res, want_mod? mod : num, 0, NE); + mov(res, want_mod? mod : num, EQ); +} + + +// <Rd> = <Rn> / <Rm> (width == 32) +// <Rd> = <Rn> % <Rm> (width == 32) +// <Rdh:Rd> = <Rnh:Rn> / <Rmh:Rm> (width == 64) +// <Rdh:Rd> = <Rnh:Rn> % <Rmh:Rm> (width == 64) +void MacroAssembler::divide(Register Rd, Register Rn, Register Rm, int width, bool want_remainder) { + // Dispatch to the best possible implementation + Register Rdh = (Register)(Rd->encoding_nocheck() + 1); + Register Rnh = (Register)(Rn->encoding_nocheck() + 1); + Register Rmh = (Register)(Rm->encoding_nocheck() + 1); + + assert(32 == width || 64 == width, "Invalid width"); + bool is64b = 64 == width; + + if(is64b) { + assert_different_registers(Rn, Rnh, Rm, Rmh, rscratch1, rscratch2); + } + + if(!is64b && VM_Version::features() & FT_HW_DIVIDE) { + // Emit a hw instruction sequence. + if(want_remainder) { + sdiv(rscratch1, Rn, Rm); + mls(Rd, rscratch1, Rm, Rn); + } else { + sdiv(Rd, Rn, Rm); + } + } else if(!is64b) { + // Fall back to assembly software routine + divide32(Rd, Rn, Rm, want_remainder); + } else { + // Fall back to C software routine for + // 64 bit divide/mod + if(Rn != r0) { + mov(rscratch1, Rm); + mov(rscratch2, Rmh); + + mov(r0, Rn); + mov(r1, Rnh); + + mov(r2, rscratch1); + mov(r3, rscratch2); + } else if(Rm != r2) { + mov(r2, Rm); + mov(r3, Rmh); + } + address function; + if(want_remainder) function = (address)internal_lmod; + else function = (address)internal_ldiv; + + mov(rscratch1, function); + bl(rscratch1); + if(Rd != r0) { + mov(Rd, r0); + if(is64b) mov(Rdh, r1); + } + } +} + +void MacroAssembler::extract_bits(Register dest, Register source, int lsb, int width) { + assert(lsb >= 0 && lsb + width <= 32 && width != 0, "Invalid lsb/width"); + // Dispatch to the best sequence + if(0 == (lsb & 7) && (width == 8 || width == 16 || width == 32)) { + // Can use extend X + switch(width){ + case 8: uxtb(dest, source, ror(lsb)); break; + case 16: uxth(dest, source, ror(lsb)); break; + case 32: if(dest != source) mov(dest, source); break; // full word, lsb == 0 + default: break; + } + } else if(VM_Version::features() & (FT_ARMV7 | FT_ARMV6T2)) { + ubfx(dest, source, lsb, width); + } else { + // Do two shifts + lsl(dest, source, 32 - (width + lsb)); + lsr(dest, dest, 32 - width); + } +} + + +void MacroAssembler::atomic_ldrd(Register Rt, Register Rt2, Register Rbase) { + assert(Rt->encoding_nocheck() % 2 == 0, "Must be an even register"); + assert((Register) (Rt + 1) == Rt2, "Must be contiguous"); + if(VM_Version::features() & FT_SINGLE_CORE) { + ldrd(Rt, Rbase); + } else if (VM_Version::features() & (FT_ARMV7 | FT_ARMV6K)) { +#ifdef ASSERT + Label lbl; + tst(Rbase, 7); + b(lbl, EQ); + stop("atomic_ldrd is not doubleword aligned!"); + bind(lbl); +#endif // ASSERT + + ldrexd(Rt, Rbase); + } else { + // TODO: Find Java way of logging + static bool warning_printed = false; + if(!warning_printed) { + fprintf(stderr, "Unable to provide atomic doubleword load.\n"); + warning_printed = true; + } + ldrd(Rt, Rbase); + } +} + +void MacroAssembler::atomic_strd(Register Rt, Register Rt2, Register Rbase, + Register temp, Register temp2) { + assert(Rt->encoding_nocheck() % 2 == 0, "Must be an even register"); + assert((Register) (Rt + 1) == Rt2, "Must be contiguous"); + assert((Register) (temp + 1) == temp2, "Must be contiguous"); + assert_different_registers(temp, Rt, Rbase, temp2); + if(VM_Version::features() & FT_SINGLE_CORE) { + strd(Rt, Rbase); + } else if (VM_Version::features() & (FT_ARMV7 | FT_ARMV6K)) { + // First need to gain exclusive
access + Label retry; + +#ifdef ASSERT + tst(Rbase, 7); + b(retry, EQ); + stop("atomic_strd is not doubleword aligned!"); +#endif // ASSERT + + bind(retry); + ldrexd(temp, Rbase); + strexd(temp, Rt, Rbase); + cmp(temp, 0); + b(retry, NE); + } else { + // TODO: Find Java way of logging + static bool warning_printed = false; + if(!warning_printed) { + fprintf(stderr, "Unable to provide atomic doubleword store.\n"); + warning_printed = true; + } + strd(Rt, Rbase); + } +} + + +#define ENABLE_DEBUGGING 0 +// Helloworld is 2,482,397 +uint32_t MacroAssembler::bytecodes_until_print = 2400000; //13795328; //6888000L; //6881772L; + +uint32_t MacroAssembler::bytecodes_executed = 0; + +int MacroAssembler::enable_debug = 0; +int MacroAssembler::enable_method_debug = 0; +int MacroAssembler::enable_debugging_static = ENABLE_DEBUGGING; + +#define N_J_BYTECODES 238 +const char* j_bytecodes[N_J_BYTECODES] = {"nop", "aconstnull", "iconstm1", "iconst0", "iconst1", "iconst2", "iconst3", "iconst4", "iconst5", "lconst0", +"lconst1", "fconst0", "fconst1", "fconst2", "dconst0", "dconst1", "bipush", "sipush", "ldc", "ldcw", "ldc2w", +"iload", "lload", "fload", "dload", "aload", "iload0", "iload1", "iload2", "iload3", "lload0", "lload1", "lload2", +"lload3", "fload0", "fload1", "fload2", "fload3", "dload0", "dload1", "dload2", "dload3", "aload0", "aload1", "aload2", +"aload3", "iaload", "laload", "faload", "daload", "aaload", "baload", "caload", "saload", "istore", "lstore", "fstore", +"dstore", "astore", "istore0", "istore1", "istore2", "istore3", "lstore0", "lstore1", "lstore2", "lstore3", "fstore0", +"fstore1", "fstore2", "fstore3", "dstore0", "dstore1", "dstore2", "dstore3", "astore0", "astore1", "astore2", "astore3", +"iastore", "lastore", "fastore", "dastore", "aastore", "bastore", "castore", "sastore", "pop", "pop2", "dup", "dupx1", +"dupx2", "dup2", "dup2x1", "dup2x2", "swap", "iadd", "ladd", "fadd", "dadd", "isub", "lsub", "fsub", "dsub", "imul", +"lmul", "fmul", "dmul", "idiv", "ldiv", "fdiv", "ddiv", "irem", "lrem", "frem", "drem", "ineg", "lneg", "fneg", "dneg", +"ishl", "lshl", "ishr", "lshr", "iushr", "lushr", "iand", "land", "ior", "lor", "ixor", "lxor", "iinc", "i2l", "i2f", +"i2d", "l2i", "l2f", "l2d", "f2i", "f2l", "f2d", "d2i", "d2l", "d2f", "i2b", "i2c", "i2s", "lcmp", "fcmpl", "fcmpg", +"dcmpl", "dcmpg", "ifeq", "ifne", "iflt", "ifge", "ifgt", "ifle", "ificmpeq", "ificmpne", "ificmplt", "ificmpge", +"ificmpgt", "ificmple", "ifacmpeq", "ifacmpne", "goto", "jsr", "ret", "tableswitch", "lookupswitch", "ireturn", +"lreturn", "freturn", "dreturn", "areturn", "return", "getstatic", "putstatic", "getfield", "putfield", +"invokevirtual", "invokespecial", "invokestatic", "invokeinterface", "invokedynamic", "new", "newarray", +"anewarray", "arraylength", "athrow", "checkcast", "instanceof", "monitorenter", "monitorexit", "wide", +"multianewarray", "ifnull", "ifnonnull", "gotow", "jsrw", "breakpoint", "fast_agetfield", "fast_bgetfield", +"fast_cgetfield", "fast_dgetfield", "fast_fgetfield", "fast_igetfield", "fast_lgetfield", "fast_sgetfield", +"fast_aputfield", "fast_bputfield", "fast_cputfield", "fast_dputfield", "fast_fputfield", "fast_iputfield", +"fast_lputfield", "fast_sputfield", "fast_aload_0", "fast_iaccess_0", "fast_aaccess_0", "fast_faccess_0", +"fast_iload", "fast_iload2", "fast_icaload", "fast_invokevfinal", "fast_linearswitch", "fast_binaryswitch", +"fast_aldc", "fast_aldc_w", "return_register_finalizer", "invokehandle", "nofast_getfield", "nofast_putfield", +"nofast_aload_0", "nofast_iload", 
"INVALID"}; + +int bytecodes_seen[256]; + +void MacroAssembler::init_unseen_bytecodes() { + for(int i = 0; i < 256; i++ ) { + bytecodes_seen[i] = 0; + } +} + +void MacroAssembler::bytecode_seen(Register bc_reg, Register scratch) { + if(ENABLE_DEBUGGING) { + mov(scratch, (address)bytecodes_seen); + add(scratch, scratch, bc_reg, lsl(2)); + add(bc_reg, bc_reg, 1); + str(bc_reg, Address(scratch)); + sub(bc_reg, bc_reg, 1); + } +} + +void MacroAssembler::print_unseen_bytecodes() { + if(ENABLE_DEBUGGING) { + printf("=== Unseen bytecodes ===\n"); + for(int i = 0; i < N_J_BYTECODES; i++) { + if(0 == bytecodes_seen[i]) { + printf("\t%s\n", j_bytecodes[i]); + } + } + printf("=== End unseen ===\n"); + } else { + printf("Not kept track, enable debugging to view info\n"); + } + fflush(stdout); +} + +int machine_state_regset = 0b0101111111111111; +int machine_state_float_regset = 0b11; + +void MacroAssembler::save_machine_state() { + stmdb(sp, machine_state_regset); + if(hasFPU()) { + vstmdb_f64(sp, machine_state_float_regset); + } + enter(); +} + +void MacroAssembler::restore_machine_state() { + leave(); + if(hasFPU()) { + vldmia_f64(sp, machine_state_float_regset); + } + ldmia(sp, machine_state_regset); +} + +void internal_internal_printf(const char *fmt, ...) { + va_list args; + va_start (args, fmt); + vprintf (fmt, args); + fflush(stdout); + va_end(args); +} + +void internal_printf(const char *format, uint32_t a, uint32_t b, uint32_t c) { + char buf[2048]; + char fmt[2048]; + buf[0] = '\0'; + const char *thread_str = "THREAD 0x%08x : "; + int id = pthread_self(); + strcpy(fmt, format); + + char *str = strtok(fmt, "\n"); + int nreplace = 0; + while(str) { + strcpy(buf, thread_str); + strcat(buf, str); + strcat(buf, "\n"); + internal_internal_printf((const char*)buf, id, a, b, c); + str = strtok(NULL, "\n"); + } +} + +void MacroAssembler::get_bytecode(Register dst, Register bc) { + if(ENABLE_DEBUGGING) { + int nbytecodes = N_J_BYTECODES; + mov(dst, (address)j_bytecodes); + cmp(bc, nbytecodes); + + ldr(dst, Address(dst, bc, lsl(2)), Assembler::LT); + ldr(dst, Address(dst, wordSize * nbytecodes), Assembler::GE); + } +} + +int invocation_depth_count = -1; //TODO remove this with debugging info + +#define MAX_FCALL_DEPTH 4096 +struct thread_method_record{ + int thread_id; + char names[MAX_FCALL_DEPTH][512]; + int invocation_depth_count; +}; +int ntmrs = 0; +#define MAX_TMRS 10 +thread_method_record tmr_list[MAX_TMRS]; + +void push_tmr(Method *meth, int *thread_id, int *invocation_depth_count, char **name) { + int id = pthread_self(); + *thread_id = id; + for(int i = 0; i < ntmrs; i++) { + thread_method_record *tmr = &tmr_list[i]; + if(id == tmr->thread_id) { + // Add a new frame + if(tmr->invocation_depth_count >= -1 && + tmr->invocation_depth_count < (MAX_FCALL_DEPTH - 1)) { + *invocation_depth_count = ++(tmr->invocation_depth_count); + *name = tmr->names[tmr->invocation_depth_count]; + meth->name_and_sig_as_C_string(tmr->names[tmr->invocation_depth_count], 512); + return; + } else { + fprintf(stderr, "%s : Invalid fcall depth index, %d\n", __FUNCTION__, tmr->invocation_depth_count); + exit(1); + } + } + } + // Add a new thread + if(ntmrs >= MAX_TMRS) { + fprintf(stderr, "Too many tmrs\n"); + exit(1); + } + //Create a new tmr + tmr_list[ntmrs].thread_id = id; + tmr_list[ntmrs].invocation_depth_count = 0; + meth->name_and_sig_as_C_string(tmr_list[ntmrs].names[0], 512); + *invocation_depth_count = 0; + *name = tmr_list[ntmrs].names[0]; + ntmrs++; +} + +void pop_tmr(int *thread_id, int 
*invocation_depth_count, char **name) { + int id = pthread_self(); + *thread_id = id; + for(int i = 0; i < ntmrs; i++) { + thread_method_record *tmr = &tmr_list[i]; + if(id == tmr->thread_id) { + if(tmr->invocation_depth_count >= 0 && + tmr->invocation_depth_count < MAX_FCALL_DEPTH) { + // Pop frame + *name = tmr->names[tmr->invocation_depth_count]; + *invocation_depth_count = (tmr->invocation_depth_count)--; + return; + } else if ( -1 == tmr->invocation_depth_count) { + *name = (char*)"JVM-EXCEPTION-EXIT:(NOT-REALLY-A-FRAME)"; + *invocation_depth_count = 0; + return; + } else { + fprintf(stderr, "%s : Invalid fcall depth index, %d\n", __FUNCTION__, tmr->invocation_depth_count); + exit(1); + } + } + } + fprintf(stderr, "Unable to find suitable tmr\n"); + exit(1); +} + +void prepare_entry_exit_prefix(char *buf, int id, int invocation_depth_count) { + sprintf(buf, "THREAD 0x%08x : ", id); + for(int i = 0; i < invocation_depth_count; i++) { + strcat(buf, " "); + } +} + + +void print_entry(Method *meth, int native) { + char *name; + int invocation_depth_count, id; + push_tmr(meth, &id, &invocation_depth_count, &name); + + if(MacroAssembler::enable_method_debug) { + char buf[4096], buf_b[2048]; + prepare_entry_exit_prefix(buf, id, invocation_depth_count); + if(native) { + sprintf(buf_b, "CALL NATIVE : %s\n", name); + } else { + sprintf(buf_b, "CALL JAVA : %s\n", name); + } + strcat(buf, buf_b); + printf("%s", buf); + fflush(stdout); + } +} + +void print_exit(bool normal) { + char *name; + int invocation_depth_count, id; + pop_tmr(&id, &invocation_depth_count, &name); + + if(MacroAssembler::enable_method_debug) { + char buf[4096], buf_b[2048]; + prepare_entry_exit_prefix(buf, id, invocation_depth_count); + sprintf(buf_b, normal ? "EXIT : %s\n" : "EXCPN EXIT : %s\n", name); + strcat(buf, buf_b); + printf("%s", buf); + fflush(stdout); + } +} + +void MacroAssembler::print_method_entry(Register rmethod, bool native) { + if(ENABLE_DEBUGGING) { + save_machine_state(); + + bic(sp, sp, 7); // 8-byte align stack + mov(rscratch2, (address)print_entry); + mov(r0, rmethod); + mov(r1, native); + bl(rscratch2); + + restore_machine_state(); + } +} + +void MacroAssembler::print_method_exit(bool normal) { + if(ENABLE_DEBUGGING) { + save_machine_state(); + + bic(sp, sp, 7); // 8-byte align stack + mov(rscratch2, (address)print_exit); + mov(r0, normal); + bl(rscratch2); + + restore_machine_state(); + } +} + +void MacroAssembler::reg_printf_internal(bool important, const char *fmt, Register ra, Register rb, Register rc) { + if(ENABLE_DEBUGGING) { + Label skip; + save_machine_state(); + + mov(rscratch1, ra); + str(rscratch1, Address(pre(sp, -wordSize))); + mov(rscratch1, rb); + str(rscratch1, Address(pre(sp, -wordSize))); + mov(rscratch1, rc); + str(rscratch1, Address(pre(sp, -wordSize))); + + if(!important) { + mov(r0, (address)&enable_debug); + ldr(r0, Address(r0)); + cmp(r0, 0); + b(skip, Assembler::EQ); + } + + int sp_difference = wordSize * (count_bits(machine_state_regset) + + 2 * count_bits(machine_state_float_regset) + + 2 + 3); //Frame entry and saved + + mov(r0, (address)fmt); + if(ra != sp) ldr(r1, Address(sp, 2 * wordSize)); + else add(r1, sp, sp_difference); + + if(rb != sp) ldr(r2, Address(sp, wordSize)); + else add(r2, sp, sp_difference); + + if(rc != sp) ldr(r3, Address(sp)); + else add(r3, sp, sp_difference); + + bic(sp, sp, 7); // 8-byte align stack + + mov(rscratch2, (address)internal_printf); + bl(rscratch2); + + bind(skip); + restore_machine_state(); + } +} + +void 
MacroAssembler::reg_printf(const char *fmt, Register ra, Register rb, Register rc) { + reg_printf_internal(false, fmt, ra, rb, rc); +} + +void MacroAssembler::reg_printf_important(const char *fmt, Register ra, Register rb, Register rc) { + reg_printf_internal(true, fmt, ra, rb, rc); +} + +// When debugging, set the break on bkpnt +void bkpnt() { return; } +void MacroAssembler::create_breakpoint() { + if(ENABLE_DEBUGGING) { + save_machine_state(); + bic(sp, sp, 7); // 8-byte align stack + + mov(rscratch2, (address) bkpnt); + bl(rscratch2); + + restore_machine_state(); + } +} + + +void MacroAssembler::print_cpool(InstanceKlass *klass) { + ttyLocker ttyl; + klass->constants()->print_on(tty); +} + +int MacroAssembler::ldrd(Register Rt, Register Rt2, const Address& adr, Register Rtmp, Condition cond) { + if((0 == Rt->encoding_nocheck() % 2 && + (Rt->encoding_nocheck() + 1 == Rt2->encoding_nocheck())) && + (uabs(adr.offset()) < (1 << 8))) { + /* Good to go with a ldrd */ + ldrd(Rt, adr, cond); + return 0x0; + } else { + return double_ld_failed_dispatch(Rt, Rt2, adr, &Assembler::ldm, + &Assembler::ldr, Rtmp, cond); + } +} + +int MacroAssembler::strd(Register Rt, Register Rt2, const Address& adr, Condition cond) { + if((0 == Rt->encoding_nocheck() % 2 && + (Rt->encoding_nocheck() + 1 == Rt2->encoding_nocheck())) && + (uabs(adr.offset()) < (1 << 8))) { + /* Good to go with a strd */ + strd(Rt, adr, cond); + } else { + double_ldst_failed_dispatch(Rt, Rt2, adr, &Assembler::stm, &Assembler::str, cond); + } + return 0x0; +} + +int MacroAssembler::double_ld_failed_dispatch(Register Rt, Register Rt2, const Address& adr, + void (Assembler::* mul)(unsigned, const Address&, Condition), + void (Assembler::* sgl)(Register, const Address&, Condition), + Register Rtmp, Condition cond) { + if (can_ldst_multiple(RegSet::of(Rt, Rt2).bits(), adr) && + (Rt->encoding_nocheck() < Rt2->encoding_nocheck())) { + /* Do a load or store multiple instruction */ + (this->*mul)(RegSet::of(Rt, Rt2).bits(), adr, cond); + } else if (!adr.uses(Rt)) { + double_ldst_failed_dispatch(Rt, Rt2, adr, mul, sgl, cond); + } else { + // need to reshuffle operation, otherwise write to Rt destroys adr + if (adr.get_mode() != Address::reg) { + // offset-based addressing. hence Rt2 could not be used by adr + if (adr.get_wb_mode() == Address::pre) { + (this->*sgl)(Rt2, Address(pre(adr.base(), adr.offset() + wordSize)), cond); + (this->*sgl)(Rt, Address(pre(adr.base(), -wordSize)), cond); + } else if (adr.get_wb_mode() == Address::post) { + (this->*sgl)(Rt2, Address(adr.base(), adr.offset() + wordSize), cond); + (this->*sgl)(Rt, adr, cond); + } else if (adr.get_wb_mode() == Address::off) { + (this->*sgl)(Rt2, Address(adr.base(), adr.offset() + wordSize), cond); + (this->*sgl)(Rt, adr, cond); + } else { + ShouldNotReachHere(); + } + } else { + // index-based addressing. both Rt and Rt2 could be used by adr, + // hence a temp register is necessary + adr.lea(this, Rtmp); + double_ldst_failed_dispatch(Rt, Rt2, Address(Rtmp), mul, sgl, cond); + // adr.lea does only address manipulation and cannot cause a trap.
+ // the first instruction that can cause an NPE is in double_ldst_failed_dispatch, + // so shift the reported offset appropriately + return 0x4; + } + } + return 0x0; +} + +void MacroAssembler::double_ldst_failed_dispatch(Register Rt, Register Rt2, const Address& adr, + void (Assembler::* mul)(unsigned, const Address&, Condition), + void (Assembler::* sgl)(Register, const Address&, Condition), + Condition cond) { + if (can_ldst_multiple(RegSet::of(Rt, Rt2).bits(), adr) && + (Rt->encoding_nocheck() < Rt2->encoding_nocheck())) { + /* Do a store multiple instruction */ + (this->*mul)(RegSet::of(Rt, Rt2).bits(), adr, cond); + } else { + if (adr.get_mode() != Address::reg) { + // offset-based addressing + if (adr.get_wb_mode() == Address::pre) { + (this->*sgl)(Rt, adr, cond); + (this->*sgl)(Rt2, Address(adr.base(), wordSize), cond); + } else if (adr.get_wb_mode() == Address::post) { + (this->*sgl)(Rt, adr, cond); + (this->*sgl)(Rt2, Address(adr.base(), wordSize - adr.offset()), cond); + } else if (adr.get_wb_mode() == Address::off) { + (this->*sgl)(Rt, adr, cond); + (this->*sgl)(Rt2, Address(adr.base(), adr.offset() + wordSize), cond); + } else { + ShouldNotReachHere(); + } + } else { + // index-based addressing + if (adr.get_wb_mode() == Address::pre) { + // current implementation does not use Address::pre for indexed access + ShouldNotReachHere(); + } else if (adr.get_wb_mode() == Address::post) { + // current implementation does not use Address::post for indexed access + // enable the code below and implement proper post() method if it is required +#if 0 + (this->*sgl)(Rt, Address(post(adr.base(), wordSize)), cond); + (this->*sgl)(Rt2, Address(post(adr.base(), adr.index(), adr.shift())), cond); + sub(adr.base(), wordSize, cond); +#endif + ShouldNotReachHere(); + } else if (adr.get_wb_mode() == Address::off) { + (this->*sgl)(Rt, Address(pre(adr.base(), adr.index(), adr.shift(), adr.op())), cond); + (this->*sgl)(Rt2, Address(adr.base(), wordSize), cond); + compensate_addr_offset(adr, cond); + } else { + ShouldNotReachHere(); + } + } + } +} + +#ifdef ASSERT +void MacroAssembler::verify_stack_alignment() { + if (StackAlignmentInBytes > 4) { + Label x; + tst(sp, StackAlignmentInBytes-1); + b(x, EQ); + stop("stack unaligned"); + bind(x); + } +} +#endif + +/** + * Code for BigInteger::multiplyToLen() intrinsic. + * + * r0: x + * r1: xlen + * r2: y + * r3: ylen + * r4: z + * r5: zlen + * + */ +void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, + Register z, Register zlen, + Register tmp1, Register tmp2, Register tmp3, Register tmp4, + Register tmp5, Register tmp6) { + + assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); + + const Register xc = xlen; + const Register yc = tmp1; + const Register zc = tmp2; + + const Register vz = tmp3; + const Register carry = tmp4; + const Register vx = tmp5; + const Register vy = tmp6; + + // ensure y (inner cycle) is shorter than x (outer cycle); in theory this uses the CPU caches more effectively + Label L_x_longer; + cmp(xlen, ylen); + b(L_x_longer, Assembler::GE); +#define SWP(X, Y) \ + mov(tmp1, Y); \ + mov(Y, X); \ + mov(X, tmp1) + SWP(x, y); + SWP(xlen, ylen); + bind(L_x_longer); + + lea(xc, Address(x, xlen, lsl(LogBytesPerInt))); // x[xstart] + lea(yc, Address(y, ylen, lsl(LogBytesPerInt))); // y[idx] + lea(zc, Address(z, zlen, lsl(LogBytesPerInt))); // z[kdx] + + // First Loop.
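+ // (Both loops lean on umaal: umaal(lo, hi, a, b) computes the 64-bit value + // a*b + hi + lo and writes the low word to lo and the high word to hi - + // one full column of the schoolbook multiply, carries included.)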
+ // + // final static long LONG_MASK = 0xffffffffL; + // int xstart = xlen - 1; + // int ystart = ylen - 1; + // long carry = 0; + // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { + // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; + // z[kdx] = (int)product; + // carry = product >>> 32; + // } + // z[xstart] = (int)carry; + // + + ldr(vx, Assembler::pre(xc, -BytesPerInt)); + mov(carry, 0); + + Label L_loop_1; + bind(L_loop_1); + ldr(vy, Assembler::pre(yc, -BytesPerInt)); + mov(vz, 0); + umaal(vz, carry, vx, vy); + str(vz, Assembler::pre(zc, -BytesPerInt)); + cmp(yc, y); + b(L_loop_1, Assembler::GT); + + str(carry, Address(zc, -BytesPerInt)); + + // Second and third (nested) loops. + // + // for (int i = xstart-1; i >= 0; i--) { // Second loop + // carry = 0; + // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop + // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + + // (z[k] & LONG_MASK) + carry; + // z[k] = (int)product; + // carry = product >>> 32; + // } + // z[i] = (int)carry; + // } + // + Label L_loop_2, L_loop_3; + bind(L_loop_2); + + sub(zlen, zlen, 1); + lea(yc, Address(y, ylen, lsl(LogBytesPerInt))); // y[jdx] + lea(zc, Address(z, zlen, lsl(LogBytesPerInt))); // z[k] + + ldr(vx, Assembler::pre(xc, -BytesPerInt)); + mov(carry, 0); + + bind(L_loop_3); + ldr(vy, Assembler::pre(yc, -BytesPerInt)); + ldr(vz, Assembler::pre(zc, -BytesPerInt)); // r1 is vz, r2 is carry + umaal(vz, carry, vx, vy); + str(vz, Address(zc)); + cmp(yc, y); + b(L_loop_3, Assembler::GT); + + str(carry, Address(zc, -BytesPerInt)); + cmp(xc, x); + b(L_loop_2, Assembler::GT); +} + +/** + * Code for BigInteger::mulAdd() intrinsic. + * + * r0: out + * r1: in + * r2: offset + * r3: len + * r4: k + */ +void MacroAssembler::mul_add(Register out, Register in, Register offset, Register len, Register k, + Register tmp1, Register tmp2, Register tmp3) { + + assert_different_registers(out, in, offset, len, k, tmp1, tmp2, tmp3); + + Register vin = tmp1; + Register vout = tmp2; + Register carry = tmp3; + Register result = r0; + +// long kLong = k & LONG_MASK; +// long carry = 0; +// +// offset = out.length-offset - 1; +// for (int j=len-1; j >= 0; j--) { +// long product = (in[j] & LONG_MASK) * kLong + +// (out[offset] & LONG_MASK) + carry; +// out[offset--] = (int)product; +// carry = product >>> 32; +// } +// return (int)carry; + + lea(in, Address(in, len, lsl(LogBytesPerInt))); + lea(out, Address(out, offset, lsl(LogBytesPerInt))); + mov(carry, 0); + + Label L_loop; + bind(L_loop); + ldr(vin, Assembler::pre(in, -BytesPerInt)); + ldr(vout, Assembler::pre(out, -BytesPerInt)); + umaal(vout, carry, vin, k); + str(vout, Address(out)); + subs(len, len, 1); + b(L_loop, Assembler::GT); + + mov(result, carry); +} + +/** + * Emits code to update CRC-32 with a byte value according to constants in table + * + * @param [in,out]crc Register containing the crc. + * @param [in]val Register containing the byte to fold into the CRC. + * @param [in]table Register containing the table of crc constants. + * + * uint32_t crc; + * val = crc_table[(val ^ crc) & 0xFF]; + * crc = val ^ (crc >> 8); + * + */ +void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { + eor(val, val, crc); + andr(val, val, 0xff); + ldr(val, Address(table, val, lsl(2))); + eor(crc, val, crc, Assembler::lsr(8)); +} + +/** + * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3 + * + * @param [in,out]crc Register containing the crc.
+ * @param [in]v Register containing the 32-bit to fold into the CRC. + * @param [in]table0 Register containing table 0 of crc constants. + * @param [in]table1 Register containing table 1 of crc constants. + * @param [in]table2 Register containing table 2 of crc constants. + * @param [in]table3 Register containing table 3 of crc constants. + * + * uint32_t crc; + * v = crc ^ v + * crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24] + * + */ +void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp, + Register tmp2, Register table0, Register table1, Register table2, Register table3) { + eor(v, crc, v); + uxtb(tmp, v); + uxtb(tmp2, v, ror(8)); + ldr(crc, Address(table3, tmp, lsl(2))); + ldr(tmp2, Address(table2, tmp2, lsl(2))); + uxtb(tmp, v, ror(16)); + eor(crc, crc, tmp2); + uxtb(tmp2, v, ror(24)); + ldr(tmp, Address(table1, tmp, lsl(2))); + ldr(tmp2, Address(table0, tmp2, lsl(2))); + eor(crc, crc, tmp); + eor(crc, crc, tmp2); +} + +/** + * @param crc register containing existing CRC (32-bit) + * @param buf register pointing to input byte buffer (byte*) + * @param len register containing number of bytes + * @param table register that will contain address of CRC table + * @param tmp scratch register + */ +void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, + Register table0, Register table1, Register table2, Register table3, + Register tmp, Register tmp2, Register tmp3, int is_crc32c) { + Label L_cpu, L_by8_loop, L_by1, L_by1_loop, L_align_by1_loop, L_align_exit, L_exit; + + if (!is_crc32c) + inv(crc, crc); + if (UseCRC32) { + Label CRC_by4_loop, CRC_by1_loop; + + subs(len, len, 4); + b(CRC_by4_loop, Assembler::GE); + adds(len, len, 4); + b(CRC_by1_loop, Assembler::GT); + b(L_exit); + + BIND(CRC_by4_loop); + ldr(tmp, Address(post(buf, 4))); + subs(len, len, 4); + if (!is_crc32c) + crc32w(crc, crc, tmp); + else // is_crc32c + crc32cw(crc, crc, tmp); + b(CRC_by4_loop, Assembler::GE); + adds(len, len, 4); + b(L_exit, Assembler::LE); + BIND(CRC_by1_loop); + ldrb(tmp, Address(post(buf, 1))); + subs(len, len, 1); + if (!is_crc32c) + crc32b(crc, crc, tmp); + else // is_crc32c + crc32cb(crc, crc, tmp); + b(CRC_by1_loop, Assembler::GT); + BIND(L_exit); + if (!is_crc32c) + inv(crc, crc); + return; + } + lea(table0, ExternalAddress( + !is_crc32c ? 
+ StubRoutines::crc_table_addr() : + StubRoutines::crc32c_table_addr() )); + add(table1, table0, 1*256*sizeof(juint)); + add(table2, table0, 2*256*sizeof(juint)); + add(table3, table0, 3*256*sizeof(juint)); + + BIND(L_align_by1_loop); + tst(buf, 3); + b(L_align_exit, Assembler::EQ); + cmp(len, 0); + b(L_exit, Assembler::EQ); + sub(len, len, 1); + ldrb(tmp, Address(post(buf, 1))); + update_byte_crc32(crc, tmp, table0); + b(L_align_by1_loop); + + BIND(L_align_exit); + + if(VM_Version::features() & FT_AdvSIMD) { + if (UseNeon) { + cmp(len, 32+12); // account for possible need for alignment + b(L_cpu, Assembler::LT); + + Label L_fold, L_align_by4_loop, L_align_by4_exit; + + BIND(L_align_by4_loop); + tst(buf, 0xf); + b(L_align_by4_exit, Assembler::EQ); + ldr(tmp, Address(post(buf, 4))); + update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3); + sub(len, len, 4); + b(L_align_by4_loop); + + BIND(L_align_by4_exit); + + add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants + + vld1_64(d0, d1, post(buf, 16), Assembler::ALIGN_128); + vld1_64(d4, post(tmp, 8), Assembler::ALIGN_64); + vld1_64(d5, post(tmp, 8), Assembler::ALIGN_64); + vld1_64(d6, post(tmp, 8), Assembler::ALIGN_64); + vld1_64(d7, post(tmp, 8), Assembler::ALIGN_64); + veor_64(d16, d16, d16); + vmov_32(d16, 0, crc); + + veor_64(d0, d0, d16); + sub(len, len, 32); + + BIND(L_fold); + vmullp_8(q8, d0, d5); + vmullp_8(q9, d0, d7); + vmullp_8(q10, d0, d4); + vmullp_8(q11, d0, d6); + + vmullp_8(q12, d1, d5); + vmullp_8(q13, d1, d7); + vmullp_8(q14, d1, d4); + vmullp_8(q15, d1, d6); + + vuzp_128_16(q9, q8); + veor_128(q8, q8, q9); + + vuzp_128_16(q13, q12); + veor_128(q12, q12, q13); + + vshll_16u(q9, d16, 8); + vshll_16u(q8, d17, 8); + + vshll_16u(q13, d24, 8); + vshll_16u(q12, d25, 8); + + veor_128(q8, q8, q10); + veor_128(q12, q12, q14); + veor_128(q9, q9, q11); + veor_128(q13, q13, q15); + + veor_64(d19, d19, d18); + veor_64(d18, d27, d26); + + vshll_32u(q13, d18, 16); + vshll_32u(q9, d19, 16); + + veor_128(q9, q8, q9); + veor_128(q13, q12, q13); + + veor_64(d31, d26, d27); + veor_64(d30, d18, d19); + + vshl_128_64(q15, q15, 1); + vld1_64(d0, d1, post(buf, 16), Assembler::ALIGN_128); + veor_128(q0, q0, q15); + + subs(len, len, 16); + b(L_fold, Assembler::GE); + + vmov_32(tmp, d0, 0); + mov(crc, 0); + update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3); + vmov_32(tmp, d0, 1); + update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3); + vmov_32(tmp, d1, 0); + update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3); + vmov_32(tmp, d1, 1); + update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3); + + add(len, len, 16); + } + } // if FT_AdvSIMD + + BIND(L_cpu); + subs(len, len, 8); + b(L_by8_loop, Assembler::GE); + adds(len, len, 8); + b(L_by1_loop, Assembler::GT); + b(L_exit); + + BIND(L_by8_loop); + ldr(tmp, Address(post(buf, 4))); + update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3); + ldr(tmp, Address(post(buf, 4))); + update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3); + subs(len, len, 8); + b(L_by8_loop, Assembler::GE); + adds(len, len, 8); + b(L_exit, Assembler::LE); + BIND(L_by1_loop); + subs(len, len, 1); + ldrb(tmp, Address(post(buf, 1))); + update_byte_crc32(crc, tmp, table0); + b(L_by1_loop, Assembler::GT); + + BIND(L_exit); + if (!is_crc32c) + inv(crc, crc); +} + +/** + * First round Key (cpu implementation) + * @param in register containing address of input data (plain or cipher text) + * 
@param key register containing address of the key data + * @param t0 output register t0 + * @param t1 output register t1 + * @param t2 output register t2 + * @param t3 output register t3 + * @param t4 temporary register + * @param t5 temporary register + * @param t6 temporary register + * @param t7 temporary register + */ +void MacroAssembler::kernel_aescrypt_firstRound(Register in, Register key, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6, Register t7) { + + ldr(t4, Address(post(key, 4))); + ldr(t5, Address(post(key, 4))); + ldr(t6, Address(post(key, 4))); + ldr(t7, Address(post(key, 4))); + ldr(t0, Address(post(in, 4))); + ldr(t1, Address(post(in, 4))); + ldr(t2, Address(post(in, 4))); + ldr(t3, Address(post(in, 4))); + rev(t0, t0); + rev(t1, t1); + rev(t2, t2); + rev(t3, t3); + eor(t0, t0, t4); + eor(t1, t1, t5); + eor(t2, t2, t6); + eor(t3, t3, t7); +} + +/** + * AES ECB Round + * @param table_te Register contains address of AES replacement table + * @param key register containing address of the key data + * @param t0 Register for input value t0 + * @param t1 Register for input value t1 + * @param t2 Register for input value t2 + * @param t3 Register for input value t3 + * @param a Register for output value + * @param tmp1 Temporary register 1 + * @param tmp2 Temporary register 2 + */ +void MacroAssembler::kernel_aescrypt_round(Register table_te, Register key, + Register t0, Register t1, Register t2, Register t3, + Register a, Register tmp1, Register tmp2) { + + ldr(a, Address(post(key, 4))); // K + uxtb(tmp1, t0, ror(24)); + ldr(tmp1, Address(table_te, tmp1, lsl(2))); // T1 + uxtb(tmp2, t1, ror(16)); + eor(a, a, tmp1); + ldr(tmp2, Address(table_te, tmp2, lsl(2))); // T2 + uxtb(tmp1, t2, ror(8)); + eor(a, a, tmp2, ror(8)); + ldr(tmp1, Address(table_te, tmp1, lsl(2))); // T3 + uxtb(tmp2, t3); + eor(a, a, tmp1, ror(16)); + ldr(tmp2, Address(table_te, tmp2, lsl(2))); // T4 + eor(a, a, tmp2, ror(24)); // a0 +}; + +/** + * + * Last AES encryption round ( 4 bytes ) + * @param table_te + * @param key + * @param to + * @param t0 + * @param t1 + * @param t2 + * @param t3 + * @param t4 + * @param t5 + * @param t6 + * @param t7 + * + * int tt = K[keyOffset++]; + * out[outOffset++] = (byte)(S[(t0 >>> 24) ] ^ (tt >>> 24)); + * out[outOffset++] = (byte)(S[(t1 >>> 16) & 0xFF] ^ (tt >>> 16)); + * out[outOffset++] = (byte)(S[(t2 >>> 8) & 0xFF] ^ (tt >>> 8)); + * out[outOffset++] = (byte)(S[(t3 ) & 0xFF] ^ (tt )); + */ +void MacroAssembler::kernel_aescrypt_lastRound( + Register table_te, Register key, Register to, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6, Register t7) { + + ldr(t7, Address(post(key, 4))); // tt + + uxtb(t5, t0, ror(24)); + ldr(t4, Address(table_te, t5, lsl(2))); // S[] + uxtb(t6, t1, ror(16)); + eor(t4, t4, t7, lsr(24)); + ldr(t6, Address(table_te, t6, lsl(2))); // S[] + uxtb(t5, t2, ror(8)); + eor(t6, t6, t7, lsr(16)); + uxtb(t6, t6); + add(t4, t4, t6, lsl(8)); + ldr(t5, Address(table_te, t5, lsl(2))); // S[] + uxtb(t6, t3); + eor(t5, t5, t7, lsr(8)); + uxtb(t5, t5); + add(t4, t4, t5, lsl(16)); + ldr(t6, Address(table_te, t6, lsl(2))); // S[] + eor(t6, t6, t7); + uxtb(t6, t6); + add(t4, t4, t6, lsl(24)); + + str(t4, Address(post(to, 4))); + +} + +/** + * + * Last AES encryption round ( 4 bytes ) + * @param table_te + * @param key + * @param to + * @param t0 + * @param t1 + * @param t2 + * @param t3 + * @param t4 + * @param t5 + * @param t6 + * @param t7 + * + * int tt = 
K[keyOffset++]; + * out[outOffset++] = (byte)(S[(t0 >>> 24) ] ^ (tt >>> 24)); + * out[outOffset++] = (byte)(S[(t1 >>> 16) & 0xFF] ^ (tt >>> 16)); + * out[outOffset++] = (byte)(S[(t2 >>> 8) & 0xFF] ^ (tt >>> 8)); + * out[outOffset++] = (byte)(S[(t3 ) & 0xFF] ^ (tt )); + */ +void MacroAssembler::kernel_aescrypt_lastRound_cbc( + Register table_te, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6) { + + uxtb(t5, t0, ror(24)); + ldr(t4, Address(table_te, t5, lsl(2))); // S[] + uxtb(t6, t1, ror(16)); + ldr(t6, Address(table_te, t6, lsl(2))); // S[] + uxtb(t5, t2, ror(8)); + add(t4, t4, t6, lsl(8)); + ldr(t5, Address(table_te, t5, lsl(2))); // S[] + uxtb(t6, t3); + add(t4, t4, t5, lsl(16)); + ldr(t6, Address(table_te, t6, lsl(2))); // S[] + add(t4, t4, t6, lsl(24)); +} + +/** + * AES ECB encryption + * + * @param from register pointing to source array address + * @param to register pointing to destination array address + * @param key register pointing to key + * @param keylen register containing key len in bytes + */ +void MacroAssembler::kernel_aescrypt_encryptBlock(Register from, Register to, + Register key, Register keylen, Register table_te, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6, Register t7) { + Label L_loop; + lea(table_te, ExternalAddress(StubRoutines::aes_table_te_addr())); + + ldr(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - + arrayOopDesc::base_offset_in_bytes(T_INT))); + + + kernel_aescrypt_firstRound(from, key, + t0, t1, t2, t3, t4, t5, t6, t7); + + sub(keylen, keylen, 8); + BIND(L_loop); + + kernel_aescrypt_round(table_te, key, + t0, t1, t2, t3, t4, t7, from); + kernel_aescrypt_round(table_te, key, + t1, t2, t3, t0, t5, t7, from); + kernel_aescrypt_round(table_te, key, + t2, t3, t0, t1, t6, t7, from); + + uxtb(t7, t3, ror(24)); + ldr(t3, Address(table_te, t7, lsl(2))); // T1 + uxtb(t7, t0, ror(16)); + ldr(t7, Address(table_te, t7, lsl(2))); // T2 + mov(t0, t4); // t0=a0 + eor(t3, t3, t7, ror(8)); + uxtb(t7, t1, ror(8)); + ldr(t7, Address(table_te, t7, lsl(2))); // T3 + mov(t1, t5); // t1=a1 + eor(t3, t3, t7, ror(16)); + uxtb(t7, t2); + ldr(t7, Address(table_te, t7, lsl(2))); // T4 + mov(t2, t6); // t2=a2 + eor(t3, t3, t7, ror(24)); + ldr(t7, Address(post(key, 4))); // K + eor(t3, t3, t7); // t3 = a3 + + subs(keylen, keylen, 4); + b(L_loop, Assembler::NE); + + // last round is special + add(table_te, table_te, 4 * 256); //S + + kernel_aescrypt_lastRound( + table_te, key, to, + t0, t1, t2, t3, + t4, t5, t6, t7); + + kernel_aescrypt_lastRound( + table_te, key, to, + t1, t2, t3, t0, + t4, t5, t6, t7); + + kernel_aescrypt_lastRound( + table_te, key, to, + t2, t3, t0, t1, + t4, t5, t6, t7); + + kernel_aescrypt_lastRound( + table_te, key, to, + t3, t0, t1, t2, + t4, t5, t6, t7); +} + +/** + * AES ECB decryption + * @param from register pointing to source array address + * @param to register pointing to destination array address + * @param key register pointing to key + * @param keylen register containing key len in bytes + */ +void MacroAssembler::kernel_aescrypt_decryptBlock(Register from, Register to, + Register key, Register keylen, Register table_te, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6, Register t7) { + Label L_loop; + lea(table_te, ExternalAddress(StubRoutines::aes_table_td_addr())); + + ldr(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - + arrayOopDesc::base_offset_in_bytes(T_INT))); + + push(key, 
sp); + + add(key, key, 16); + kernel_aescrypt_firstRound(from, key, + t0, t1, t2, t3, t4, t5, t6, t7); + + sub(keylen, keylen, 8); + BIND(L_loop); + + kernel_aescrypt_round(table_te, key, + t0, t3, t2, t1, t4, t7, from); + kernel_aescrypt_round(table_te, key, + t1, t0, t3, t2, t5, t7, from); + kernel_aescrypt_round(table_te, key, + t2, t1, t0, t3, t6, t7, from); + + uxtb(t7, t3, ror(24)); + ldr(t3, Address(table_te, t7, lsl(2))); // T1 + uxtb(t7, t2, ror(16)); + ldr(t7, Address(table_te, t7, lsl(2))); // T2 + mov(t2, t6); // t2=a2 + eor(t3, t3, t7, ror(8)); + uxtb(t7, t1, ror(8)); + ldr(t7, Address(table_te, t7, lsl(2))); // T3 + mov(t1, t5); // t1=a1 + eor(t3, t3, t7, ror(16)); + uxtb(t7, t0); + ldr(t7, Address(table_te, t7, lsl(2))); // T4 + mov(t0, t4); // t0=a0 + eor(t3, t3, t7, ror(24)); + ldr(t7, Address(post(key, 4))); // K + eor(t3, t3, t7); // t3 = a3 + + subs(keylen, keylen, 4); + b(L_loop, Assembler::NE); + + pop(key, sp); + // last round is special + add(table_te, table_te, 4 * 256); //S + + kernel_aescrypt_lastRound( + table_te, key, to, + t0, t3, t2, t1, + t4, t5, t6, t7); + + kernel_aescrypt_lastRound( + table_te, key, to, + t1, t0, t3, t2, + t4, t5, t6, t7); + + kernel_aescrypt_lastRound( + table_te, key, to, + t2, t1, t0, t3, + t4, t5, t6, t7); + + kernel_aescrypt_lastRound( + table_te, key, to, + t3, t2, t1, t0, + t4, t5, t6, t7); +} + +/** + * AES CBC encryption + * + * @param from register pointing to source array address + * @param to register pointing to destination array address + * @param key register pointing to key + * @param rvec register pointing to roundkey vector + * @param len register containing source len in bytes + */ +void MacroAssembler::kernel_aescrypt_encrypt(Register from, Register to, + Register key, Register rvec, Register len, Register keylen, Register table_te, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6) { + Label L_loop, L_loop2; + lea(table_te, ExternalAddress(StubRoutines::aes_table_te_addr())); + ldr(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - + arrayOopDesc::base_offset_in_bytes(T_INT))); + + vld1_64(d4, d5, Address(rvec), Assembler::ALIGN_STD); // read rvec bytes to q2 + vld1_64(d2, d3, Address(post(key, 16)), Assembler::ALIGN_STD); // read key to q1 + sub(keylen, keylen, 8); + + add(t4, key, keylen, lsl(2)); + vld1_64(d8, d9, Address(t4), Assembler::ALIGN_STD); // read last key bytes to q4 + vrev32_128_8(q4, q4); + + push(to, sp); + BIND(L_loop2); + // get round key and first round + vld1_64(d0, d1, Address(post(from, 16)), Assembler::ALIGN_STD); // read 16 bytes to q0 + veor_128(q0, q0, q2); + vrev32_128_8(q0, q0); + veor_128(q0, q0, q1); + vmov_f64(t0, t1, d0); + vmov_f64(t2, t3, d1); + + push(RegSet::of(key, from), sp); + push(RegSet::of(to, keylen), sp); + + BIND(L_loop); + + kernel_aescrypt_round(table_te, key, + t0, t1, t2, t3, t4, to, from); + kernel_aescrypt_round(table_te, key, + t1, t2, t3, t0, t5, to, from); + kernel_aescrypt_round(table_te, key, + t2, t3, t0, t1, t6, to, from); + + uxtb(to, t3, ror(24)); + ldr(t3, Address(table_te, to, lsl(2))); // T1 + uxtb(to, t0, ror(16)); + ldr(to, Address(table_te, to, lsl(2))); // T2 + mov(t0, t4); // t0=a0 + eor(t3, t3, to, ror(8)); + uxtb(to, t1, ror(8)); + ldr(to, Address(table_te, to, lsl(2))); // T3 + mov(t1, t5); // t1=a1 + eor(t3, t3, to, ror(16)); + uxtb(to, t2); + ldr(to, Address(table_te, to, lsl(2))); // T4 + mov(t2, t6); // t2=a2 + eor(t3, t3, to, ror(24)); + ldr(to, Address(post(key, 4))); // K + eor(t3, t3, 
to); // t3 = a3 + + subs(keylen, keylen, 4); + b(L_loop, Assembler::NE); + + // last round is special + add(table_te, table_te, 4 * 256); //S + kernel_aescrypt_lastRound_cbc( + table_te, + t0, t1, t2, t3, + t4, t5, t6); + + kernel_aescrypt_lastRound_cbc( + table_te, + t1, t2, t3, t0, + t5, t6, from); + vmov_f64(d6, t4, t5); + + kernel_aescrypt_lastRound_cbc( + table_te, + t2, t3, t0, t1, + t4, t5, t6); + + kernel_aescrypt_lastRound_cbc( + table_te, + t3, t0, t1, t2, + t5, t6, from); + vmov_f64(d7, t4, t5); + veor_128(q2, q4, q3); + + pop(RegSet::of(to, keylen), sp); + sub(table_te, table_te, 4 * 256); //Te + vst1_64(d4, Address(post(to, 8)), Assembler::ALIGN_STD); + pop(RegSet::of(key, from), sp); + vst1_64(d5, Address(post(to, 8)), Assembler::ALIGN_STD); + + subs(len, len, 16); + b(L_loop2, Assembler::NE); + vstr_f64(d4, Address(rvec)); + vstr_f64(d5, Address(rvec, 8)); + mov(r0, to); + pop(to, sp); + sub(r0, r0, to); +}; + +/** + * AES CBC decryption + * + * @param from register pointing to source array address + * @param to register pointing to destination array address + * @param key register pointing to key + * @param rvec register pointing to roundkey vector + * @param len register containing source len in bytes + */ +void MacroAssembler::kernel_aescrypt_decrypt(Register from, Register to, + Register key, Register rvec, Register len, Register keylen, Register table_te, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6) { + Label L_loop, L_loop2; + lea(table_te, ExternalAddress(StubRoutines::aes_table_td_addr())); + + ldr(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - + arrayOopDesc::base_offset_in_bytes(T_INT))); + + vld1_64(d2, d3, Address(post(key, 16)), Assembler::ALIGN_STD); // read key to q1 + vld1_64(d4, d5, Address(rvec), Assembler::ALIGN_STD); // read rvec bytes to q2 + vld1_64(d10, d11, Address(post(key, 16)), Assembler::ALIGN_STD); // read key to q5 + vrev32_128_8(q1, q1); + sub(keylen, keylen, 8); + + push(to, sp); + BIND(L_loop2); + // get round key and first round + vld1_64(d8, d9, Address(post(from, 16)), Assembler::ALIGN_STD); // read 16 bytes to q4 + + push(RegSet::of(to, key, from, keylen), sp); + vrev32_128_8(q0, q4); + veor_128(q0, q0, q5); + vmov_f64(t0, t1, d0); + vmov_f64(t2, t3, d1); + + BIND(L_loop); + + kernel_aescrypt_round(table_te, key, + t0, t3, t2, t1, t4, to, from); + kernel_aescrypt_round(table_te, key, + t1, t0, t3, t2, t5, to, from); + kernel_aescrypt_round(table_te, key, + t2, t1, t0, t3, t6, to, from); + + uxtb(to, t3, ror(24)); + ldr(t3, Address(table_te, to, lsl(2))); // T1 + uxtb(to, t2, ror(16)); + ldr(to, Address(table_te, to, lsl(2))); // T2 + mov(t2, t6); // t2=a2 + eor(t3, t3, to, ror(8)); + uxtb(to, t1, ror(8)); + ldr(to, Address(table_te, to, lsl(2))); // T3 + mov(t1, t5); // t1=a1 + eor(t3, t3, to, ror(16)); + uxtb(to, t0); + ldr(to, Address(table_te, to, lsl(2))); // T4 + mov(t0, t4); // t0=a0 + eor(t3, t3, to, ror(24)); + ldr(to, Address(post(key, 4))); // K + eor(t3, t3, to); // t3 = a3 + + subs(keylen, keylen, 4); + b(L_loop, Assembler::NE); + + // last round is special + add(table_te, table_te, 4 * 256); //S + + kernel_aescrypt_lastRound_cbc( + table_te, + t0, t3, t2, t1, + t4, t5, t6); + + kernel_aescrypt_lastRound_cbc( + table_te, + t1, t0, t3, t2, + t5, t6, to); + vmov_f64(d6, t4, t5); //q3 + + kernel_aescrypt_lastRound_cbc( + table_te, + t2, t1, t0, t3, + t4, t5, t6); + + kernel_aescrypt_lastRound_cbc( + table_te, + t3, t2, t1, t0, + t5, t6, to); + vmov_f64(d7, t4, t5); 
//q3 + pop(RegSet::of(to, key, from, keylen), sp); + veor_128(q3, q1, q3); + veor_128(q3, q3, q2); + vshl_128_64(q2, q4, 0); + + sub(table_te, table_te, 4 * 256); //Te + + vst1_64(d6, Address(post(to, 8)), Assembler::ALIGN_STD); + subs(len, len, 16); + vst1_64(d7, Address(post(to, 8)), Assembler::ALIGN_STD); + + b(L_loop2, Assembler::NE); + + vstr_f64(d4, Address(rvec)); + vstr_f64(d5, Address(rvec, 8)); + mov(r0, to); + pop(to, sp); + sub(r0, r0, to); +}; + +/* + * First round of SHA1 algorithm + */ +void MacroAssembler::sha_round1(Register st_b, Register st_c, Register st_d, + Register tmp, Register st_f, int sh) { + if (sh) { + eor(st_f, st_d, st_c, ror(32-sh)); + } else { + eor(st_f, st_d, st_c); + } + andr(st_f, st_f, st_b); + eor(st_f, st_f, st_d); +} + +/* + * Second and fourth rounds of SHA1 algorithm + */ +void MacroAssembler::sha_round2(Register st_b, Register st_c, Register st_d, + Register tmp, Register st_f, int sh) { + if (sh) { + eor(st_f, st_b, st_c, ror(32-sh)); + } else { + eor(st_f, st_b, st_c); + } + eor(st_f, st_f, st_d); +} + +/* + * Third round of SHA1 algorithm + */ +void MacroAssembler::sha_round3(Register st_b, Register st_c, Register st_d, + Register tmp, Register st_f, int sh) { + if (sh) { + andr(st_f, st_b, st_c, ror(32-sh)); + orr(tmp, st_b, st_c, ror(32-sh)); + } else { + andr(st_f, st_b, st_c); + orr(tmp, st_b, st_c); + } + andr(tmp, st_d, tmp); + orr(st_f, st_f, tmp); +} + +/* + * Calculate Deltas w[i] and w[i+1] + * w[i] = (w[i-3] xor w[i-8] xor w[i-14] xor w[i-16]) rotl 1 + */ +void MacroAssembler::sha_w0(FloatRegister w16, FloatRegister w14, + FloatRegister w8, FloatRegister w4, FloatRegister w2, + FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4, + FloatRegister st_k, FloatRegister st_kw, bool update) { + vadd_64_32(st_kw, st_k, w16); + if(update) { + veor_64(tmp1, w16, w14); + vext_64(tmp2, w2, w4, 4); + veor_64(tmp3, tmp1, w8); + veor_64(tmp4, tmp3, tmp2); + + vshr_64_u32(tmp1, tmp4, 31); + vshl_64_32(tmp2, tmp4, 1); + vorr_64(w16, tmp1, tmp2); + } +} +/* + * Calculate Deltas w[i] and w[i+1] + */ +void MacroAssembler::sha_w(FloatRegister w16, FloatRegister w14, + FloatRegister w12, FloatRegister w10, FloatRegister w8, + FloatRegister w6, FloatRegister w4, FloatRegister w2, + FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4, + FloatRegister st_k, FloatRegister st_kw, Register counter, Register rtmp, + bool update) { + Label L_7, L_6, L_5, L_4, L_3, L_2, L_1, L_done; + andr(rtmp, counter, 0x7); + add(counter, counter, 1); + cmp(rtmp, 7); + b(L_7, Assembler::EQ); + cmp(rtmp, 6); + b(L_6, Assembler::EQ); + cmp(rtmp, 5); + b(L_5, Assembler::EQ); + cmp(rtmp, 4); + b(L_4, Assembler::EQ); + cmp(rtmp, 3); + b(L_3, Assembler::EQ); + cmp(rtmp, 2); + b(L_2, Assembler::EQ); + cmp(rtmp, 1); + b(L_1, Assembler::EQ); + sha_w0(w16, w14, w8, w4, w2, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update); + b(L_done); + BIND(L_1); { + sha_w0(w14, w12, w6, w2, w16, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update); + b(L_done); + } + BIND(L_2); { + sha_w0(w12, w10, w4, w16, w14, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update); + b(L_done); + } + BIND(L_3); { + sha_w0(w10, w8, w2, w14, w12, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update); + b(L_done); + } + BIND(L_4); { + sha_w0(w8, w6, w16, w12, w10, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update); + b(L_done); + } + BIND(L_5); { + sha_w0(w6, w4, w14, w10, w8, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update); + b(L_done); + } + BIND(L_6); { + sha_w0(w4, w2, w12, w8, w6, tmp1, tmp2, tmp3, 
tmp4, st_k, st_kw, update); + b(L_done); + } + BIND(L_7); { + sha_w0(w2, w16, w10, w6, w4, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update); + } + BIND(L_done); +} + +/** + * SHA1 digest + * + * @param from register pointing to source array address + * @param state register pointing to state array address + */ +void MacroAssembler::kernel_sha_implCompress(Register from, Register state, + Register counter, Register table_k, + Register st_a, Register st_b, + Register st_c, Register st_d, Register st_e, + Register tmp, Register counter2, Register st_new_a, Register st_w) { + Label L_round_1, L_round_2, L_round_3, L_round_4, L_round_4_cont, L_hash_no_w; + + FloatRegister w16 = d0; //q0-q7 + FloatRegister w14 = w16->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w12 = w14->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w10 = w12->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w8 = w10->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w6 = w8->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w4 = w6->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w2 = w4->successor(FloatRegisterImpl::DOUBLE); + FloatRegister wtmp1 = w2->successor(FloatRegisterImpl::DOUBLE); + FloatRegister wtmp2 = wtmp1->successor(FloatRegisterImpl::DOUBLE); + FloatRegister wtmp3 = wtmp2->successor(FloatRegisterImpl::DOUBLE); + FloatRegister wtmp4 = wtmp3->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_k1 = wtmp4->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_k2 = st_k1->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_k = st_k2->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_kw = st_k->successor(FloatRegisterImpl::DOUBLE); + + + assert_different_registers(st_a,st_b,st_c,st_d,st_e,tmp,counter2, st_new_a, st_w); + assert_different_registers(w2,w4,w6,w8,w10,w12,w14,w16); + + lea(table_k, ExternalAddress(StubRoutines::sha1_table_addr())); + + // read initial 16 W elements + vld1_64(w16, w14, w12, w10, Address(post(from, 32)), Assembler::ALIGN_STD); + vld1_64(w8, w6, w4, w2, Address(from), Assembler::ALIGN_STD); + + // revert W + vrev64_128_8(w16, w16); + vrev64_128_8(w12, w12); + vrev64_128_8(w8, w8); + vrev64_128_8(w4, w4); + // load state + ldr(st_a, Address(post(state, 4))); + ldr(st_b, Address(post(state, 4))); + ldr(st_c, Address(post(state, 4))); + ldr(st_d, Address(post(state, 4))); + ldr(st_e, Address(state)); + sub(state, state, 16); + + mov(counter2, 0); + mov(counter, 10); + // first round + vld1_64(st_k1, st_k2, Address(table_k), Assembler::ALIGN_128); + vdup_64_32(st_k, st_k1, 0); + + BIND(L_round_1); { + sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp); + + sha_round1(st_b, st_c, st_d, tmp, st_new_a, 0); + vmov_32(st_w, st_kw, 1); + add(st_new_a, st_new_a, st_a, ror(32-5)); + add(st_new_a, st_new_a, st_e); + add(st_new_a, st_new_a, st_w); + + vmov_32(st_w, st_kw, 0); + sha_round1(st_a, st_b, st_c, tmp, st_e, 30); + + add(tmp, st_e, st_new_a, ror(32-5)); + add(tmp, tmp, st_d); + + mov(st_e, st_c); + mov(st_d, st_b, ror(32-30)); + mov(st_c, st_a, ror(32-30)); + mov(st_b, st_new_a); + add(st_a, tmp, st_w); + + sub(counter, counter, 1); + }cbnz(counter, L_round_1); + + mov(counter, 10); + // second round + vdup_64_32(st_k, st_k1, 1); + + BIND(L_round_2); { + sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp); + + sha_round2(st_b, st_c, st_d, tmp, st_new_a, 0); + vmov_32(st_w, st_kw, 1); + add(st_new_a, st_new_a, st_a, ror(32-5)); + add(st_new_a, st_new_a, 
st_e); + add(st_new_a, st_new_a, st_w); + + vmov_32(st_w, st_kw, 0); + sha_round2(st_a, st_b, st_c, tmp, st_e, 30); + + add(tmp, st_e, st_new_a, ror(32-5)); + add(tmp, tmp, st_d); + + mov(st_e, st_c); + mov(st_d, st_b, ror(32-30)); + mov(st_c, st_a, ror(32-30)); + mov(st_b, st_new_a); + add(st_a, tmp, st_w); + + sub(counter, counter, 1); + }cbnz(counter, L_round_2); + + mov(counter, 10); + vdup_64_32(st_k, st_k2, 0); + // third round + + BIND(L_round_3); { + sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp); + + sha_round3(st_b, st_c, st_d, tmp, st_new_a, 0); + vmov_32(st_w, st_kw, 1); + add(st_new_a, st_new_a, st_a, ror(32-5)); + add(st_new_a, st_new_a, st_e); + add(st_new_a, st_new_a, st_w); + + vmov_32(st_w, st_kw, 0); + sha_round3(st_a, st_b, st_c, tmp, st_e, 30); + + add(tmp, st_e, st_new_a, ror(32-5)); + add(tmp, tmp, st_d); + + mov(st_e, st_c); + mov(st_d, st_b, ror(32-30)); + mov(st_c, st_a, ror(32-30)); + mov(st_b, st_new_a); + add(st_a, tmp, st_w); + + sub(counter, counter, 1); + }cbnz(counter, L_round_3); + + mov(counter, 10); + // fourth round + vdup_64_32(st_k, st_k2, 1); + + BIND(L_round_4); { + sub(counter, counter, 1); + cmp(counter, 8); + b(L_hash_no_w, Assembler::LO); + sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp); + b(L_round_4_cont); + BIND(L_hash_no_w); + sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp, false); + BIND(L_round_4_cont); + + sha_round2(st_b, st_c, st_d, tmp, st_new_a, 0); + vmov_32(st_w, st_kw, 1); + add(st_new_a, st_new_a, st_a, ror(32-5)); + add(st_new_a, st_new_a, st_e); + add(st_new_a, st_new_a, st_w); + + vmov_32(st_w, st_kw, 0); + sha_round2(st_a, st_b, st_c, tmp, st_e, 30); + + add(tmp, st_e, st_new_a, ror(32-5)); + add(tmp, tmp, st_d); + + mov(st_e, st_c); + mov(st_d, st_b, ror(32-30)); + mov(st_c, st_a, ror(32-30)); + mov(st_b, st_new_a); + add(st_a, tmp, st_w); + + }cbnz(counter, L_round_4); + + // load state + ldr(tmp, Address(post(state, 4))); + add(st_a, st_a, tmp); + ldr(tmp, Address(post(state, 4))); + add(st_b, st_b, tmp); + ldr(tmp, Address(post(state, 4))); + add(st_c, st_c, tmp); + ldr(tmp, Address(post(state, 4))); + add(st_d, st_d, tmp); + ldr(tmp, Address(state)); + add(st_e, st_e, tmp); + sub(state, state, 16); + + // save state + str(st_a, Address(post(state, 4))); + str(st_b, Address(post(state, 4))); + str(st_c, Address(post(state, 4))); + str(st_d, Address(post(state, 4))); + str(st_e, Address(state)); +} +/** + * One iteration of SHA256 algorithm + * Σ0 := (a rotr 2) xor (a rotr 13) xor (a rotr 22) + * Ma := (a and b) xor (a and c) xor (b and c) + * t2 := Σ0 + Ma + * Σ1 := (e rotr 6) xor (e rotr 11) xor (e rotr 25) + * Ch := (e and f) xor ((not e) and g) + * t1 := h + Σ1 + Ch + k[i] + w[i] + * h := g + * g := f + * f := e + * e := d + t1 + * d := c + * c := b + * b := a + * a := t1 + t2 + */ +void MacroAssembler::sha256_implCompress_iter0( + Register Da, Register Db, Register Dc, Register Dd, + Register De, Register Df, Register Dg, Register Dh, + FloatRegister Dkw, int index, + Register Dtmp, + Register Dnew_a, Register Dnew_e + ) { + assert_different_registers(Da, Db, Dc, Dd, De, Df, Dg, Dh); + + // Σ0 := (a rotr 2) xor (a rotr 13) xor (a rotr 22) + // Σ1 := (e rotr 6) xor (e rotr 11) xor (e rotr 25) + andr(Dnew_a, Da, Db); + andr(Dnew_e, Da, Dc); + eor(Dnew_a, Dnew_a, Dnew_e); + andr(Dnew_e, Db, Dc); + eor(Dnew_e, Dnew_a, Dnew_e); //Ma + + mov(Dnew_a, Da, ror(2)); + eor(Dnew_a, 
Dnew_a, Da, ror(13)); + eor(Dnew_a, Dnew_a, Da, ror(22)); //Σ0 + + add(Dnew_a, Dnew_a, Dnew_e); //t2 + + andr(Dnew_e, De, Df); + mvn(Dtmp, De); + andr(Dtmp, Dtmp, Dg); + eor(Dtmp, Dnew_e, Dtmp); //Ch + + mov(Dnew_e, De, ror(6)); + eor(Dnew_e, Dnew_e, De, ror(11)); + eor(Dnew_e, Dnew_e, De, ror(25)); //Σ1 + + add(Dnew_e, Dnew_e, Dtmp); + vmov_32(Dtmp, Dkw, index); + add(Dnew_e, Dnew_e, Dh); + + add(Dtmp, Dnew_e, Dtmp); //t1 + + add(Dnew_e, Dtmp, Dd); //new_e + add(Dnew_a, Dtmp, Dnew_a); //new_a +}; +/** + * Four iterations of SHA256 algorithm + */ +void MacroAssembler::sha256_implCompress_iter( + Register ra, Register rb, Register rc, Register rd, + Register re, Register rf, Register rg, Register rh, + FloatRegister Dkw1, FloatRegister Dkw2, + Register step, + Register tmp, + Register ra2, Register re2 + ) { + Label L_4, L_3, L_2, L_1, L_done; + cmp(step, 4); + b(L_4, Assembler::EQ); + cmp(step, 3); + b(L_3, Assembler::EQ); + cmp(step, 2); + b(L_2, Assembler::EQ); + cmp(step, 1); + b(L_1, Assembler::EQ); + sha256_implCompress_iter0(ra, rb, rc, rd, re, rf, rg, rh, Dkw1, 0, tmp, ra2, re2); + sha256_implCompress_iter0(ra2, ra, rb, rc, re2, re, rf, rg, Dkw1, 1, tmp, rd, rh); + sha256_implCompress_iter0(rd, ra2, ra, rb, rh, re2, re, rf, Dkw2, 0, tmp, rc, rg); + sha256_implCompress_iter0(rc, rd, ra2, ra, rg, rh, re2, re, Dkw2, 1, tmp, rb, rf); + mov(step, 4); + b(L_done); + BIND(L_1); { + sha256_implCompress_iter0(ra2, ra, rb, rc, re2, re, rf, rg, Dkw1, 0, tmp, rd, rh); + sha256_implCompress_iter0(rd, ra2, ra, rb, rh, re2, re, rf, Dkw1, 1, tmp, rc, rg); + sha256_implCompress_iter0(rc, rd, ra2, ra, rg, rh, re2, re, Dkw2, 0, tmp, rb, rf); + sha256_implCompress_iter0(rb, rc, rd, ra2, rf, rg, rh, re2, Dkw2, 1, tmp, ra, re); + mov(step, 0); + b(L_done); + } + BIND(L_2); { + sha256_implCompress_iter0(rd, ra2, ra, rb, rh, re2, re, rf, Dkw1, 0, tmp, rc, rg); + sha256_implCompress_iter0(rc, rd, ra2, ra, rg, rh, re2, re, Dkw1, 1, tmp, rb, rf); + sha256_implCompress_iter0(rb, rc, rd, ra2, rf, rg, rh, re2, Dkw2, 0, tmp, ra, re); + sha256_implCompress_iter0(ra, rb, rc, rd, re, rf, rg, rh, Dkw2, 1, tmp, ra2, re2); + mov(step, 1); + b(L_done); + } + BIND(L_3); { + sha256_implCompress_iter0(rc, rd, ra2, ra, rg, rh, re2, re, Dkw1, 0, tmp, rb, rf); + sha256_implCompress_iter0(rb, rc, rd, ra2, rf, rg, rh, re2, Dkw1, 1, tmp, ra, re); + sha256_implCompress_iter0(ra, rb, rc, rd, re, rf, rg, rh, Dkw2, 0, tmp, ra2, re2); + sha256_implCompress_iter0(ra2, ra, rb, rc, re2, re, rf, rg, Dkw2, 1, tmp, rd, rh); + mov(step, 2); + b(L_done); + } + BIND(L_4); { + sha256_implCompress_iter0(rb, rc, rd, ra2, rf, rg, rh, re2, Dkw1, 0, tmp, ra, re); + sha256_implCompress_iter0(ra, rb, rc, rd, re, rf, rg, rh, Dkw1, 1, tmp, ra2, re2); + sha256_implCompress_iter0(ra2, ra, rb, rc, re2, re, rf, rg, Dkw2, 0, tmp, rd, rh); + sha256_implCompress_iter0(rd, ra2, ra, rb, rh, re2, re, rf, Dkw2, 1, tmp, rc, rg); + mov(step, 3); + } + BIND(L_done); +}; + + /* + * Calculate Deltas w[i] and w[i+1] + * s0 := (w[i-15] rotr 7) xor (w[i-15] rotr 18) xor (w[i-15] shr 3) + * s1 := (w[i-2] rotr 17) xor (w[i-2] rotr 19) xor (w[i-2] shr 10) + * w[i] := w[i-16] + s0 + w[i-7] + s1 + */ +void MacroAssembler::sha256_w0( + FloatRegister w_m16, FloatRegister w_m15, FloatRegister w_m14, + FloatRegister w_m7, FloatRegister w_m6, + FloatRegister w_m2, + FloatRegister Qtmp_S0, FloatRegister Qtmp_S1, + FloatRegister Qtmp1){ + + vmov_64(Qtmp1, w_m15); + vmov_64(Qtmp1->successor(FloatRegisterImpl::DOUBLE), w_m14); + vshr_128_u64(Qtmp_S0, Qtmp1, 7); + vshr_128_u64(Qtmp_S1, 
Qtmp1, 18); + veor_128(Qtmp_S0, Qtmp_S0, Qtmp_S1); + vshr_128_u64(Qtmp_S1, Qtmp1, 35); + veor_128(Qtmp_S0, Qtmp_S0, Qtmp_S1); //S0 + + vshr_128_u64(Qtmp_S1, w_m2, 17); + vshr_128_u64(Qtmp1, w_m2, 19); + veor_128(Qtmp_S1, Qtmp_S1, Qtmp1); + vshr_128_u64(Qtmp1, w_m2, 42); + veor_128(Qtmp_S1, Qtmp_S1, Qtmp1); //S1 + + vmov_64(Qtmp1, w_m7); + vmov_64(Qtmp1->successor(FloatRegisterImpl::DOUBLE), w_m6); + vadd_128_32(Qtmp1, Qtmp1, w_m16); + vadd_128_32(Qtmp1, Qtmp1, Qtmp_S0); + vadd_128_32(w_m16, Qtmp1, Qtmp_S1); // w[i/i+1] + + vdup_64_32(w_m16, w_m16, 0); + vdup_64_32(w_m15, w_m15, 0); +} + +/* + * Calculate Deltas w[i] ... w[i+3] + */ +void MacroAssembler::sha256_w(FloatRegister w16, FloatRegister w14, + FloatRegister w12, FloatRegister w10, FloatRegister w8, + FloatRegister w6, FloatRegister w4, FloatRegister w2, + FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, + FloatRegister st_kw, Register counter, Register rtmp) { + FloatRegister w15 = w16->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w13 = w14->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w11 = w12->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w9 = w10->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w7 = w8->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w5 = w6->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w3 = w4->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w1 = w2->successor(FloatRegisterImpl::DOUBLE); + + FloatRegister Dtmp1 = as_FloatRegister(tmp1->encoding()); + FloatRegister Dtmp2 = Dtmp1->successor(FloatRegisterImpl::DOUBLE); + Label L_3, L_2, L_1, L_done; + + andr(rtmp, counter, 0x3); + cmp(rtmp, 3); + b(L_3, Assembler::EQ); + cmp(rtmp, 2); + b(L_2, Assembler::EQ); + cmp(rtmp, 1); + b(L_1, Assembler::EQ); + vext_64(Dtmp1, w16, w15, 4); + vext_64(Dtmp2, w14, w13, 4); + vadd_128_32(st_kw, st_kw, tmp1); + cmp(counter, 3); + b(L_done, Assembler::LO); + sha256_w0(w16, w15, w14, w7, w6, w2, tmp1, tmp2, tmp3); + sha256_w0(w14, w13, w12, w5, w4, w16, tmp1, tmp2, tmp3); + b(L_done); + BIND(L_3); { + vext_64(Dtmp1, w12, w11, 4); + vext_64(Dtmp2, w10, w9, 4); + vadd_128_32(st_kw, st_kw, tmp1); + cmp(counter, 3); + b(L_done, Assembler::LO); + sha256_w0(w12, w11, w10, w3, w2, w14, tmp1, tmp2, tmp3); + sha256_w0(w10, w9, w8, w1, w16, w12, tmp1, tmp2, tmp3); + b(L_done); + } + BIND(L_2); { + vext_64(Dtmp1, w8, w7, 4); + vext_64(Dtmp2, w6, w5, 4); + vadd_128_32(st_kw, st_kw, tmp1); + cmp(counter, 3); + b(L_done, Assembler::LO); + sha256_w0(w8, w7, w6, w15, w14, w10, tmp1, tmp2, tmp3); + sha256_w0(w6, w5, w4, w13, w12, w8, tmp1, tmp2, tmp3); + b(L_done); + } + BIND(L_1); { + vext_64(Dtmp1, w4, w3, 4); + vext_64(Dtmp2, w2, w1, 4); + vadd_128_32(st_kw, st_kw, tmp1); + cmp(counter, 3); + b(L_done, Assembler::LO); + sha256_w0(w4, w3, w2, w11, w10, w6, tmp1, tmp2, tmp3); + sha256_w0(w2, w1, w16, w9, w8, w4, tmp1, tmp2, tmp3); + } + BIND(L_done); +} + +/** + * SHA256 digest + * + * @param from register pointing to source array address + * @param state register pointing to state array address + */ +void MacroAssembler::kernel_sha256_implCompress(Register from, Register state, + Register counter, Register table_k, + Register ra, Register rb, Register rc, Register rd, Register re, + Register rf, Register rg, Register rh, + Register ra2, Register re2) { + + Label L_hash_loop, L_hash_loop_done, L_hash_no_w; + lea(table_k, ExternalAddress(StubRoutines::sha256_table_addr())); + + // read next k + vld1_64(d14, d15, Address(post(table_k, 16)), Assembler::ALIGN_128); + // read initial 16 
W elements in q8-q11 + vld1_64(d16, d17, d18, d19, Address(post(from, 32)), Assembler::ALIGN_STD); // read from + vld1_64(d20, d21, d22, d23, Address(post(from, 32)), Assembler::ALIGN_STD); // read from + // revert W + vrev32_128_8(q8, q8); + vrev32_128_8(q9, q9); + vrev32_128_8(q10, q10); + vrev32_128_8(q11, q11); + + vadd_128_32(q7, q7, q8); // k + w + + vdup_64_32(d31, d23, 1); //w1 + vdup_64_32(d30, d23, 0); //w2 + vdup_64_32(d29, d22, 1); //w3 + vdup_64_32(d28, d22, 0); //w4 + vdup_64_32(d27, d21, 1); //w5 + vdup_64_32(d26, d21, 0); //w6 + vdup_64_32(d25, d20, 1); //w7 + vdup_64_32(d24, d20, 0); //w8 + vdup_64_32(d23, d19, 1); //w9 + vdup_64_32(d22, d19, 0); //w10 + vdup_64_32(d21, d18, 1); //w11 + vdup_64_32(d20, d18, 0); //w12 + vdup_64_32(d19, d17, 1); //w13 + vdup_64_32(d18, d17, 0); //w14 + vdup_64_32(d17, d16, 1); //w15 + vdup_64_32(d16, d16, 0); //w16 + + mov(counter, 16); + // load state + push(state, sp); + ldr(ra, Address(post(state, 4))); + ldr(rb, Address(post(state, 4))); + ldr(rc, Address(post(state, 4))); + ldr(rd, Address(post(state, 4))); + ldr(re, Address(post(state, 4))); + ldr(rf, Address(post(state, 4))); + ldr(rg, Address(post(state, 4))); + ldr(rh, Address(state)); + + const Register tmp = from; + const Register step = state; + + // calculate deltas + sha256_w0(d16, d17, d18, d25, d26, d30, q0, q1, q2); + sha256_w0(d18, d19, d20, d27, d28, d16, q0, q1, q2); + + mov(step, 0); // use state for internal counter + sub(counter, counter, 1); + + sha256_implCompress_iter(ra, rb, rc, rd, re, rf, rg, rh, d14, d15, + step, + tmp, ra2, re2); + + BIND(L_hash_loop); { + // read next k + vld1_64(d14, d15, Address(post(table_k, 16)), Assembler::ALIGN_128); + //calculate deltas + sha256_w(q8, q9, q10, q11, q12, q13, q14, q15, + q0, q1, q2, + q7, + counter, tmp); + + //calculate state + sha256_implCompress_iter(ra, rb, rc, rd, re, rf, rg, rh, d14, d15, + step, + tmp, ra2, re2); + sub(counter, counter, 1); + } cbnz(counter, L_hash_loop); + + pop(state, sp); + + // load initial state and add to current state + ldr(tmp, Address(post(state, 4))); + add(rb, rb, tmp); + ldr(tmp, Address(post(state, 4))); + add(rc, rc, tmp); + ldr(tmp, Address(post(state, 4))); + add(rd, rd, tmp); + ldr(tmp, Address(post(state, 4))); + add(ra2, ra2, tmp); + ldr(tmp, Address(post(state, 4))); + add(rf, rf, tmp); + ldr(tmp, Address(post(state, 4))); + add(rg, rg, tmp); + ldr(tmp, Address(post(state, 4))); + add(rh, rh, tmp); + ldr(tmp, Address(state)); + add(re2, re2, tmp); + sub(state, state, 28); + + // save state + str(rb, Address(post(state, 4))); + str(rc, Address(post(state, 4))); + str(rd, Address(post(state, 4))); + str(ra2, Address(post(state, 4))); + str(rf, Address(post(state, 4))); + str(rg, Address(post(state, 4))); + str(rh, Address(post(state, 4))); + str(re2, Address(post(state, 4))); +} + +/** + * SHA512 Sigma + * Sigma(x) = ROTR(x, sh1) XOR ROTR(x, sh2) XOR ROTR(x, sh3) + */ +void MacroAssembler::sha512_sigma(FloatRegister x, + FloatRegister Qtmp, FloatRegister Dsigma, int sh1, int sh2, int sh3) { + FloatRegister Dtmp0 = as_FloatRegister(Qtmp->encoding()); + FloatRegister Dtmp1 = Dtmp0->successor(FloatRegisterImpl::DOUBLE); + assert_different_registers(x, Dtmp0, Dtmp1, Dsigma); + + vshr_64_u64(Dtmp0, x, sh1); + vshl_64_64(Dtmp1, x, 64-sh1); + vorr_64(Dsigma, Dtmp0, Dtmp1); + + vshr_64_u64(Dtmp0, x, sh2); + vshl_64_64(Dtmp1, x, 64-sh2); + vorr_64(Dtmp0, Dtmp0, Dtmp1); + + veor_64(Dsigma, Dsigma, Dtmp0); + + vshr_64_u64(Dtmp0, x, sh3); + vshl_64_64(Dtmp1, x, 64-sh3); + vorr_64(Dtmp0, 
Dtmp0, Dtmp1); + + veor_64(Dsigma, Dsigma, Dtmp0); +} + +/** + * SHA512 Delta + * Delta(x) = ROTR(x, sh1) XOR ROTR(x, sh2) XOR SHR(x, sh3) + */ +void MacroAssembler::sha512_delta(FloatRegister x, + FloatRegister Qtmp, FloatRegister Ddelta, int sh1, int sh2, int sh3) { + FloatRegister Dtmp0 = as_FloatRegister(Qtmp->encoding()); + FloatRegister Dtmp1 = Dtmp0->successor(FloatRegisterImpl::DOUBLE); + assert_different_registers(x, Dtmp0, Dtmp1, Ddelta); + + vshr_64_u64(Dtmp0, x, sh1); + vshl_64_64(Dtmp1, x, 64-sh1); + vorr_64(Ddelta, Dtmp0, Dtmp1); + + vshr_64_u64(Dtmp0, x, sh2); + vshl_64_64(Dtmp1, x, 64-sh2); + vorr_64(Dtmp0, Dtmp0, Dtmp1); + + veor_64(Ddelta, Ddelta, Dtmp0); + + vshr_64_u64(Dtmp0, x, sh3); + + veor_64(Ddelta, Ddelta, Dtmp0); +} + +/** + * SHA512 Ch + * Ch(x, y, z) = (x AND y) XOR ( NOT x AND z) + */ +void MacroAssembler::sha512_ch(FloatRegister x, FloatRegister y, FloatRegister z, + FloatRegister Dtmp, FloatRegister Dch) { + assert_different_registers(x, Dtmp, Dch); + + vmvn_64(Dtmp, x); + vand_64(Dtmp, Dtmp, z); + + vand_64(Dch, x, y); + veor_64(Dch, Dtmp, Dch); +} + +/** + * SHA512 Maj + * Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z) + */ +void MacroAssembler::sha512_maj(FloatRegister x, FloatRegister y, FloatRegister z, + FloatRegister Dtmp, FloatRegister Dmaj) { + assert_different_registers(x, Dtmp, Dmaj); + + vand_64(Dmaj, x, y); + vand_64(Dtmp, x, z); + veor_64(Dmaj, Dmaj, Dtmp); + vand_64(Dtmp, y, z); + veor_64(Dmaj, Dmaj, Dtmp); +} + +/** + * SHA512 digest + * + * @param from register pointing to source array address + * @param state register pointing to state array address + */ +void MacroAssembler::kernel_sha512_implCompress(Register from, Register state, + Register counter, Register table_k) { + Label L_hash_loop, L_hash_no_w; + FloatRegister st_a = d18; //q9-q12 + FloatRegister st_b = st_a->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_c = st_b->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_d = st_c->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_e = st_d->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_f = st_e->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_g = st_f->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_h = st_g->successor(FloatRegisterImpl::DOUBLE); + + FloatRegister w16 = d0; //q0-q7 + FloatRegister w15 = w16->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w14 = w15->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w13 = w14->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w12 = w13->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w11 = w12->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w10 = w11->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w9 = w10->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w8 = w9->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w7 = w8->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w6 = w7->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w5 = w6->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w4 = w5->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w3 = w4->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w2 = w3->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w1 = w2->successor(FloatRegisterImpl::DOUBLE); + + FloatRegister t1 = d26; + FloatRegister t2 = d27; + FloatRegister new_a = st_h; + FloatRegister new_e = st_d; + FloatRegister new_new_a = st_g; + FloatRegister new_new_e = st_c; + + FloatRegister w0 = w1->successor(FloatRegisterImpl::DOUBLE); + 
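// Editor's note: a minimal scalar sketch of the round the vector sequence + // below implements (illustration only, not part of the generated stub; + // ROTR64, K and w are hypothetical names for the round constant/schedule): + //   uint64_t S1  = ROTR64(e, 14) ^ ROTR64(e, 18) ^ ROTR64(e, 41); + //   uint64_t ch  = (e & f) ^ (~e & g); + //   uint64_t t1  = h + S1 + ch + K[i] + w[i]; + //   uint64_t S0  = ROTR64(a, 28) ^ ROTR64(a, 34) ^ ROTR64(a, 39); + //   uint64_t maj = (a & b) ^ (a & c) ^ (b & c); + //   uint64_t t2  = S0 + maj; + //   h = g; g = f; f = e; e = d + t1; + //   d = c; c = b; b = a; a = t1 + t2; + // where ROTR64(x, n) = (x >> n) | (x << (64 - n)); each trip through the + // L_hash_loop below retires two such rounds (new_a/new_e, then + // new_new_a/new_new_e). + 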
assert_different_registers(st_a,st_b,st_c,st_d,st_e,st_f,st_g,st_h); + assert_different_registers(w0,w1,w2,w3,w4,w5,w6,w7); + assert_different_registers(w8,w9,w10,w11,w12,w13,w14,w15,w16); + + lea(table_k, ExternalAddress(StubRoutines::sha512_table_addr())); + + // read initial 16 W elements + vld1_64(w16, w15, w14, w13, Address(post(from, 32)), Assembler::ALIGN_STD); + vld1_64(w12, w11, w10, w9, Address(post(from, 32)), Assembler::ALIGN_STD); + vld1_64(w8, w7, w6, w5, Address(post(from, 32)), Assembler::ALIGN_STD); + vld1_64(w4, w3, w2, w1, Address(from), Assembler::ALIGN_STD); + // read initial state to a,b,c,d,e,f,g,h + vld1_64(st_a, st_b, st_c, st_d, Address(post(state, 32)), Assembler::ALIGN_STD); + vld1_64(st_e, st_f, st_g, st_h, Address(state), Assembler::ALIGN_STD); + sub(state, state, 32); + + // revert W + vrev64_128_8(w16, w16); + vrev64_128_8(w14, w14); + vrev64_128_8(w12, w12); + vrev64_128_8(w10, w10); + vrev64_128_8(w8, w8); + vrev64_128_8(w6, w6); + vrev64_128_8(w4, w4); + vrev64_128_8(w2, w2); + + + mov(counter, 40); + BIND(L_hash_loop); { + sub(counter, counter, 1); + // first iteration + // calculate T1 + // read K + vld1_64(d30, Address(post(table_k, 8)), Assembler::ALIGN_64); + vadd_64_64(d31, st_h, w16); + sha512_ch(st_e, st_f, st_g, t2, t1); + sha512_sigma(st_e, q14, t2, 14, 18, 41); + vadd_128_64(q13, q13, q15); + vadd_64_64(t1, t1, t2); + + // calculate T2 + sha512_maj(st_a, st_b, st_c, d30, d31); + sha512_sigma(st_a, q14, t2, 28, 34, 39); + vadd_64_64(t2, t2, d31); + + vadd_64_64(new_a, t1, t2); + vadd_64_64(new_e, st_d, t1); + + // second iteration + // calculate T1 + // read K + vld1_64(d30, Address(post(table_k, 8)), Assembler::ALIGN_64); + vadd_64_64(d31, st_g, w15); + sha512_ch(new_e, st_e, st_f, t2, t1); + sha512_sigma(new_e, q14, t2, 14, 18, 41); + vadd_128_64(q13, q13, q15); + vadd_64_64(t1, t1, t2); + + // calculate T2 + sha512_maj(new_a, st_a, st_b, d30, d31); + sha512_sigma(new_a, q14, t2, 28, 34, 39); + vadd_64_64(t2, t2, d31); + + vadd_64_64(new_new_a, t1, t2); + vadd_64_64(new_new_e, st_c, t1); + + // restore a,b,c,d,e,f,g,h sequence + vswp_128(st_g, st_a); + vswp_128(st_g, st_c); + vswp_128(st_g, st_e); + + cmp(counter, 8); + b(L_hash_no_w, Assembler::LO); + + // calculate W[+1], W[+2] + sha512_delta(w15, q14, t1, 1, 8, 7); + sha512_delta(w2, q14, d30, 19, 61, 6); + sha512_delta(w14, q14, t2, 1, 8, 7); + sha512_delta(w1, q14, d31, 19, 61, 6); + + vadd_128_64(w16, w16, t1); + vadd_128_64(w16, w16, q15); + vadd_64_64(w16, w16, w7); + vadd_64_64(w15, w15, w6); + + BIND(L_hash_no_w); + + vswp_128(w16, w14); + vswp_128(w14, w12); + vswp_128(w12, w10); + vswp_128(w10, w8); + vswp_128(w8, w6); + vswp_128(w6, w4); + vswp_128(w4, w2); + } cbnz(counter, L_hash_loop); + // read initial state to w16 - w9 + vld1_64(w16, w15, w14, w13, Address(post(state, 32)), Assembler::ALIGN_STD); + vld1_64(w12, w11, w10, w9, Address(state), Assembler::ALIGN_STD); + sub(state, state, 32); + + // update state + vadd_128_64(st_a, st_a, w16); + vadd_128_64(st_c, st_c, w14); + vadd_128_64(st_e, st_e, w12); + vadd_128_64(st_g, st_g, w10); + + // store state + vst1_64(st_a, st_b, st_c, st_d, Address(post(state, 32)), Assembler::ALIGN_STD); + vst1_64(st_e, st_f, st_g, st_h, Address(state), Assembler::ALIGN_STD); +} + +void MacroAssembler::bfc_impl(Register Rd, int lsb, int width, Condition cond) { + if (width > 15 && lsb == 0) { + lsr(Rd, Rd, width); + lsl(Rd, Rd, width); + } else if (width > 15 && lsb + width == 32) { + lsl(Rd, Rd, 32 - lsb); + lsr(Rd, Rd, 32 - lsb); + } else { + 
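// Editor's note (illustration; the masks below are an assumed example, not + // emitted literally): an ARM data-processing immediate is an 8-bit value + // rotated right by an even amount, so a wide field is cleared in chunks of + // at most 8 bits, trimming the first chunk so every later chunk starts at + // an even bit position. E.g. lsb = 3, width = 12 clears via two encodable + // masks: + //   bic(Rd, Rd, 0x7f << 3);   // 7-bit chunk, bits [9:3] + //   bic(Rd, Rd, 0x1f << 10);  // 5-bit chunk, bits [14:10] + 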
const int lsb1 = (lsb & 1); + int w1 = width <= 8 - lsb1 ? width : 8 - lsb1; + while (width) { + bic(Rd, Rd, ((1 << w1) - 1) << lsb); + width -= w1; + lsb += w1; + w1 = width > 8 ? 8 : width; + } + } +} + +// get_thread can be called anywhere inside generated code so we need +// to save whatever non-callee save context might get clobbered by the +// call to the C thread_local lookup call or, indeed, the call setup +// code. x86 appears to save C arg registers. + +void MacroAssembler::get_thread(Register dst) { + // call pthread_getspecific + // void * pthread_getspecific(pthread_key_t key); + + // Save all call-clobbered regs except dst, plus rscratch1 and rscratch2. + RegSet saved_regs = RegSet::range(r0, r3) + rscratch1 + rscratch2 + lr - dst; + push(saved_regs, sp); + + // Align stack and save value for return + mov(c_rarg1, sp); + sub(sp, sp, wordSize); + bic(sp, sp, 7); + str(c_rarg1, Address(sp)); + + mov(rscratch2, CAST_FROM_FN_PTR(address, Thread::current)); + + bl(rscratch2); + // undo alignment + ldr(sp, Address(sp)); + + if (dst != c_rarg0) { + mov(dst, c_rarg0); + } + + // restore pushed registers + pop(saved_regs, sp); +} + +#ifdef COMPILER2 +// 24-bit word range == 26-bit byte range +bool check26(int offset) { + // this could be simplified, but it mimics encoding and decoding + // an actual branch instruction + int off1 = offset << 6 >> 8; + int encoded = off1 & ((1<<24)-1); + int decoded = encoded << 8 >> 6; + return offset == decoded; +} + +// Perform some slight adjustments so the default 32MB code cache +// is fully reachable. +static inline address first_cache_address() { + return CodeCache::low_bound() + sizeof(HeapBlock::Header); +} +static inline address last_cache_address() { + return CodeCache::high_bound() - NativeInstruction::arm_insn_sz; +} + +// Can we reach target using unconditional branch or call from anywhere +// in the code cache (because code can be relocated)? +bool MacroAssembler::_reachable_from_cache(address target) { +#ifdef __thumb__ + if ((1 & (intptr_t)target) != 0) { + // Return false to avoid 'b' if we would need to switch to THUMB mode. + return false; + } +#endif + + address cl = first_cache_address(); + address ch = last_cache_address(); + + if (ForceUnreachable) { + // Only addresses from CodeCache can be treated as reachable. + if (target < CodeCache::low_bound() || CodeCache::high_bound() <= target) { + return false; + } + } + + intptr_t loffset = (intptr_t)target - (intptr_t)cl; + intptr_t hoffset = (intptr_t)target - (intptr_t)ch; + + return check26(loffset - 8) && check26(hoffset - 8); +} + +bool MacroAssembler::_cache_fully_reachable() { + address cl = first_cache_address(); + address ch = last_cache_address(); + return _reachable_from_cache(cl) && _reachable_from_cache(ch); +} + +bool MacroAssembler::reachable_from_cache(address target) { + assert(CodeCache::contains(pc()), "not supported"); + return _reachable_from_cache(target); +} + +bool MacroAssembler::cache_fully_reachable() { + return _cache_fully_reachable(); +} + +// IMPORTANT: does not generate mt-safe patchable code +void MacroAssembler::call(address target, RelocationHolder rspec, Condition cond) { + Register scratch = lr; + assert(rspec.type() == relocInfo::runtime_call_type || rspec.type() == relocInfo::none, "not supported"); + if (reachable_from_cache(target)) { + relocate(rspec); + bl(target, cond); + return; + } + + mov(scratch, (intptr_t)target, cond); + bl(scratch, cond); +} + +// IMPORTANT: does not generate mt-safe patchable code. 
C2 only uses this method +// for calls into runtime which do not need mt-safe patching +void MacroAssembler::jump(address target, relocInfo::relocType rtype, Register scratch, Condition cond) { + assert((rtype == relocInfo::runtime_call_type) || (rtype == relocInfo::none), "not supported"); + if (reachable_from_cache(target)) { + relocate(rtype); + b(target, cond); + return; + } + + mov(scratch, (intptr_t)target, cond); + b(scratch, cond); +} + +void MacroAssembler::arm_stack_overflow_check(int frame_size_in_bytes, Register tmp) { + // Version of AbstractAssembler::generate_stack_overflow_check optimized for ARM + if (UseStackBanging) { + const int page_size = os::vm_page_size(); + + sub(tmp, sp, StackShadowPages*page_size); + strb(r0, Address(tmp)); + for (; frame_size_in_bytes >= page_size; frame_size_in_bytes -= 0xff0) { + strb(r0, pre(tmp, -0xff0)); + } + } +} + +void MacroAssembler::floating_cmp(Register dst) { + vmrs(dst); + orr(dst, dst, 0x08000000); + eor(dst, dst, dst, lsl(3)); + mov(dst, dst, asr(30)); +} + +void MacroAssembler::fast_lock(Register Roop, Register Rbox, Register Rmark, Register Rscratch, Register Rscratch2) { + assert(Roop != Rscratch, ""); + assert(Roop != Rmark, ""); + assert(Rbox != Rscratch, ""); + assert(Rbox != Rmark, ""); + + Label fast_lock, done; + + if (UseBiasedLocking && !UseOptoBiasInlining) { + Label failed; + biased_locking_enter(Roop, Rmark, Rscratch, Rscratch2, false, done, &failed); + bind(failed); + } + + ldr(Rmark, Address(Roop, oopDesc::mark_offset_in_bytes())); + tst(Rmark, markOopDesc::unlocked_value); + b(fast_lock, Assembler::NE); + + // Check for recursive lock + // See comments in InterpreterMacroAssembler::lock_object for + // explanations on the fast recursive locking check. + // -1- test low 2 bits + movs(Rscratch, Rmark, lsl(30)); + // -2- test (hdr - SP) if the low two bits are 0 + sub(Rscratch, Rmark, sp, Assembler::EQ); + movs(Rscratch, Rscratch, lsr(exact_log2(os::vm_page_size())), Assembler::EQ); + // If still 'eq' then recursive locking OK + // set to zero if recursive lock, set to non zero otherwise (see discussion in JDK-8153107) + str(Rscratch, Address(Rbox, BasicLock::displaced_header_offset_in_bytes())); + b(done); + + bind(fast_lock); + str(Rmark, Address(Rbox, BasicLock::displaced_header_offset_in_bytes())); + + membar(StoreStore); + ldrex(Rscratch, Address(Roop, oopDesc::mark_offset_in_bytes())); + cmp(Rscratch, Rmark); + strex(Rscratch, Rbox, Address(Roop, oopDesc::mark_offset_in_bytes()), Assembler::EQ); + cmp(Rscratch, 0, Assembler::EQ); + membar(AnyAny); + + bind(done); +} + +void MacroAssembler::fast_unlock(Register Roop, Register Rbox, Register Rscratch, Register Rscratch2) { + Register Rmark = Rscratch2; + + assert(Roop != Rscratch, ""); + assert(Roop != Rmark, ""); + assert(Rbox != Rscratch, ""); + assert(Rbox != Rmark, ""); + + Label done; + + if (UseBiasedLocking && !UseOptoBiasInlining) { + biased_locking_exit(Roop, Rscratch, done); + } + + ldr(Rmark, Address(Rbox, BasicLock::displaced_header_offset_in_bytes())); + // If hdr is NULL, we've got recursive locking and there's nothing more to do + cmp(Rmark, 0); + b(done, Assembler::EQ); + + // Restore the object header + membar(AnyAny); + ldrex(Rscratch, Address(Roop, oopDesc::mark_offset_in_bytes())); + cmp(Rscratch, Rmark); + strex(Rscratch, Rbox, Address(Roop, oopDesc::mark_offset_in_bytes()), Assembler::EQ); + cmp(Rscratch, 0, Assembler::EQ); + + membar(StoreLoad); + + bind(done); +} + +#endif
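+ +// Editor's note on fast_lock/fast_unlock above (a hedged pseudo-C sketch, not +// the exact generated sequence; helper names are illustrative): +// +//   markOop mark = obj->mark(); +//   if (mark & unlocked_value) {                            // low bits == 01 +//     box->displaced_header = mark;                         // keep old header +//     CAS(obj->mark_addr(), /*expected*/ mark, /*new*/ box); // ldrex/strex +//   } else if ((mark & 3) == 0 && +//              (uintptr_t)mark - (uintptr_t)sp < os::vm_page_size()) { +//     box->displaced_header = NULL;  // recursive stack lock (see JDK-8153107) +//   } +// +// fast_unlock mirrors this: a NULL displaced header marks a recursive lock +// (nothing to do), otherwise the saved header is CAS'ed back into the mark +// word.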