/*
 * Copyright (c) 2000, 2015, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */
// This file is a derivative work resulting from (and including) modifications
// made by Azul Systems, Inc.  The dates of such changes are 2013-2016.
// Copyright 2013-2016 Azul Systems, Inc.  All Rights Reserved.
//
// Please contact Azul Systems, 385 Moffett Park Drive, Suite 115, Sunnyvale,
// CA 94089 USA or visit www.azul.com if you need additional information or
// have any questions.

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "c1/c1_CodeStubs.hpp"
#include "c1/c1_Compilation.hpp"
#include "c1/c1_LIRAssembler.hpp"
#include "c1/c1_MacroAssembler.hpp"
#include "c1/c1_Runtime1.hpp"
#include "c1/c1_ValueStack.hpp"
#include "ci/ciArrayKlass.hpp"
#include "ci/ciInstance.hpp"
#include "gc_interface/collectedHeap.hpp"
#include "memory/barrierSet.hpp"
#include "memory/cardTableModRefBS.hpp"
#include "nativeInst_aarch32.hpp"
#include "oops/objArrayKlass.hpp"
#include "runtime/sharedRuntime.hpp"
#include "vmreg_aarch32.inline.hpp"

#ifndef PRODUCT
#define COMMENT(x)   do { __ block_comment(x); } while (0)
#else
#define COMMENT(x)
#endif

NEEDS_CLEANUP // remove this definitions ?
const Register IC_Klass    = rscratch2;   // where the IC klass is cached
const Register SYNC_header = r0;   // synchronization header
const Register SHIFT_count = r0;   // where count for shift operations must be

#define __ _masm->


static void select_different_registers(Register preserve,
                                       Register extra,
                                       Register &tmp1,
                                       Register &tmp2) {
  if (tmp1 == preserve) {
    assert_different_registers(tmp1, tmp2, extra);
    tmp1 = extra;
  } else if (tmp2 == preserve) {
    assert_different_registers(tmp1, tmp2, extra);
    tmp2 = extra;
  }
  assert_different_registers(preserve, tmp1, tmp2);
}


static void select_different_registers(Register preserve,
                                       Register extra,
                                       Register &tmp1,
                                       Register &tmp2,
                                       Register &tmp3) {
  if (tmp1 == preserve) {
    assert_different_registers(tmp1, tmp2, tmp3, extra);
    tmp1 = extra;
  } else if (tmp2 == preserve) {
    assert_different_registers(tmp1, tmp2, tmp3, extra);
    tmp2 = extra;
  } else if (tmp3 == preserve) {
    assert_different_registers(tmp1, tmp2, tmp3, extra);
    tmp3 = extra;
  }
  assert_different_registers(preserve, tmp1, tmp2, tmp3);
}

bool LIR_Assembler::is_small_constant(LIR_Opr opr) { Unimplemented(); return false; }


LIR_Opr LIR_Assembler::receiverOpr() {
  return FrameMap::receiver_opr;
}

LIR_Opr LIR_Assembler::osrBufferPointer() {
  return FrameMap::as_pointer_opr(receiverOpr()->as_register());
}

//--------------fpu register translations-----------------------


address LIR_Assembler::float_constant(float f) {
  address const_addr = __ float_constant(f);
  if (const_addr == NULL) {
    bailout("const section overflow");
    return __ code()->consts()->start();
  } else {
    return const_addr;
  }
}


address LIR_Assembler::double_constant(double d) {
  address const_addr = __ double_constant(d);
  if (const_addr == NULL) {
    bailout("const section overflow");
    return __ code()->consts()->start();
  } else {
    return const_addr;
  }
}

address LIR_Assembler::int_constant(jlong n) {
  address const_addr = __ long_constant(n);
  if (const_addr == NULL) {
    bailout("const section overflow");
    return __ code()->consts()->start();
  } else {
    return const_addr;
  }
}

void LIR_Assembler::set_24bit_FPU() { Unimplemented(); }

void LIR_Assembler::reset_FPU() { Unimplemented(); }

void LIR_Assembler::fpop() { Unimplemented(); }

void LIR_Assembler::fxch(int i) { Unimplemented(); }

void LIR_Assembler::fld(int i) { Unimplemented(); }

void LIR_Assembler::ffree(int i) { Unimplemented(); }

void LIR_Assembler::breakpoint() { __ bkpt(0); }

void LIR_Assembler::push(LIR_Opr opr) { Unimplemented(); }

void LIR_Assembler::pop(LIR_Opr opr) { Unimplemented(); }

//-------------------------------------------

static Register as_reg(LIR_Opr op) {
  return op->is_double_cpu() ? op->as_register_lo() : op->as_register();
}

Address LIR_Assembler::as_Address(LIR_Address* addr) {
  // as_Address(LIR_Address*, Address::InsnDataType) should be used instead
  ShouldNotCallThis();
}

Address LIR_Assembler::as_Address_hi(LIR_Address* addr) {
  // as_Address_hi(LIR_Address*, Address::InsnDataType) should be used instead
  ShouldNotCallThis();
}

Address LIR_Assembler::as_Address_lo(LIR_Address* addr) {
  // as_Address_lo(LIR_Address*, Address::InsnDataType) should be used instead
  ShouldNotCallThis();
}

Address LIR_Assembler::as_Address(LIR_Address* addr, Register tmp, Address::InsnDataType type) {
  if (addr->base()->is_illegal()) {
    assert(addr->index()->is_illegal(), "must be illegal too");
    __ mov(tmp, addr->disp());
    return Address(tmp); // encoding is ok for any data type
  }

  Register base = addr->base()->as_pointer_register();

  if (addr->index()->is_illegal()) {
    return Address(base, addr->disp()).safe_for(type, _masm, tmp);
  } else if (addr->index()->is_cpu_register()) {
    assert(addr->disp() == 0, "must be");
    Register index = addr->index()->as_pointer_register();
    return Address(base, index, lsl(addr->scale())).safe_for(type, _masm, tmp);
  } else if (addr->index()->is_constant()) {
    intptr_t addr_offset = (addr->index()->as_constant_ptr()->as_jint() << addr->scale()) + addr->disp();
    return Address(base, addr_offset).safe_for(type, _masm, tmp);
  }

  Unimplemented();
  return Address();
}

Address LIR_Assembler::as_Address_hi(LIR_Address* addr, Address::InsnDataType type) {
  assert(type == Address::IDT_INT, "only to be used for accessing high word of jlong");

  if (addr->base()->is_illegal()) {
    assert(addr->index()->is_illegal(), "must be illegal too");
    __ mov(rscratch1, addr->disp() + wordSize);
    return Address(rscratch1); // encoding is ok for IDR_INT
  }

  Register base = addr->base()->as_pointer_register();

  if (addr->index()->is_illegal()) {
    return Address(base, addr->disp() + wordSize).safe_for(Address::IDT_INT, _masm, rscratch1);
  } else if (addr->index()->is_cpu_register()) {
    assert(addr->disp() == 0, "must be");
    Register index = addr->index()->as_pointer_register();
    __ add(rscratch1, base, wordSize);
    return Address(rscratch1, index, lsl(addr->scale())); // encoding is ok for IDT_INT
  } else if (addr->index()->is_constant()) {
    intptr_t addr_offset = (addr->index()->as_constant_ptr()->as_jint() << addr->scale()) + addr->disp() + wordSize;
    return Address(base, addr_offset).safe_for(Address::IDT_INT, _masm, rscratch1);
  }

  Unimplemented();
  return Address();
}

Address LIR_Assembler::as_Address_lo(LIR_Address* addr, Address::InsnDataType type) {
  return as_Address(addr, rscratch1, type);
}


void LIR_Assembler::osr_entry() {
  offsets()->set_value(CodeOffsets::OSR_Entry, code_offset());
  BlockBegin* osr_entry = compilation()->hir()->osr_entry();
  ValueStack* entry_state = osr_entry->state();
  int number_of_locks = entry_state->locks_size();

  // we jump here if osr happens with the interpreter
  // state set up to continue at the beginning of the
  // loop that triggered osr - in particular, we have
  // the following registers setup:
  //
  // r1: osr buffer
  //

  // build frame
  ciMethod* m = compilation()->method();
  __ build_frame(initial_frame_size_in_bytes(), bang_size_in_bytes());

  // OSR buffer is
  //
  // locals[nlocals-1..0]
  // monitors[0..number_of_locks]
  //
  // locals is a direct copy of the interpreter frame so in the osr buffer
  // so first slot in the local array is the last local from the interpreter
  // and last slot is local[0] (receiver) from the interpreter
  //
  // Similarly with locks. The first lock slot in the osr buffer is the nth lock
  // from the interpreter frame, the nth lock slot in the osr buffer is 0th lock
  // in the interpreter frame (the method lock if a sync method)

  // Initialize monitors in the compiled activation.
  //   r1: pointer to osr buffer
  //
  // All other registers are dead at this point and the locals will be
  // copied into place by code emitted in the IR.

  Register OSR_buf = osrBufferPointer()->as_pointer_register();
  { assert(frame::interpreter_frame_monitor_size() == BasicObjectLock::size(), "adjust code below");
    int monitor_offset = BytesPerWord * method()->max_locals() +
      (2 * BytesPerWord) * (number_of_locks - 1);
    // SharedRuntime::OSR_migration_begin() packs BasicObjectLocks in
    // the OSR buffer using 2 word entries: first the lock and then
    // the oop.
    for (int i = 0; i < number_of_locks; i++) {
      int slot_offset = monitor_offset - ((i * 2) * BytesPerWord);
#ifdef ASSERT
      // verify the interpreter's monitor has a non-null object
      {
        Label L;
        __ ldr(rscratch1, Address(OSR_buf, slot_offset + 1*BytesPerWord));
        __ cbnz(rscratch1, L);
        __ stop("locked object is NULL");
        __ bind(L);
      }
#endif
      __ ldr(rscratch1, Address(OSR_buf, slot_offset + 0));
      __ str(rscratch1, frame_map()->address_for_monitor_lock(i));
      __ ldr(rscratch1, Address(OSR_buf, slot_offset + 1*BytesPerWord));
      __ str(rscratch1, frame_map()->address_for_monitor_object(i));
    }
  }
}


// inline cache check; done before the frame is built.
int LIR_Assembler::check_icache() {
  Register receiver = FrameMap::receiver_opr->as_register();
  Register ic_klass = IC_Klass;
  int start_offset = __ offset();
  __ inline_cache_check(receiver, ic_klass);

  // if icache check fails, then jump to runtime routine
  // Note: RECEIVER must still contain the receiver!
  Label dont;
  __ b(dont, Assembler::EQ);
  __ far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub()));

  // We align the verified entry point unless the method body
  // (including its inline cache check) will fit in a single 64-byte
  // icache line.
  if (! method()->is_accessor() || __ offset() - start_offset > 4 * 4) {
    // force alignment after the cache check.
    __ align(CodeEntryAlignment);
  }

  __ bind(dont);
  return start_offset;
}


void LIR_Assembler::jobject2reg(jobject o, Register reg) {
  if (o == NULL) {
    __ mov(reg, 0);
  } else {
    __ movoop(reg, o, /*immediate*/true);
  }
}

void LIR_Assembler::deoptimize_trap(CodeEmitInfo *info) {
  __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::deoptimize_id)));
  add_call_info_here(info);
}

void LIR_Assembler::jobject2reg_with_patching(Register reg, CodeEmitInfo *info) {
  jobject o = NULL;
  PatchingStub* patch = new PatchingStub(_masm, patching_id(info));
  __ movoop(reg, o, true);
  patching_epilog(patch, lir_patch_normal, reg, info);
}

// Return sp decrement needed to build a frame
int LIR_Assembler::initial_frame_size_in_bytes() const {
  // We need to subtract two words to take into account saved lr and rfp.
  return in_bytes(frame_map()->framesize_in_bytes()) -
         FrameMap::frame_pad_in_bytes;
}

int LIR_Assembler::emit_exception_handler() {
  // if the last instruction is a call (typically to do a throw which
  // is coming at the end after block reordering) the return address
  // must still point into the code area in order to avoid assertion
  // failures when searching for the corresponding bci => add a nop
  // (was bug 5/14/1999 - gri)
  __ nop();

  // generate code for exception handler
  address handler_base = __ start_a_stub(exception_handler_size);
  if (handler_base == NULL) {
    // not enough space left for the handler
    bailout("exception handler overflow");
    return -1;
  }

  int offset = code_offset();

  // the exception oop and pc are in r0, and r3
  // no other registers need to be preserved, so invalidate them
  __ invalidate_registers(false, true, false);

  // check that there is really an exception
  __ verify_not_null_oop(r0);

  // search an exception handler (r0: exception oop, r3: throwing pc)
  __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::handle_exception_from_callee_id)));  __ should_not_reach_here();
  guarantee(code_offset() - offset <= exception_handler_size, "overflow");
  __ end_a_stub();

  return offset;
}


// Emit the code to remove the frame from the stack in the exception
// unwind path.
int LIR_Assembler::emit_unwind_handler() {
#ifndef PRODUCT
  if (CommentedAssembly) {
    _masm->block_comment("Unwind handler");
  }
#endif

  int offset = code_offset();

  // Fetch the exception from TLS and clear out exception related thread state
  __ ldr(r0, Address(rthread, JavaThread::exception_oop_offset()));
  __ mov(rscratch1, 0);
  __ str(rscratch1, Address(rthread, JavaThread::exception_oop_offset()));
  __ str(rscratch1, Address(rthread, JavaThread::exception_pc_offset()));

  __ bind(_unwind_handler_entry);
  __ verify_not_null_oop(r0);

  // Preform needed unlocking
  MonitorExitStub* stub = NULL;
  if (method()->is_synchronized()) {
    monitor_address(0, FrameMap::r1_opr);
    stub = new MonitorExitStub(FrameMap::r1_opr, true, 0);
    __ unlock_object(r5, r4, r1, *stub->entry());
    __ bind(*stub->continuation());
  }

  if (compilation()->env()->dtrace_method_probes()) {
    __ call_Unimplemented();
#if 0
    // FIXME check exception_store is not clobbered below!
    __ movptr(Address(rsp, 0), rax);
    __ mov_metadata(Address(rsp, sizeof(void*)), method()->constant_encoding());
    __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit)));
#endif
  }

  // remove the activation and dispatch to the unwind handler
  __ block_comment("remove_frame and dispatch to the unwind handler");
  __ remove_frame(initial_frame_size_in_bytes());
  __ far_jump(RuntimeAddress(Runtime1::entry_for(Runtime1::unwind_exception_id)));

  // Emit the slow path assembly
  if (stub != NULL) {
    stub->emit_code(this);
  }

  return offset;
}


int LIR_Assembler::emit_deopt_handler() {
  // if the last instruction is a call (typically to do a throw which
  // is coming at the end after block reordering) the return address
  // must still point into the code area in order to avoid assertion
  // failures when searching for the corresponding bci => add a nop
  // (was bug 5/14/1999 - gri)
  __ nop();

  // generate code for exception handler
  address handler_base = __ start_a_stub(deopt_handler_size);
  if (handler_base == NULL) {
    // not enough space left for the handler
    bailout("deopt handler overflow");
    return -1;
  }

  int offset = code_offset();

  __ adr(lr, pc());
  __ far_jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack()));
  guarantee(code_offset() - offset <= deopt_handler_size, "overflow");
  __ end_a_stub();

  return offset;
}


// This is the fast version of java.lang.String.compare; it has not
// OSR-entry and therefore, we generate a slow version for OSR's
void LIR_Assembler::emit_string_compare(LIR_Opr arg0, LIR_Opr arg1, LIR_Opr dst, CodeEmitInfo* info)  {
  __ mov(r2, (address)__FUNCTION__);
  __ call_Unimplemented();
}


void LIR_Assembler::add_debug_info_for_branch(address adr, CodeEmitInfo* info) {
  _masm->code_section()->relocate(adr, relocInfo::poll_type);
  int pc_offset = code_offset();
  flush_debug_info(pc_offset);
  info->record_debug_info(compilation()->debug_info_recorder(), pc_offset);
  if (info->exception_handlers() != NULL) {
    compilation()->add_exception_handlers_for_pco(pc_offset, info->exception_handlers());
  }
}

void LIR_Assembler::return_op(LIR_Opr result) {
  assert(result->is_illegal() || !result->is_single_cpu() || result->as_register() == r0, "word returns are in r0,");
  // Pop the stack before the safepoint code
  __ remove_frame(initial_frame_size_in_bytes());
  address polling_page(os::get_polling_page());
  __ read_polling_page(rscratch1, polling_page, relocInfo::poll_return_type);
  __ ret(lr);
}

int LIR_Assembler::safepoint_poll(LIR_Opr tmp, CodeEmitInfo* info) {
  address polling_page(os::get_polling_page());
  guarantee(info != NULL, "Shouldn't be NULL");
  assert(os::is_poll_address(polling_page), "should be");
  __ mov(rscratch1, Address(polling_page, relocInfo::poll_type));
  add_debug_info_for_branch(info);  // This isn't just debug info:
  // it's the oop map
  __ read_polling_page(rscratch1, relocInfo::poll_type);
  return __ offset();
}

void LIR_Assembler::move_regs(Register from_reg, Register to_reg) {
  if (from_reg != to_reg) {
    __ mov(to_reg, from_reg);
  }
}

void LIR_Assembler::swap_reg(Register a, Register b) {
  Unimplemented();
}

void LIR_Assembler::const2reg(LIR_Opr src, LIR_Opr dest, LIR_PatchCode patch_code, CodeEmitInfo* info) {
  assert(src->is_constant(), "should not call otherwise");
  assert(dest->is_register(), "should not call otherwise");
  LIR_Const* c = src->as_constant_ptr();

  switch (c->type()) {
    case T_INT: {
      assert(patch_code == lir_patch_none, "no patching handled here");
      __ mov(dest->as_register(), c->as_jint());
      break;
    }

    case T_ADDRESS: {
      assert(patch_code == lir_patch_none, "no patching handled here");
      __ mov(dest->as_register(), c->as_jint());
      break;
    }

    case T_LONG: {
      assert(patch_code == lir_patch_none, "no patching handled here");
      __ mov(dest->as_register_lo(), c->as_jint_lo());
      __ mov(dest->as_register_hi(), c->as_jint_hi());
      break;
    }

    case T_OBJECT: {
        if (patch_code == lir_patch_none) {
          jobject2reg(c->as_jobject(), dest->as_register());
        } else {
          jobject2reg_with_patching(dest->as_register(), info);
        }
      break;
    }

    case T_METADATA: {
      if (patch_code != lir_patch_none) {
        klass2reg_with_patching(dest->as_register(), info);
      } else {
        __ mov_metadata(dest->as_register(), c->as_metadata());
      }
      break;
    }

    case T_FLOAT: {
#ifdef __ARM_PCS_VFP
        if (__ operand_valid_for_float_immediate(c->as_jfloat())) {
            __ vmov_f32(dest->as_float_reg(), c->as_jfloat());
        } else {
            __ lea(rscratch1, InternalAddress(float_constant(c->as_jfloat())));
            __ vldr_f32(dest->as_float_reg(), Address(rscratch1));
        }
#else
#error "unimplemented"
#endif
      break;
    }

    case T_DOUBLE: {
#ifdef __ARM_PCS_VFP
        if (__ operand_valid_for_double_immediate(c->as_jdouble())) {
            __ vmov_f64(dest->as_double_reg(), c->as_jdouble());
        } else {
            __ lea(rscratch1, InternalAddress(double_constant(c->as_jdouble())));
            __ vldr_f64(dest->as_double_reg(), Address(rscratch1));
        }
#else
#error "unimplemented"
#endif
      break;
    }

    default:
      ShouldNotReachHere();
  }
}

void LIR_Assembler::const2stack(LIR_Opr src, LIR_Opr dest) {
  LIR_Const* c = src->as_constant_ptr();
  switch (c->type()) {
  case T_OBJECT:
    {
      if (! c->as_jobject()) {
        __ mov(rscratch1, 0);
        __ str(rscratch1, frame_map()->address_for_slot(dest->single_stack_ix()));
      } else {
        const2reg(src, FrameMap::rscratch1_opr, lir_patch_none, NULL);
        reg2stack(FrameMap::rscratch1_opr, dest, c->type(), false);
      }
    }
    break;
  case T_ADDRESS:
    {
      const2reg(src, FrameMap::rscratch1_opr, lir_patch_none, NULL);
      reg2stack(FrameMap::rscratch1_opr, dest, c->type(), false);
    }
  case T_INT:
  case T_FLOAT:
    {
      __ mov(rscratch1, c->as_jint_bits());
      __ str(rscratch1, frame_map()->address_for_slot(dest->single_stack_ix()));
    }
    break;
  case T_LONG:
  case T_DOUBLE:
    {
        __ mov(rscratch1, c->as_jint_lo());
        __ str(rscratch1, frame_map()->address_for_slot(dest->double_stack_ix(),
                                                        lo_word_offset_in_bytes));
        if (c->as_jint_lo() != c->as_jint_hi())
            __ mov(rscratch1, c->as_jint_hi());
        __ str(rscratch1, frame_map()->address_for_slot(dest->double_stack_ix(),
                                                        hi_word_offset_in_bytes));
    }
    break;
  default:
    ShouldNotReachHere();
  }
}

/*
 * For now this code can load only zero constants as in aarch32.
 * It seems like this implementation can break some tests in future.
 * TODO: ensure, write test, and rewrite if need.
 */
void LIR_Assembler::const2mem(LIR_Opr src, LIR_Opr dest, BasicType type, CodeEmitInfo* info, bool wide) {
  assert(src->is_constant(), "should not call otherwise");
  LIR_Const* c = src->as_constant_ptr();
  LIR_Address* to_addr = dest->as_address_ptr();

  void (Assembler::* insn)(Register Rt, const Address &adr, Assembler::Condition cnd);

  __ mov(rscratch2, 0);

  int null_check_here = code_offset();

  Address::InsnDataType idt = Address::toInsnDataType(type);
  switch (type) {
  case T_ADDRESS:
    assert(c->as_jint() == 0, "should be");
    insn = &Assembler::str;
    break;
  case T_LONG: {
    assert(c->as_jlong() == 0, "should be");
    insn = &Assembler::str;
    Address addr = as_Address_hi(to_addr, Address::IDT_INT);
    null_check_here = code_offset();
    __ str(rscratch2, addr);
    idt = Address::IDT_INT;
    break;
  }
  case T_INT:
    assert(c->as_jint() == 0, "should be");
    insn = &Assembler::str;
    break;
  case T_OBJECT:
  case T_ARRAY:
    assert(c->as_jobject() == 0, "should be");
    insn = &Assembler::str;
    break;
  case T_CHAR:
  case T_SHORT:
    assert(c->as_jint() == 0, "should be");
    insn = &Assembler::strh;
    break;
  case T_BOOLEAN:
  case T_BYTE:
    assert(c->as_jint() == 0, "should be");
    insn = &Assembler::strb;
    break;
  default:
    ShouldNotReachHere();
  }

  (_masm->*insn)(rscratch2, as_Address(to_addr, idt), Assembler::C_DFLT);
  if (info) add_debug_info_for_null_check(null_check_here, info);
}

void LIR_Assembler::reg2reg(LIR_Opr src, LIR_Opr dest) {
  assert(src->is_register(), "should not call otherwise");
  assert(dest->is_register(), "should not call otherwise");

  // move between cpu-registers
  if (dest->is_single_cpu()) {
    if (src->type() == T_LONG) {
      // Can do LONG -> OBJECT
      __ stop("investigate how \"LONG -> OBJECT\" works especially when high part is != 0");
      move_regs(src->as_register_lo(), dest->as_register());
      return;
    }
    assert(src->is_single_cpu(), "must match");
    if (src->type() == T_OBJECT) {
      __ verify_oop(src->as_register());
    }
    move_regs(src->as_register(), dest->as_register());

  } else if (dest->is_double_cpu()) {
    if (src->type() == T_OBJECT || src->type() == T_ARRAY) {
      // Surprising to me but we can see move of a long to t_object
      __ verify_oop(src->as_register());
      move_regs(src->as_register(), dest->as_register_lo());
      __ mov(dest->as_register_hi(), 0);
      return;
    }
    assert(src->is_double_cpu(), "must match");
    Register f_lo = src->as_register_lo();
    Register f_hi = src->as_register_hi();
    Register t_lo = dest->as_register_lo();
    Register t_hi = dest->as_register_hi();
    assert(f_hi != f_lo, "must be different");
    assert(t_hi != t_lo, "must be different");
    check_register_collision(t_lo, &f_hi);
    move_regs(f_lo, t_lo);
    move_regs(f_hi, t_hi);
  } else if (dest->is_single_fpu()) {
    __ vmov_f32(dest->as_float_reg(), src->as_float_reg());

  } else if (dest->is_double_fpu()) {
    __ vmov_f64(dest->as_double_reg(), src->as_double_reg());

  } else {
    ShouldNotReachHere();
  }
}

void LIR_Assembler::reg2stack(LIR_Opr src, LIR_Opr dest, BasicType type, bool pop_fpu_stack) {
  if (src->is_single_cpu()) {
    if (type == T_ARRAY || type == T_OBJECT) {
      __ str(src->as_register(), frame_map()->address_for_slot(dest->single_stack_ix()));
      __ verify_oop(src->as_register());
    } else {
      __ str(src->as_register(), frame_map()->address_for_slot(dest->single_stack_ix()));
    }

  } else if (src->is_double_cpu()) {
    Address dest_addr_LO = frame_map()->address_for_slot(dest->double_stack_ix(), lo_word_offset_in_bytes);
    __ strd(src->as_register_lo(), src->as_register_hi(), dest_addr_LO);

  } else if (src->is_single_fpu()) {
    Address dest_addr = frame_map()->address_for_slot(dest->single_stack_ix());
#ifdef __ARM_PCS_VFP
    __ vstr_f32(src->as_float_reg(), dest_addr.safe_for(Address::IDT_FLOAT, _masm, rscratch1));
#else
#error "unimplemented"
#endif
  } else if (src->is_double_fpu()) {
    Address dest_addr = frame_map()->address_for_slot(dest->double_stack_ix());
#ifdef __ARM_PCS_VFP
    __ vstr_f64(src->as_double_reg(), dest_addr.safe_for(Address::IDT_DOUBLE, _masm, rscratch1));
#else
#error "unimplemented"
#endif
  } else {
    ShouldNotReachHere();
  }

}


void LIR_Assembler::reg2mem(LIR_Opr src, LIR_Opr dest, BasicType type, LIR_PatchCode patch_code, CodeEmitInfo* info, bool pop_fpu_stack, bool wide, bool /* unaligned */) {
  LIR_Address* to_addr = dest->as_address_ptr();

  if (type == T_ARRAY || type == T_OBJECT) {
    __ verify_oop(src->as_register());
  }

  PatchingStub* patch = NULL;
  if (patch_code != lir_patch_none) {
    assert(to_addr->disp() != 0, "must have");

    patch = new PatchingStub(_masm, PatchingStub::access_field_id);
    __ mov(rscratch1, (address) to_addr->disp());
    patching_epilog(patch, patch_code, to_addr->base()->as_register(), info);

    to_addr = new LIR_Address(to_addr->base(), FrameMap::rscratch1_opr, to_addr->type());
  }


  int null_check_here = code_offset();
  switch (type) {
    case T_FLOAT: {
#ifdef __ARM_PCS_VFP
      Address addr = as_Address(to_addr, Address::IDT_FLOAT);
      null_check_here = code_offset();
      __ vstr_f32(src->as_float_reg(), addr);
#else
#error "unimplemented"
#endif
      break;
    }

    case T_DOUBLE: {
#ifdef __ARM_PCS_VFP
      Address addr = as_Address(to_addr, Address::IDT_DOUBLE);
      null_check_here = code_offset();
      __ vstr_f64(src->as_double_reg(), addr);
#else
#error "unimplemented"
#endif

      break;
    }

    case T_ARRAY:   // fall through
    case T_OBJECT:  // fall through
    case T_ADDRESS: // fall though
    case T_INT: {
      Address addr = as_Address(to_addr, Address::toInsnDataType(type));
      null_check_here = code_offset();
      __ str(src->as_register(), addr);
      break;
    }
    case T_METADATA:
      // We get here to store a method pointer to the stack to pass to
      // a dtrace runtime call. This can't work on 64 bit with
      // compressed klass ptrs: T_METADATA can be a compressed klass
      // ptr or a 64 bit method pointer.
      ShouldNotReachHere();
//      __ str(src->as_register(), as_Address(to_addr));
      break;

    case T_LONG: {
      Address addr = as_Address_lo(to_addr, Address::IDT_LONG);
      null_check_here = code_offset();
      null_check_here += __ strd(src->as_register_lo(), src->as_register_hi(), addr);
      break;
    }

    case T_BYTE:    // fall through
    case T_BOOLEAN: {
      Address addr = as_Address(to_addr, Address::toInsnDataType(type));
      null_check_here = code_offset();
      __ strb(src->as_register(), addr);
      break;
    }
    case T_CHAR:    // fall through
    case T_SHORT: {
      Address addr = as_Address(to_addr, Address::toInsnDataType(type));
      null_check_here = code_offset();
      __ strh(src->as_register(), addr);
      break;
    }
    default:
      ShouldNotReachHere();
  }

  if (info != NULL) {
    add_debug_info_for_null_check(null_check_here, info);
  }
}


void LIR_Assembler::stack2reg(LIR_Opr src, LIR_Opr dest, BasicType type) {
  assert(src->is_stack(), "should not call otherwise");
  assert(dest->is_register(), "should not call otherwise");

  if (dest->is_single_cpu()) {
    if (type == T_ARRAY || type == T_OBJECT) {
      __ ldr(dest->as_register(), frame_map()->address_for_slot(src->single_stack_ix()));
      __ verify_oop(dest->as_register());
    } else {
      __ ldr(dest->as_register(), frame_map()->address_for_slot(src->single_stack_ix()));
    }

  } else if (dest->is_double_cpu()) {
    Address src_addr_LO = frame_map()->address_for_slot(src->double_stack_ix(), lo_word_offset_in_bytes);
    __ ldrd(dest->as_register_lo(), dest->as_register_hi(), src_addr_LO);

  } else if (dest->is_single_fpu()) {
#ifdef __ARM_PCS_VFP
    Address src_addr = frame_map()->address_for_slot(src->single_stack_ix());
    __ vldr_f32(dest->as_float_reg(), src_addr.safe_for(Address::IDT_FLOAT, _masm, rscratch1));
#else
#error "unimplemented"
#endif
  } else if (dest->is_double_fpu()) {
#ifdef __ARM_PCS_VFP
    Address src_addr = frame_map()->address_for_slot(src->double_stack_ix());
    __ vldr_f64(dest->as_double_reg(), src_addr.safe_for(Address::IDT_DOUBLE, _masm, rscratch1));
#else
#error "unimplemented"
#endif
  } else {
    ShouldNotReachHere();
  }
}


void LIR_Assembler::klass2reg_with_patching(Register reg, CodeEmitInfo* info) {
  Metadata* o = NULL;
  PatchingStub* patch = new PatchingStub(_masm, PatchingStub::load_klass_id);
  __ mov_metadata(reg, o);
  patching_epilog(patch, lir_patch_normal, reg, info);
}

void LIR_Assembler::stack2stack(LIR_Opr src, LIR_Opr dest, BasicType type) {

  LIR_Opr temp;
  if (type == T_LONG || type == T_DOUBLE)
    temp = FrameMap::rscratch_long_opr;
  else
    temp = FrameMap::rscratch1_opr;

  stack2reg(src, temp, src->type());
  reg2stack(temp, dest, dest->type(), false);
}


void LIR_Assembler::mem2reg(LIR_Opr src, LIR_Opr dest, BasicType type, LIR_PatchCode patch_code, CodeEmitInfo* info, bool wide, bool /* unaligned */) {
  LIR_Address* from_addr = src->as_address_ptr();

  if (from_addr->base()->type() == T_OBJECT) {
    __ verify_oop(from_addr->base()->as_pointer_register());
  }

  PatchingStub* patch = NULL;
  if (patch_code != lir_patch_none) {
    assert(from_addr->disp() != 0, "must have");

    patch = new PatchingStub(_masm, PatchingStub::access_field_id);
    __ mov(rscratch1, (address) from_addr->disp());
    patching_epilog(patch, patch_code, from_addr->base()->as_register(), info);

    from_addr = new LIR_Address(from_addr->base(), FrameMap::rscratch1_opr, from_addr->type());
  }

  int null_check_here = code_offset();

  switch (type) {
    case T_FLOAT: {
#ifdef __ARM_PCS_VFP
    Address addr = as_Address(from_addr, Address::IDT_FLOAT);
    null_check_here = code_offset();
    __ vldr_f32(dest->as_float_reg(), addr);
#else
#error "unimplemented"
#endif
      break;
    }

    case T_DOUBLE: {
#ifdef __ARM_PCS_VFP
    Address addr = as_Address(from_addr, Address::IDT_DOUBLE);
    null_check_here = code_offset();
    __ vldr_f64(dest->as_double_reg(), addr);
#else
#error "unimplemented"
#endif
      break;
    }

    case T_ARRAY:   // fall through
    case T_OBJECT:  // fall through
    case T_ADDRESS: // fall through
    case T_INT: {
      Address addr = as_Address(from_addr, Address::toInsnDataType(type));
      null_check_here = code_offset();
      __ ldr(dest->as_register(), addr);
      break;
    }
    case T_METADATA:
      // We get here to store a method pointer to the stack to pass to
      // a dtrace runtime call. This can't work on 64 bit with
      // compressed klass ptrs: T_METADATA can be a compressed klass
      // ptr or a 64 bit method pointer.
      ShouldNotReachHere();
//      __ ldr(dest->as_register(), as_Address(from_addr));
      break;

    case T_LONG: {
      Address addr = as_Address_lo(from_addr, Address::IDT_LONG);
      null_check_here = code_offset();
      null_check_here += __ ldrd(dest->as_register_lo(), dest->as_register_hi(), addr);
      break;
    }

    case T_BYTE: {
      Address addr =  as_Address(from_addr, Address::IDT_BYTE);
      null_check_here = code_offset();
      __ ldrsb(dest->as_register(), addr);
      break;
    }
    case T_BOOLEAN: {
      Address addr = as_Address(from_addr, Address::IDT_BOOLEAN);
      null_check_here = code_offset();
      __ ldrb(dest->as_register(), addr);
      break;
    }

    case T_CHAR: {
      Address addr = as_Address(from_addr, Address::IDT_CHAR);
      null_check_here = code_offset();
      __ ldrh(dest->as_register(), addr);
      break;
    }
    case T_SHORT: {
      Address addr = as_Address(from_addr, Address::IDT_SHORT);
      null_check_here = code_offset();
      __ ldrsh(dest->as_register(), addr);
      break;
    }

    default:
      ShouldNotReachHere();
  }

  if (type == T_ARRAY || type == T_OBJECT) {
    __ verify_oop(dest->as_register());
  }

  if (info != NULL) {
    add_debug_info_for_null_check(null_check_here, info);
  }
}

void LIR_Assembler::prefetchr(LIR_Opr src) {
  Unimplemented();
}

void LIR_Assembler::prefetchw(LIR_Opr src) {
  Unimplemented();
}

int LIR_Assembler::array_element_size(BasicType type) const {
  int elem_size = type2aelembytes(type);
  return exact_log2(elem_size);
}

void LIR_Assembler::emit_op3(LIR_Op3* op) {
  Register Rdividend = op->in_opr1()->as_register();
  Register Rdivisor  = op->in_opr2()->as_register();
  Register Rscratch  = op->in_opr3()->as_register();
  Register Rresult   = op->result_opr()->as_register();
  int divisor = -1;

  /*
  TODO: For some reason, using the Rscratch that gets passed in is
  not possible because the register allocator does not see the tmp reg
  as used, and assignes it the same register as Rdividend. We use rscratch1
   instead.

  assert(Rdividend != Rscratch, "");
  assert(Rdivisor  != Rscratch, "");
  */

  if (Rdivisor == noreg && is_power_of_2(divisor)) {
    // convert division by a power of two into some shifts and logical operations
  }

  assert(op->code() == lir_irem || op->code() == lir_idiv, "should be irem or idiv");
  bool want_remainder = op->code() == lir_irem;

  __ divide(Rresult, Rdividend, Rdivisor, 32, want_remainder);
}

void LIR_Assembler::emit_opBranch(LIR_OpBranch* op) {
#ifdef ASSERT
  assert(op->block() == NULL || op->block()->label() == op->label(), "wrong label");
  if (op->block() != NULL)  _branch_target_blocks.append(op->block());
  if (op->ublock() != NULL) _branch_target_blocks.append(op->ublock());
#endif

  if (op->cond() == lir_cond_always) {
    if (op->info() != NULL) add_debug_info_for_branch(op->info());
    __ b(*(op->label()));
  } else {
    Assembler::Condition acond;
    if (op->code() == lir_cond_float_branch) {
      bool is_unordered = (op->ublock() == op->block());
      // Assembler::EQ does not permit unordered branches, so we add
      // another branch here.  Likewise, Assembler::NE does not permit
      // ordered branches.
      if (is_unordered && op->cond() == lir_cond_equal
          || !is_unordered && op->cond() == lir_cond_notEqual)
        __ b(*(op->ublock()->label()), Assembler::VS);
      switch(op->cond()) {
      case lir_cond_equal:        acond = Assembler::EQ; break;
      case lir_cond_notEqual:     acond = Assembler::NE; break;
      case lir_cond_less:         acond = (is_unordered ? Assembler::LT : Assembler::LO); break;
      case lir_cond_lessEqual:    acond = (is_unordered ? Assembler::LE : Assembler::LS); break;
      case lir_cond_greaterEqual: acond = (is_unordered ? Assembler::HS : Assembler::GE); break;
      case lir_cond_greater:      acond = (is_unordered ? Assembler::HI : Assembler::GT); break;
      default:                    ShouldNotReachHere();
      }
    } else {
      switch (op->cond()) {
        case lir_cond_equal:        acond = Assembler::EQ; break;
        case lir_cond_notEqual:     acond = Assembler::NE; break;
        case lir_cond_less:         acond = Assembler::LT; break;
        case lir_cond_greaterEqual: acond = Assembler::GE; break;
        case lir_cond_lessEqual:    acond = Assembler::LE; break;
        case lir_cond_greater:      acond = Assembler::GT; break;
        case lir_cond_belowEqual:   acond = Assembler::LS; break;
        case lir_cond_aboveEqual:   acond = Assembler::HS; break;
        default:                         ShouldNotReachHere();
      }
      if (op->type() == T_LONG) {
        // a special trick here to be able to effectively compare jlongs
        // for the lessEqual and greater conditions the jlong operands are swapped
        // during comparison and hence should use mirror condition in conditional
        // instruction
        // see LIR_Assembler::comp_op and LIR_Assembler::cmove
        switch (op->cond()) {
          case lir_cond_lessEqual: acond = Assembler::GE;  break;
          case lir_cond_greater: acond = Assembler::LT;    break;
        }
      }
    }
    __ b(*(op->label()), acond);
  }
}

FloatRegister LIR_Assembler::as_float_reg(LIR_Opr doubleReg) {
    assert(doubleReg->is_double_fpu(), "must be f64");
    return as_FloatRegister(doubleReg->fpu_regnrLo());
}

void LIR_Assembler::emit_opConvert(LIR_OpConvert* op) {
  LIR_Opr src  = op->in_opr();
  LIR_Opr dest = op->result_opr();

  switch (op->bytecode()) {
    case Bytecodes::_i2f:
      {
        __ vmov_f32(dest->as_float_reg(), src->as_register());
        __ vcvt_f32_s32(dest->as_float_reg(), dest->as_float_reg());
        break;
      }
    case Bytecodes::_i2d:
      {
        __ vmov_f32(as_float_reg(dest), src->as_register());
        __ vcvt_f64_s32(dest->as_double_reg(), as_float_reg(dest));
        break;
      }
    case Bytecodes::_f2d:
      {
        __ vcvt_f64_f32(dest->as_double_reg(), src->as_float_reg());
        break;
      }
    case Bytecodes::_d2f:
      {
        __ vcvt_f32_f64(dest->as_float_reg(), src->as_double_reg());
        break;
      }
    case Bytecodes::_i2c:
      {
        __ uxth(dest->as_register(), src->as_register());
        break;
      }
    case Bytecodes::_i2l:
      {
        const Register dst_hi = dest->as_register_hi();
        const Register dst_lo = dest->as_register_lo();
        const Register src_lo = as_reg(src);
        __ mov(dst_lo, src_lo);
        __ asr(dst_hi, src_lo, 31);
        break;
      }
    case Bytecodes::_i2s:
      {
        __ sxth(dest->as_register(), src->as_register());
        break;
      }
    case Bytecodes::_i2b:
      {
        __ sxtb(dest->as_register(), src->as_register());
        break;
      }
    case Bytecodes::_l2i:
      {
        assert(dest->is_single_cpu(), "must be single register");
        __ mov(dest->as_register(), src->as_register_lo());
        break;
      }
    case Bytecodes::_f2i:
      {
        __ vcvt_s32_f32(src->as_float_reg(), src->as_float_reg());
        __ vmov_f32(dest->as_register(), src->as_float_reg());
        break;
      }
    case Bytecodes::_d2i:
      {
        __ vcvt_s32_f64(as_float_reg(src), src->as_double_reg());
        __ vmov_f32(dest->as_register(), as_float_reg(src));
        break;
      }
    default: ShouldNotReachHere();
  }
}

void LIR_Assembler::emit_alloc_obj(LIR_OpAllocObj* op) {
  if (op->init_check()) {
    __ ldrb(rscratch1, Address(op->klass()->as_register(),
                               InstanceKlass::init_state_offset()));
    __ cmp(rscratch1, InstanceKlass::fully_initialized);
    add_debug_info_for_null_check_here(op->stub()->info());
    __ b(*op->stub()->entry(), Assembler::NE);
  }
  __ allocate_object(op->obj()->as_register(),
                     op->tmp1()->as_register(),
                     op->tmp2()->as_register(),
                     op->header_size(),
                     op->object_size(),
                     op->klass()->as_register(),
                     *op->stub()->entry());
  __ bind(*op->stub()->continuation());
}

void LIR_Assembler::emit_alloc_array(LIR_OpAllocArray* op) {
  Register len =  as_reg(op->len());

  if (UseSlowPath ||
      (!UseFastNewObjectArray && (op->type() == T_OBJECT || op->type() == T_ARRAY)) ||
      (!UseFastNewTypeArray   && (op->type() != T_OBJECT && op->type() != T_ARRAY))) {
    __ b(*op->stub()->entry());
  } else {
    Register tmp1 = op->tmp1()->as_register();
    Register tmp2 = op->tmp2()->as_register();
    Register tmp3 = op->tmp3()->as_register();
    if (len == tmp1) {
      tmp1 = tmp3;
    } else if (len == tmp2) {
      tmp2 = tmp3;
    } else if (len == tmp3) {
      // everything is ok
    } else {
      __ mov(tmp3, len);
    }
    __ allocate_array(op->obj()->as_register(),
                      len,
                      tmp1,
                      tmp2,
                      arrayOopDesc::header_size(op->type()),
                      array_element_size(op->type()),
                      op->klass()->as_register(),
                      *op->stub()->entry());
  }
  __ bind(*op->stub()->continuation());
}

void LIR_Assembler::type_profile_helper(Register mdo,
                                        ciMethodData *md, ciProfileData *data,
                                        Register recv, Label* update_done) {
  for (uint i = 0; i < ReceiverTypeData::row_limit(); i++) {
    Label next_test;
    // See if the receiver is receiver[n].
    __ lea(rscratch2, Address(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_offset(i))));
    __ ldr(rscratch1, Address(rscratch2));
    __ cmp(recv, rscratch1);
    __ b(next_test, Assembler::NE);
    Address data_addr(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_count_offset(i)));
    __ addptr(data_addr, DataLayout::counter_increment);
    __ b(*update_done);
    __ bind(next_test);
  }

  // Didn't find receiver; find next empty slot and fill it in
  for (uint i = 0; i < ReceiverTypeData::row_limit(); i++) {
    Label next_test;
    __ lea(rscratch2,
           Address(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_offset(i))));
    Address recv_addr(rscratch2);
    __ ldr(rscratch1, recv_addr);
    __ cbnz(rscratch1, next_test);
    __ str(recv, recv_addr);
    __ mov(rscratch1, DataLayout::counter_increment);
    __ lea(rscratch2, Address(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_count_offset(i))));
    __ str(rscratch1, Address(rscratch2));
    __ b(*update_done);
    __ bind(next_test);
  }
}

void LIR_Assembler::emit_typecheck_helper(LIR_OpTypeCheck *op, Label* success, Label* failure, Label* obj_is_null) {
  // we always need a stub for the failure case.
  CodeStub* stub = op->stub();
  Register obj = op->object()->as_register();
  Register k_RInfo = op->tmp1()->as_register();
  Register klass_RInfo = op->tmp2()->as_register();
  Register dst = op->result_opr()->as_register();
  ciKlass* k = op->klass();
  Register Rtmp1 = noreg;

  // check if it needs to be profiled
  ciMethodData* md;
  ciProfileData* data;

  if (op->should_profile()) {
    ciMethod* method = op->profiled_method();
    assert(method != NULL, "Should have method");
    int bci = op->profiled_bci();
    md = method->method_data_or_null();
    assert(md != NULL, "Sanity");
    data = md->bci_to_data(bci);
    assert(data != NULL,                "need data for type check");
    assert(data->is_ReceiverTypeData(), "need ReceiverTypeData for type check");
  }
  Label profile_cast_success, profile_cast_failure;
  Label *success_target = op->should_profile() ? &profile_cast_success : success;
  Label *failure_target = op->should_profile() ? &profile_cast_failure : failure;

  if (obj == k_RInfo) {
    k_RInfo = dst;
  } else if (obj == klass_RInfo) {
    klass_RInfo = dst;
  }
  if (k->is_loaded()) {
    select_different_registers(obj, dst, k_RInfo, klass_RInfo);
  } else {
    Rtmp1 = op->tmp3()->as_register();
    select_different_registers(obj, dst, k_RInfo, klass_RInfo, Rtmp1);
  }

  assert_different_registers(obj, k_RInfo, klass_RInfo);

    if (op->should_profile()) {
      Label not_null;
      __ cbnz(obj, not_null);
      // Object is null; update MDO and exit
      Register mdo  = klass_RInfo;
      __ mov_metadata(mdo, md->constant_encoding());
      Address data_addr
        = __ form_address(rscratch2, mdo,
                          md->byte_offset_of_slot(data, DataLayout::DataLayout::header_offset()),
                          LogBytesPerWord);
      int header_bits = DataLayout::flag_mask_to_header_mask(BitData::null_seen_byte_constant());
      __ ldr(rscratch1, data_addr);
      __ orr(rscratch1, rscratch1, header_bits);
      __ str(rscratch1, data_addr);
      __ b(*obj_is_null);
      __ bind(not_null);
    } else {
      __ cbz(obj, *obj_is_null);
    }

  if (!k->is_loaded()) {
    klass2reg_with_patching(k_RInfo, op->info_for_patch());
  } else {
    __ mov_metadata(k_RInfo, k->constant_encoding());
  }
  __ verify_oop(obj);

  if (op->fast_check()) {
    // get object class
    // not a safepoint as obj null check happens earlier
    __ load_klass(rscratch1, obj);
    __ cmp( rscratch1, k_RInfo);

    __ b(*failure_target, Assembler::NE);
    // successful cast, fall through to profile or jump
  } else {
    // get object class
    // not a safepoint as obj null check happens earlier
    __ load_klass(klass_RInfo, obj);
    if (k->is_loaded()) {
      // See if we get an immediate positive hit
      __ ldr(rscratch1, Address(klass_RInfo, long(k->super_check_offset())));
      __ cmp(k_RInfo, rscratch1);
      if ((juint)in_bytes(Klass::secondary_super_cache_offset()) != k->super_check_offset()) {
        __ b(*failure_target, Assembler::NE);
        // successful cast, fall through to profile or jump
      } else {
        // See if we get an immediate positive hit
        __ b(*success_target, Assembler::EQ);
        // check for self
        __ cmp(klass_RInfo, k_RInfo);
        __ b(*success_target, Assembler::EQ);

        __ push(klass_RInfo);
        __ push(k_RInfo);
        __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::slow_subtype_check_id)));
        __ ldr(klass_RInfo, Address(__ post(sp, 2 * wordSize)));

        // result is a boolean
        __ cbz(klass_RInfo, *failure_target);
        // successful cast, fall through to profile or jump
      }
    } else {
      // perform the fast part of the checking logic
      __ check_klass_subtype_fast_path(klass_RInfo, k_RInfo, Rtmp1, success_target, failure_target, NULL);
      // call out-of-line instance of __ check_klass_subtype_slow_path(...):
      __ push(klass_RInfo);
      __ push(k_RInfo);
      __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::slow_subtype_check_id)));
      __ ldr(k_RInfo, Address(__ post(sp, 2 * wordSize)));

      // result is a boolean
      __ cbz(k_RInfo, *failure_target);
      // successful cast, fall through to profile or jump
    }
  }
  if (op->should_profile()) {
    Register mdo  = klass_RInfo, recv = k_RInfo;
    __ bind(profile_cast_success);
    __ mov_metadata(mdo, md->constant_encoding());
    __ load_klass(recv, obj);
    Label update_done;
    type_profile_helper(mdo, md, data, recv, success);
    __ b(*success);

    __ bind(profile_cast_failure);
    __ mov_metadata(mdo, md->constant_encoding());
    Address counter_addr
      = __ form_address(rscratch2, mdo,
                        md->byte_offset_of_slot(data, CounterData::count_offset()),
                        LogBytesPerWord);
    __ ldr(rscratch1, counter_addr);
    __ sub(rscratch1, rscratch1, DataLayout::counter_increment);
    __ str(rscratch1, counter_addr);
    __ b(*failure);
  }
  __ b(*success);
}


void LIR_Assembler::emit_opTypeCheck(LIR_OpTypeCheck* op) {
  LIR_Code code = op->code();
  if (code == lir_store_check) {
    Register value = op->object()->as_register();
    Register array = op->array()->as_register();
    Register k_RInfo = op->tmp1()->as_register();
    Register klass_RInfo = op->tmp2()->as_register();
    Register Rtmp1 = op->tmp3()->as_register();

    CodeStub* stub = op->stub();

    // check if it needs to be profiled
    ciMethodData* md;
    ciProfileData* data;

    if (op->should_profile()) {
      ciMethod* method = op->profiled_method();
      assert(method != NULL, "Should have method");
      int bci = op->profiled_bci();
      md = method->method_data_or_null();
      assert(md != NULL, "Sanity");
      data = md->bci_to_data(bci);
      assert(data != NULL,                "need data for type check");
      assert(data->is_ReceiverTypeData(), "need ReceiverTypeData for type check");
    }
    Label profile_cast_success, profile_cast_failure, done;
    Label *success_target = op->should_profile() ? &profile_cast_success : &done;
    Label *failure_target = op->should_profile() ? &profile_cast_failure : stub->entry();

    if (op->should_profile()) {
      Label not_null;
      __ cbnz(value, not_null);
      // Object is null; update MDO and exit
      Register mdo  = klass_RInfo;
      __ mov_metadata(mdo, md->constant_encoding());
      Address data_addr
        = __ form_address(rscratch2, mdo,
                          md->byte_offset_of_slot(data, DataLayout::header_offset()),
                          LogBytesPerInt);
      int header_bits = DataLayout::flag_mask_to_header_mask(BitData::null_seen_byte_constant());
      __ ldr(rscratch1, data_addr);
      __ orr(rscratch1, rscratch1, header_bits);
      __ str(rscratch1, data_addr);
      __ b(done);
      __ bind(not_null);
    } else {
      __ cbz(value, done);
    }

    add_debug_info_for_null_check_here(op->info_for_exception());
    __ load_klass(k_RInfo, array);
    __ load_klass(klass_RInfo, value);

    // get instance klass (it's already uncompressed)
    __ ldr(k_RInfo, Address(k_RInfo, ObjArrayKlass::element_klass_offset()));
    // perform the fast part of the checking logic
    __ check_klass_subtype_fast_path(klass_RInfo, k_RInfo, Rtmp1, success_target, failure_target, NULL);
    // call out-of-line instance of __ check_klass_subtype_slow_path(...):
    __ push(klass_RInfo);
    __ push(k_RInfo);
    __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::slow_subtype_check_id)));
    __ ldr(k_RInfo, Address(__ post(sp, 2 * wordSize)));
    // result is a boolean
    __ cbz(k_RInfo, *failure_target);
    // fall through to the success case

    if (op->should_profile()) {
      Register mdo  = klass_RInfo, recv = k_RInfo;
      __ bind(profile_cast_success);
      __ mov_metadata(mdo, md->constant_encoding());
      __ load_klass(recv, value);
      type_profile_helper(mdo, md, data, recv, &done);
      __ b(done);

      __ bind(profile_cast_failure);
      __ mov_metadata(mdo, md->constant_encoding());
      Address counter_addr(mdo, md->byte_offset_of_slot(data, CounterData::count_offset()));
      __ lea(rscratch2, counter_addr);
      __ ldr(rscratch1, Address(rscratch2));
      __ sub(rscratch1, rscratch1, DataLayout::counter_increment);
      __ str(rscratch1, Address(rscratch2));
      __ b(*stub->entry());
    }

    __ bind(done);
  } else if (code == lir_checkcast) {
    Register obj = op->object()->as_register();
    Register dst = op->result_opr()->as_register();
    Label success;
    emit_typecheck_helper(op, &success, op->stub()->entry(), &success);
    __ bind(success);
    if (dst != obj) {
      __ mov(dst, obj);
    }
  } else if (code == lir_instanceof) {
    Register obj = op->object()->as_register();
    Register dst = op->result_opr()->as_register();
    Label success, failure, done;
    emit_typecheck_helper(op, &success, &failure, &failure);
    __ bind(failure);
    __ mov(dst, 0);
    __ b(done);
    __ bind(success);
    __ mov(dst, 1);
    __ bind(done);
  } else {
    ShouldNotReachHere();
  }
}

// TODO: reuse masm cmpxchgw
void LIR_Assembler::casw(Register addr, Register newval, Register cmpval, Register result) {
  assert(newval != cmpval, "must be different");
  Label retry_load, nope;
  // flush and load exclusive from the memory location
  // and fail if it is not what we expect
  __ bind(retry_load);
  __ ldrex(result, addr);
  __ cmp(result, cmpval);
  __ mov(result, 1, Assembler::NE);
  __ b(nope, Assembler::NE);
  // if we store+flush with no intervening write rscratch1 wil be zero
  __ strex(result, newval, addr);
  // retry so we only ever return after a load fails to compare
  // ensures we don't return a stale value after a failed write.
  __ cbnz(result, retry_load);
  __ membar(__ AnyAny);
  __ bind(nope);
}

void LIR_Assembler::casl(Register addr, Register newval_lo, Register newval_hi, Register cmpval_lo,  Register cmpval_hi,  Register tmp_lo, Register tmp_hi, Register result) {
  assert(newval_lo->successor() == newval_hi, "must be contiguous");
  assert(tmp_lo->successor() == tmp_hi, "must be contiguous");
  assert(tmp_lo->encoding_nocheck() % 2 == 0,  "Must be an even register");
  assert_different_registers(newval_lo, newval_hi, cmpval_lo,  cmpval_hi,  tmp_lo, tmp_hi);

  Label retry_load, nope;
  // flush and load exclusive from the memory location
  // and fail if it is not what we expect
  __ bind(retry_load);
  __ mov(result, 1);
  __ ldrexd(tmp_lo, addr);
  __ cmp(tmp_lo, cmpval_lo);
  __ b(nope, Assembler::NE);
  __ cmp(tmp_hi, cmpval_hi);
  __ b(nope, Assembler::NE);
  // if we store+flush with no intervening write rscratch1 wil be zero
  __ strexd(result, newval_lo, addr);
  // retry so we only ever return after a load fails to compare
  // ensures we don't return a stale value after a failed write.
  __ cbnz(result, retry_load);
  __ membar(__ AnyAny);
  __ bind(nope);
}


void LIR_Assembler::emit_compare_and_swap(LIR_OpCompareAndSwap* op) {
  Register addr = as_reg(op->addr());
  Register result = as_reg(op->result_opr());
  if (op->code() == lir_cas_obj || op->code() == lir_cas_int) {
    Register newval = as_reg(op->new_value());
    Register cmpval = as_reg(op->cmp_value());
    casw(addr, newval, cmpval, result);
  } else if (op->code() == lir_cas_long){
    Register newval_lo = op->new_value()->as_register_lo();
    Register newval_hi = op->new_value()->as_register_hi();
    Register cmpval_lo = op->cmp_value()->as_register_lo();
    Register cmpval_hi = op->cmp_value()->as_register_hi();
    Register tmp_lo = op->tmp1()->as_register_lo();
    Register tmp_hi = op->tmp1()->as_register_hi();
    casl(addr, newval_lo, newval_hi, cmpval_lo, cmpval_hi, tmp_lo, tmp_hi, result);
  } else {
      ShouldNotReachHere();
  }
}

static void patch_condition(address start_insn, address end_insn, Assembler::Condition cond) {
  for (uint32_t* insn_p = (uint32_t*) start_insn; (address) insn_p < end_insn; ++insn_p) {
    uint32_t insn = *insn_p;
    assert((insn >> 28) == Assembler::AL, "instructions in patch"
     " should allow conditional form and be in ALWAYS condition");
    *insn_p = (insn & 0x0fffffff) | (cond << 28);
  }
}

void LIR_Assembler::cmove(LIR_Condition condition, LIR_Opr opr1, LIR_Opr opr2, LIR_Opr result, BasicType type) {

  Assembler::Condition acond, ncond;
  switch (condition) {
  case lir_cond_equal:        acond = Assembler::EQ; ncond = Assembler::NE; break;
  case lir_cond_notEqual:     acond = Assembler::NE; ncond = Assembler::EQ; break;
  case lir_cond_less:         acond = Assembler::LT; ncond = Assembler::GE; break;
  case lir_cond_greaterEqual: acond = Assembler::GE; ncond = Assembler::LT; break;
  case lir_cond_lessEqual:    acond = Assembler::LE; ncond = Assembler::GT; break;
  case lir_cond_greater:      acond = Assembler::GT; ncond = Assembler::LE; break;
  case lir_cond_belowEqual:   Unimplemented(); break;
  case lir_cond_aboveEqual:   Unimplemented(); break;
  default:                    ShouldNotReachHere();
  }
  if (type == T_LONG) {
      // for the lessEqual and greater conditions the jlong operands are swapped
      // during comparison and hence should use mirror condition in conditional
      // instruction. see comp_op())
    switch (condition) {
    case lir_cond_lessEqual:    acond = Assembler::GE; ncond = Assembler::LT; break;
    case lir_cond_greater:      acond = Assembler::LT; ncond = Assembler::GE; break;
    }
  }

  address true_instrs = __ pc();
  if (opr1->is_cpu_register()) {
    reg2reg(opr1, result);
  } else if (opr1->is_stack()) {
    stack2reg(opr1, result, result->type());
  } else if (opr1->is_constant()) {
    const2reg(opr1, result, lir_patch_none, NULL);
  } else {
    ShouldNotReachHere();
  }
  patch_condition(true_instrs, __ pc(), acond);

  address false_instrs = __ pc();
  if (opr2->is_cpu_register()) {
    reg2reg(opr2, result);
  } else if (opr2->is_stack()) {
    stack2reg(opr2, result, result->type());
  } else if (opr2->is_constant()) {
    const2reg(opr2, result, lir_patch_none, NULL);
  } else {
    ShouldNotReachHere();
  }
  patch_condition(false_instrs, __ pc(), ncond);
}

void LIR_Assembler::arith_op(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr dest, CodeEmitInfo* info, bool pop_fpu_stack) {
  assert(info == NULL, "should never be used, idiv/irem and ldiv/lrem not handled by this method");

  if (left->is_single_cpu()) {
    Register lreg = left->as_register();
    Register dreg = as_reg(dest);

    if (right->is_single_cpu()) {
      // cpu register - cpu register

      assert((left->type() == T_INT || left->type() == T_OBJECT)
             && right->type() == T_INT
             && dest->type() == T_INT,
             "should be");
      Register rreg = right->as_register();
      switch (code) {
      case lir_add: __ add (dest->as_register(), lreg, rreg); break;
      case lir_sub: __ sub (dest->as_register(), lreg, rreg); break;
      case lir_mul: __ mul (dest->as_register(), lreg, rreg); break;
      default:      ShouldNotReachHere();
      }

    } else if (right->is_double_cpu()) {
      ShouldNotReachHere(); // for obj+long op the generator casts long to int before invoking add
    } else if (right->is_constant()) {
      // cpu register - constant
      jint c = right->as_constant_ptr()->as_jint();

      assert(code == lir_add || code == lir_sub, "mismatched arithmetic op");
      if (c == 0 && dreg == lreg) {
        COMMENT("effective nop elided");
        return;
      }

      if (Assembler::operand_valid_for_add_sub_immediate(c)) {
        switch (code) {
        case lir_add: __ add(dreg, lreg, c); break;
        case lir_sub: __ sub(dreg, lreg, c); break;
        default: ShouldNotReachHere();
        }
      } else {
        __ mov(rscratch1, c);
        switch (code) {
        case lir_add: __ add(dreg, lreg, rscratch1); break;
        case lir_sub: __ sub(dreg, lreg, rscratch1); break;
        default: ShouldNotReachHere();
        }
      }
    } else {
      ShouldNotReachHere();
    }

  } else if (left->is_double_cpu()) {
    Register lreg_lo = left->as_register_lo();
    Register lreg_hi = left->as_register_hi();

    if (right->is_double_cpu()) {
      // cpu register - cpu register
      Register rreg_lo = right->as_register_lo();
      Register rreg_hi = right->as_register_hi();
      Register dreg_lo = dest->as_register_lo();
      Register dreg_hi = dest->as_register_hi();
      if (code == lir_add || code == lir_sub) {
        check_register_collision(dreg_lo, &lreg_hi, &rreg_hi);
      }
      switch (code) {
      case lir_add: __ adds (dreg_lo, lreg_lo, rreg_lo);
                    __ adc (dreg_hi, lreg_hi, rreg_hi); break;
      case lir_sub: __ subs (dreg_lo, lreg_lo, rreg_lo);
                    __ sbc (dreg_hi, lreg_hi, rreg_hi); break;
      case lir_mul: __ mult_long (dreg_lo, dreg_hi,
                        lreg_lo, lreg_hi, rreg_lo, rreg_hi); break;
      default:
        ShouldNotReachHere();
      }

    } else if (right->is_constant()) {
      const jint c_lo = right->as_constant_ptr()->as_jint_lo_bits();
      const jint c_hi = right->as_constant_ptr()->as_jint_hi_bits();
      const Register dreg_lo = dest->as_register_lo();
      const Register dreg_hi = dest->as_register_hi();
      assert(code == lir_add || code == lir_sub, "mismatched arithmetic op");
      if (c_lo == 0 && c_hi == 0 && dreg_lo == lreg_lo && dreg_hi == lreg_hi) {
        COMMENT("effective nop elided");
        return;
      }
      check_register_collision(dreg_lo, &lreg_hi, NULL, rscratch2);
      switch (code) {
        case lir_add:
          if (Assembler::operand_valid_for_add_sub_immediate(c_lo))
            __ adds(dreg_lo, lreg_lo, c_lo);
          else {
            __ mov(rscratch1, c_lo);
            __ adds(dreg_lo, lreg_lo, rscratch1);
          }
          if (Assembler::operand_valid_for_add_sub_immediate(c_hi))
            __ adc(dreg_hi, lreg_hi, c_hi);
          else {
            __ mov(rscratch1, c_hi);
            __ adc(dreg_lo, lreg_hi, rscratch1);
          }
          break;
        case lir_sub:
          if (Assembler::operand_valid_for_add_sub_immediate(c_lo))
            __ subs(dreg_lo, lreg_lo, c_lo);
          else {
            __ mov(rscratch1, c_lo);
            __ subs(dreg_lo, lreg_lo, rscratch1);
          }
          if (Assembler::operand_valid_for_add_sub_immediate(c_hi))
            __ sbc(dreg_hi, lreg_hi, c_hi);
          else {
            __ mov(rscratch1, c_hi);
            __ sbc(dreg_hi, lreg_hi, rscratch1);
          }
          break;
        default:
          ShouldNotReachHere();
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (left->is_single_fpu()) {
    assert(right->is_single_fpu(), "right hand side of float arithmetics needs to be float register");
    switch (code) {
    case lir_add: __ vadd_f32 (dest->as_float_reg(), left->as_float_reg(), right->as_float_reg()); break;
    case lir_sub: __ vsub_f32 (dest->as_float_reg(), left->as_float_reg(), right->as_float_reg()); break;
    case lir_mul: __ vmul_f32 (dest->as_float_reg(), left->as_float_reg(), right->as_float_reg()); break;
    case lir_div: __ vdiv_f32 (dest->as_float_reg(), left->as_float_reg(), right->as_float_reg()); break;
    default:
      ShouldNotReachHere();
    }
  } else if (left->is_double_fpu()) {
    if (right->is_double_fpu()) {
      // cpu register - cpu register
      switch (code) {
      case lir_add: __ vadd_f64 (dest->as_double_reg(), left->as_double_reg(), right->as_double_reg()); break;
      case lir_sub: __ vsub_f64 (dest->as_double_reg(), left->as_double_reg(), right->as_double_reg()); break;
      case lir_mul: __ vmul_f64 (dest->as_double_reg(), left->as_double_reg(), right->as_double_reg()); break;
      case lir_div: __ vdiv_f64 (dest->as_double_reg(), left->as_double_reg(), right->as_double_reg()); break;
      default:
        ShouldNotReachHere();
      }
    } else {
      if (right->is_constant()) {
        ShouldNotReachHere();
      }
      ShouldNotReachHere();
    }
  } else if (left->is_single_stack() || left->is_address()) {
    assert(left == dest, "left and dest must be equal");
    ShouldNotReachHere();
  } else {
    ShouldNotReachHere();
  }
}

void LIR_Assembler::intrinsic_op(LIR_Code code, LIR_Opr value, LIR_Opr unused, LIR_Opr dest, LIR_Op* op) {
  switch(code) {
  case lir_abs : __ vabs_f64(dest->as_double_reg(), value->as_double_reg()); break;
  case lir_sqrt: __ vsqrt_f64(dest->as_double_reg(), value->as_double_reg()); break;
  default      : ShouldNotReachHere();
  }
}

void LIR_Assembler::logic_op(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr dst) {

  assert(left->is_single_cpu() || left->is_double_cpu(), "expect single or double register");
  Register Rleft = left->is_single_cpu() ? left->as_register() :
                                           left->as_register_lo();
   if (dst->is_single_cpu()) {
     Register Rdst = dst->as_register();
     if (right->is_constant()) {
       switch (code) {
         case lir_logic_and: __ andr (Rdst, Rleft, right->as_jint()); break;
         case lir_logic_or:  __ orr (Rdst, Rleft, right->as_jint()); break;
         case lir_logic_xor: __ eor (Rdst, Rleft, right->as_jint()); break;
         default: ShouldNotReachHere(); break;
       }
     } else {
       Register Rright = right->is_single_cpu() ? right->as_register() :
                                                  right->as_register_lo();
       switch (code) {
         case lir_logic_and: __ andr (Rdst, Rleft, Rright); break;
         case lir_logic_or:  __ orr (Rdst, Rleft, Rright); break;
         case lir_logic_xor: __ eor (Rdst, Rleft, Rright); break;
         default: ShouldNotReachHere(); break;
       }
     }
   } else {
     assert(dst->is_double_cpu(), "mismatched logic op operand size");
     const Register Rdst_lo = dst->as_register_lo();
     const Register Rdst_hi = dst->as_register_hi();
     Register Rleft_hi = left->as_register_hi();
     if (right->is_constant()) {
       // LIR generator enforces jlong constants to be valid_immediate12
       // so we know they fit into 32-bit int
       switch (code) {
         case lir_logic_and: __ andr (Rdst_lo, Rleft, (int)right->as_jlong()); break;
         case lir_logic_or:  __ orr (Rdst_lo, Rleft, (int)right->as_jlong()); break;
         case lir_logic_xor: __ eor (Rdst_lo, Rleft, (int)right->as_jlong()); break;
         default: ShouldNotReachHere(); break;
       }
     } else {
       assert(right->is_double_cpu(), "mismatched logic op operand size");
       Register Rright_lo = right->as_register_lo();
       Register Rright_hi = right->as_register_hi();
       check_register_collision(Rdst_lo, &Rleft_hi, &Rright_hi);
       switch (code) {
         case lir_logic_and: __ andr (Rdst_lo, Rleft, Rright_lo);
                             __ andr (Rdst_hi, Rleft_hi, Rright_hi); break;
         case lir_logic_or:  __ orr (Rdst_lo, Rleft, Rright_lo);
                             __ orr (Rdst_hi, Rleft_hi, Rright_hi); break;
         case lir_logic_xor: __ eor (Rdst_lo, Rleft, Rright_lo);
                             __ eor (Rdst_hi, Rleft_hi, Rright_hi); break;
         default: ShouldNotReachHere(); break;
       }
     }
   }
}


void LIR_Assembler::arithmetic_idiv(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr temp, LIR_Opr result, CodeEmitInfo* info) { Unimplemented(); }

void LIR_Assembler::comp_op(LIR_Condition condition, LIR_Opr opr1, LIR_Opr opr2, LIR_Op2* op) {
  if (opr1->is_single_cpu()) {
    Register reg1 = as_reg(opr1);
    if (opr2->is_single_cpu()) {
      // cpu register - cpu register
      Register reg2 = opr2->as_register();
      __ cmp(reg1, reg2);
    } else if (opr2->is_constant()) {
      LIR_Const* c = opr2->as_constant_ptr();
      if (c->type() == T_INT) {
        __ cmp(reg1, c->as_jint(), rscratch1, Assembler::C_DFLT);
      } else if (c->type() == T_OBJECT || c->type() == T_ARRAY) {
        jobject o = c->as_jobject();
        if (o == NULL) {
          __ cmp(reg1, (int32_t)NULL_WORD);
        } else {
          __ movoop(rscratch1, o);
          __ cmpptr(reg1, rscratch1);
        }
      } else {
        fatal(err_msg("unexpected type: %s", basictype_to_str(c->type())));
      }
    } else if (opr2->is_address()) {
      __ ldr(rscratch2, as_Address(opr2->as_address_ptr(), rscratch1, Address::IDT_INT));
      __ cmp(reg1, rscratch2);
    } else {
      ShouldNotReachHere();
    }

  } else if (opr1->is_double_cpu()) {
    assert(opr1->type() == T_LONG, "expect jlong type");
    assert(opr2->type() == T_LONG, "expect jlong type");
    Register xlo = opr1->as_register_lo();
    Register xhi = opr1->as_register_hi();
    if (opr2->is_double_cpu()) {
      // cpu register - cpu register
      Register ylo = opr2->as_register_lo();
      Register yhi = opr2->as_register_hi();
      switch (condition) {
        case lir_cond_equal:
        case lir_cond_notEqual:
        case lir_cond_belowEqual:
        case lir_cond_aboveEqual:
          // these need APSR.ZC. the ops below set them correctly (but not APSR.V)
          __ cmp(xhi, yhi);
          __ cmp(xlo, ylo, Assembler::EQ);
          break;
        case lir_cond_less:
        case lir_cond_greaterEqual:
          __ cmp(xlo, ylo);
          __ sbcs(rscratch1, xhi, yhi);
          break;
        case lir_cond_lessEqual:
        case lir_cond_greater:
          // here goes a trick: the below operations do not produce the valid
          // value for the APSR.Z flag and there is no easy way to set it. so
          // we exchange the order of arguments in the comparison and use the
          // opposite condition in the conditional statement that follows.
          // GE should be used instead of LE and LT in place of GT.
          // the comp_op() could only be followed by: emit_opBranch(), cmove() and
          // emit_assert(). these are patched to be aware of this trick
          __ cmp(ylo, xlo);
          __ sbcs(rscratch1, yhi, xhi);
          break;
      }
    } else if (opr2->is_constant()) {
      jlong y = opr2->as_jlong();
      assert(Assembler::operand_valid_for_add_sub_immediate(y), "immediate overflow");
      switch (condition) {
        case lir_cond_equal:
        case lir_cond_notEqual:
        case lir_cond_belowEqual:
        case lir_cond_aboveEqual:
          __ cmp(xhi, (int)(y >> 32));
          __ cmp(xlo, (int)y, Assembler::EQ);
          break;
        case lir_cond_less:
        case lir_cond_greaterEqual:
          __ cmp(xlo, (int)y);
          __ sbcs(rscratch1, xhi, (int)(y >> 32));
          break;
        case lir_cond_lessEqual:
        case lir_cond_greater:
          __ rsbs(rscratch1, xlo, (int)y);
          __ rscs(rscratch1, xhi, (int)(y >> 32));
          break;
      }
    } else {
      ShouldNotReachHere();
    }
  } else if (opr1->is_single_fpu()) {
    FloatRegister reg1 = opr1->as_float_reg();
    assert(opr2->is_single_fpu(), "expect single float register");
    FloatRegister reg2 = opr2->as_float_reg();
    __ vcmp_f32(reg1, reg2);
    __ get_fpsr();
  } else if (opr1->is_double_fpu()) {
    FloatRegister reg1 = opr1->as_double_reg();
    assert(opr2->is_double_fpu(), "expect double float register");
    FloatRegister reg2 = opr2->as_double_reg();
    __ vcmp_f64(reg1, reg2);
    __ get_fpsr();
  } else {
    ShouldNotReachHere();
  }
}

void LIR_Assembler::comp_fl2i(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr dst, LIR_Op2* op){
  if (code == lir_cmp_fd2i || code == lir_ucmp_fd2i) {
    bool is_unordered_less = (code == lir_ucmp_fd2i);
    if (left->is_single_fpu()) {
      __ float_cmp(true, is_unordered_less ? -1 : 1, left->as_float_reg(), right->as_float_reg(), dst->as_register());
    } else if (left->is_double_fpu()) {
      __ float_cmp(false, is_unordered_less ? -1 : 1, left->as_double_reg(), right->as_double_reg(), dst->as_register());
    } else {
      ShouldNotReachHere();
    }
  } else if (code == lir_cmp_l2i) {
    __ mov(dst->as_register(), 1);
    __ subs(rscratch1, left->as_register_lo(), right->as_register_lo());
    __ sbc(rscratch2, left->as_register_hi(), right->as_register_hi());
    __ orrs(rscratch1, rscratch1, rscratch2);
    __ mov(dst->as_register(), -1, Assembler::MI);
    __ mov(dst->as_register(), 0, Assembler::EQ);
  } else {
    ShouldNotReachHere();
  }
}


void LIR_Assembler::align_call(LIR_Code code) {  }


void LIR_Assembler::call(LIR_OpJavaCall* op, relocInfo::relocType rtype) {
  __ trampoline_call(Address(op->addr(), rtype));
  add_call_info(code_offset(), op->info());
}


void LIR_Assembler::ic_call(LIR_OpJavaCall* op) {
  __ ic_call(op->addr());
  add_call_info(code_offset(), op->info());
}


/* Currently, vtable-dispatch is only enabled for sparc platforms */
void LIR_Assembler::vtable_call(LIR_OpJavaCall* op) {
  ShouldNotReachHere();
}


void LIR_Assembler::emit_static_call_stub() {
  address call_pc = __ pc();
  address stub = __ start_a_stub(call_stub_size);
  if (stub == NULL) {
    bailout("static call stub overflow");
    return;
  }

  int start = __ offset();

  __ relocate(static_stub_Relocation::spec(call_pc));
  __ mov_metadata(rmethod, (Metadata*)NULL);
  __ movptr(rscratch1, 0);
  __ b(rscratch1);

  assert(__ offset() - start <= call_stub_size, "stub too big");
  __ end_a_stub();
}


void LIR_Assembler::throw_op(LIR_Opr exceptionPC, LIR_Opr exceptionOop, CodeEmitInfo* info) {
  assert(exceptionOop->as_register() == r0, "must match");
  assert(exceptionPC->as_register() == r3, "must match");

  // exception object is not added to oop map by LinearScan
  // (LinearScan assumes that no oops are in fixed registers)
  info->add_register_oop(exceptionOop);
  Runtime1::StubID unwind_id;

  // get current pc information
  // pc is only needed if the method has an exception handler, the unwind code does not need it.
  int pc_for_athrow_offset = __ offset();
  __ add(exceptionPC->as_register(), r15_pc, -8);
  add_call_info(pc_for_athrow_offset, info); // for exception handler

  __ verify_not_null_oop(r0);
  // search an exception handler (r0: exception oop, r3: throwing pc)
  if (compilation()->has_fpu_code()) {
    unwind_id = Runtime1::handle_exception_id;
  } else {
    unwind_id = Runtime1::handle_exception_nofpu_id;
  }
  __ far_call(RuntimeAddress(Runtime1::entry_for(unwind_id)));

  // FIXME: enough room for two byte trap   ????
  __ nop();
}


void LIR_Assembler::unwind_op(LIR_Opr exceptionOop) {
  assert(exceptionOop->as_register() == r0, "must match");

  __ b(_unwind_handler_entry);
}


void LIR_Assembler::shift_op(LIR_Code code, LIR_Opr left, LIR_Opr count, LIR_Opr dest, LIR_Opr tmp) {
  Register lreg = left->is_single_cpu() ? left->as_register() : left->as_register_lo();
  Register dreg = dest->is_single_cpu() ? dest->as_register() : dest->as_register_lo();

  switch (left->type()) {
    case T_INT:
    case T_ADDRESS:
    case T_OBJECT:
      __ andr(rscratch1, count->as_register(), 0x1f);
      switch (code) {
        case lir_shl: __ lsl(dreg, lreg, rscratch1); break;
        case lir_shr: __ asr(dreg, lreg, rscratch1); break;
        case lir_ushr: __ lsr(dreg, lreg, rscratch1); break;
        default:
          ShouldNotReachHere();
          break;
      }
      break;
    case T_LONG:
    {
      Register lreg_hi = left->as_register_hi();
      Register dreg_hi = dest->as_register_hi();
      const int word_bits = 8 * wordSize;

      if (code == lir_shl || code == lir_ushr) {
        check_register_collision(dreg, &lreg, &lreg_hi, rscratch1);
        check_register_collision(dreg_hi, &lreg, &lreg_hi, rscratch2);
      }

      switch (code) {
        case lir_shl:
          __ andr(dreg, count->as_register(), 0x3f);
          __ sub(dreg_hi, dreg, word_bits);
          __ lsl(lreg_hi, lreg_hi, dreg);
          __ orr(lreg_hi, lreg_hi, lreg, lsl(dreg_hi));
          __ rsb(dreg_hi, dreg, word_bits);
          __ orr(dreg_hi, lreg_hi, lreg, lsr(dreg_hi));
          __ lsl(dreg, lreg, dreg);
          break;
        case lir_shr: {
          __ mov(rscratch2, lreg_hi);
          __ andr(rscratch1, count->as_register(), 0x3f);
          __ lsr(dreg, lreg, rscratch1);
          __ rsb(dreg_hi, rscratch1, word_bits);
          __ orr(dreg, dreg, rscratch2, lsl(dreg_hi));
          __ asr(dreg_hi, rscratch2, rscratch1);
          __ subs(rscratch1, rscratch1, word_bits);
          __ mov(dreg, rscratch2, asr(rscratch1), Assembler::GT);
        }
          break;
        case lir_ushr:
          __ andr(dreg, count->as_register(), 0x3f);
          __ lsr(lreg, lreg, dreg);
          __ rsb(dreg_hi, dreg, word_bits);
          __ orr(lreg, lreg, lreg_hi, lsl(dreg_hi));
          __ lsr(dreg_hi, lreg_hi, dreg);
          __ sub(dreg, dreg, word_bits);
          __ orr(dreg, lreg, lreg_hi, lsr(dreg));
          break;
        default:
          ShouldNotReachHere();
          break;
      }
    }
      break;
    default:
      ShouldNotReachHere();
      break;
  }
}


void LIR_Assembler::shift_op(LIR_Code code, LIR_Opr left, jint count, LIR_Opr dest) {
  Register dreg = dest->is_single_cpu() ? dest->as_register() : dest->as_register_lo();
  Register lreg = left->is_single_cpu() ? left->as_register() : left->as_register_lo();

  if (!count) {
      reg2reg(left, dest);
      return;
   }

  switch (left->type()) {
    case T_INT:
    case T_ADDRESS:
    case T_OBJECT:
      switch (code) {
        case lir_shl: __ lsl(dreg, lreg, count); break;
        case lir_shr: __ asr(dreg, lreg, count); break;
        case lir_ushr: __ lsr(dreg, lreg, count); break;
        default:
          ShouldNotReachHere();
          break;
      }
      break;
    case T_LONG: {
      Register lreg_hi = left->as_register_hi();
      Register dreg_hi = dest->as_register_hi();
      const int word_bits = 8 * wordSize;

      switch (code) {
        case lir_shl:
          if (count >= word_bits) {
            __ lsl(dreg_hi, lreg, count - word_bits);
            __ mov(dreg, 0);
          } else {
            check_register_collision(dreg_hi, &lreg);
            __ lsl(dreg_hi, lreg_hi, count);
            __ orr(dreg_hi, dreg_hi, lreg, lsr(word_bits - count));
            __ lsl(dreg, lreg, count);
          }
          break;
        case lir_shr:
          if (count >= word_bits) {
            __ asr(dreg, lreg_hi, count - word_bits);
            __ asr(dreg_hi, lreg_hi, word_bits);
          } else {
            check_register_collision(dreg, &lreg_hi);
            __ lsr(dreg, lreg, count);
            __ orr(dreg, dreg, lreg_hi, lsl(word_bits - count));
            __ asr(dreg_hi, lreg_hi, count);
          }
          break;
        case lir_ushr:
          if (count >= word_bits) {
            __ lsr(dreg, lreg_hi, count - word_bits);
            __ mov(dreg_hi, 0);
          } else {
            check_register_collision(dreg, &lreg_hi);
            __ lsr(dreg, lreg, count);
            __ orr(dreg, dreg, lreg_hi, lsl(word_bits - count));
            __ lsr(dreg_hi, lreg_hi, count);
          }
          break;
        default:
          ShouldNotReachHere();
          break;
      }
    }
      break;
    default:
      ShouldNotReachHere();
      break;
  }
}


void LIR_Assembler::store_parameter(Register r, int offset_from_sp_in_words) {
  assert(offset_from_sp_in_words >= 0, "invalid offset from sp");
  int offset_from_sp_in_bytes = offset_from_sp_in_words * BytesPerWord;
  assert(offset_from_sp_in_bytes < frame_map()->reserved_argument_area_size(), "invalid offset");
  __ str (r, Address(sp, offset_from_sp_in_bytes));
}


void LIR_Assembler::store_parameter(jint c,     int offset_from_sp_in_words) {
  assert(offset_from_sp_in_words >= 0, "invalid offset from sp");
  int offset_from_sp_in_bytes = offset_from_sp_in_words * BytesPerWord;
  assert(offset_from_sp_in_bytes < frame_map()->reserved_argument_area_size(), "invalid offset");
  __ mov (rscratch1, c);
  __ str (rscratch1, Address(sp, offset_from_sp_in_bytes));
}

// This code replaces a call to arraycopy; no exception may
// be thrown in this code, they must be thrown in the System.arraycopy
// activation frame; we could save some checks if this would not be the case
void LIR_Assembler::emit_arraycopy(LIR_OpArrayCopy* op) {
  ciArrayKlass* default_type = op->expected_type();
  Register src = op->src()->as_register();
  Register dst = op->dst()->as_register();
  Register src_pos = op->src_pos()->as_register();
  Register dst_pos = op->dst_pos()->as_register();
  Register length  = op->length()->as_register();
  Register tmp = op->tmp()->as_register();
  // due to limited number of registers available and in order to simplify
  // the code we fix the registers used by the arguments to this intrinsic.
  // see the comment in LIRGenerator::do_ArrayCopy
  assert(src == j_rarg0, "assumed by implementation");
  assert(src_pos == j_rarg1, "assumed by implementation");
  assert(dst == j_rarg2, "assumed by implementation");
  assert(dst_pos == j_rarg3, "assumed by implementation");
  assert(length == r4, "assumed by implementation");
  assert(tmp == r5, "assumed by implementation");

  CodeStub* stub = op->stub();
  int flags = op->flags();
  BasicType basic_type = default_type != NULL ? default_type->element_type()->basic_type() : T_ILLEGAL;
  if (basic_type == T_ARRAY) basic_type = T_OBJECT;

  // if we don't know anything, just go through the generic arraycopy
  if (default_type == NULL // || basic_type == T_OBJECT
      ) {
    Label done;
    assert(src == r1 && src_pos == r2, "mismatch in calling convention");

    // Save the arguments in case the generic arraycopy fails and we
    // have to fall back to the JNI stub
    __ str(dst,     Address(sp, 0*BytesPerWord));
    __ str(dst_pos, Address(sp, 1*BytesPerWord));
    __ str(length,  Address(sp, 2*BytesPerWord));
    __ str(src_pos, Address(sp, 3*BytesPerWord));
    __ str(src,     Address(sp, 4*BytesPerWord));

    address C_entry = CAST_FROM_FN_PTR(address, Runtime1::arraycopy);
    address copyfunc_addr = StubRoutines::generic_arraycopy();

    // The arguments are in java calling convention so we shift them
    // to C convention
    assert(c_rarg0 == j_rarg3, "assumed in the code below");
    __ mov(rscratch1, c_rarg0);
    assert_different_registers(c_rarg0, j_rarg1, j_rarg2);
    __ mov(c_rarg0, j_rarg0);
    assert_different_registers(c_rarg1, j_rarg2, j_rarg3);
    __ mov(c_rarg1, j_rarg1);
    assert_different_registers(c_rarg2, j_rarg3);
    __ mov(c_rarg2, j_rarg2);
    __ mov(c_rarg3, rscratch1);
    __ str(length, Address(sp)); // the below C function follows C calling convention,
                                 // so should put 5th arg to stack

    if (copyfunc_addr == NULL) { // Use C version if stub was not generated
      __ mov(rscratch1, RuntimeAddress(C_entry));
      __ bl(rscratch1);
    } else {
#ifndef PRODUCT
      if (PrintC1Statistics) {
        __ increment(ExternalAddress((address)&Runtime1::_generic_arraycopystub_cnt));
      }
#endif
      __ far_call(RuntimeAddress(copyfunc_addr));
    }

    __ cbz(r0, *stub->continuation());

    // Reload values from the stack so they are where the stub
    // expects them.
    __ ldr(dst,     Address(sp, 0*BytesPerWord));
    __ ldr(dst_pos, Address(sp, 1*BytesPerWord));
    __ ldr(length,  Address(sp, 2*BytesPerWord));
    __ ldr(src_pos, Address(sp, 3*BytesPerWord));
    __ ldr(src,     Address(sp, 4*BytesPerWord));

    if (copyfunc_addr != NULL) {
      // r0 is -1^K where K == partial copied count
      __ inv(rscratch1, r0);
      // adjust length down and src/end pos up by partial copied count
      __ sub(length, length, rscratch1);
      __ add(src_pos, src_pos, rscratch1);
      __ add(dst_pos, dst_pos, rscratch1);
    }
    __ b(*stub->entry());

    __ bind(*stub->continuation());
    return;
  }

  assert(default_type != NULL && default_type->is_array_klass() && default_type->is_loaded(), "must be true at this point");

  int elem_size = type2aelembytes(basic_type);
  int scale = exact_log2(elem_size);

  Address src_length_addr = Address(src, arrayOopDesc::length_offset_in_bytes());
  Address dst_length_addr = Address(dst, arrayOopDesc::length_offset_in_bytes());
  Address src_klass_addr = Address(src, oopDesc::klass_offset_in_bytes());
  Address dst_klass_addr = Address(dst, oopDesc::klass_offset_in_bytes());

  // test for NULL
  if (flags & LIR_OpArrayCopy::src_null_check) {
    __ cbz(src, *stub->entry());
  }
  if (flags & LIR_OpArrayCopy::dst_null_check) {
    __ cbz(dst, *stub->entry());
  }

  // check if negative
  if (flags & LIR_OpArrayCopy::src_pos_positive_check) {
    __ cmp(src_pos, 0);
    __ b(*stub->entry(), Assembler::LT);
  }
  if (flags & LIR_OpArrayCopy::dst_pos_positive_check) {
    __ cmp(dst_pos, 0);
    __ b(*stub->entry(), Assembler::LT);
  }

  if (flags & LIR_OpArrayCopy::length_positive_check) {
    __ cmp(length, 0);
    __ b(*stub->entry(), Assembler::LT);
  }

  if (flags & LIR_OpArrayCopy::src_range_check) {
    __ add(tmp, src_pos, length);
    __ ldr(rscratch1, src_length_addr);
    __ cmp(tmp, rscratch1);
    __ b(*stub->entry(), Assembler::HI);
  }
  if (flags & LIR_OpArrayCopy::dst_range_check) {
    __ add(tmp, dst_pos, length);
    __ ldr(rscratch1, dst_length_addr);
    __ cmp(tmp, rscratch1);
    __ b(*stub->entry(), Assembler::HI);
  }

  // FIXME: The logic in LIRGenerator::arraycopy_helper clears
  // length_positive_check if the source of our length operand is an
  // arraylength.  However, that arraylength might be zero, and the
  // stub that we're about to call contains an assertion that count !=
  // 0 .  So we make this check purely in order not to trigger an
  // assertion failure.
  __ cbz(length, *stub->continuation());

  if (flags & LIR_OpArrayCopy::type_check) {
    // We don't know the array types are compatible
    if (basic_type != T_OBJECT) {
      // Simple test for basic type arrays
      __ ldr(tmp, src_klass_addr);
      __ ldr(rscratch1, dst_klass_addr);
      __ cmp(tmp, rscratch1);
      __ b(*stub->entry(), Assembler::NE);
    } else {
      // For object arrays, if src is a sub class of dst then we can
      // safely do the copy.
      Label cont, slow;

      __ push(RegSet::of(src, dst), sp);

      __ load_klass(src, src);
      __ load_klass(dst, dst);

      __ check_klass_subtype_fast_path(src, dst, tmp, &cont, &slow, NULL);

      __ push(src); // sub
      __ push(dst); // super
      __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::slow_subtype_check_id)));
      // result on TOS
      __ pop(src); // result
      __ pop(dst);

      __ cbnz(src, cont);

      __ bind(slow);
      __ pop(RegSet::of(src, dst), sp);

      address copyfunc_addr = StubRoutines::checkcast_arraycopy();
      if (copyfunc_addr != NULL) { // use stub if available
        // src is not a sub class of dst so we have to do a
        // per-element check.

        int mask = LIR_OpArrayCopy::src_objarray|LIR_OpArrayCopy::dst_objarray;
        if ((flags & mask) != mask) {
          // Check that at least both of them object arrays.
          assert(flags & mask, "one of the two should be known to be an object array");

          if (!(flags & LIR_OpArrayCopy::src_objarray)) {
            __ load_klass(tmp, src);
          } else if (!(flags & LIR_OpArrayCopy::dst_objarray)) {
            __ load_klass(tmp, dst);
          }
          int lh_offset = in_bytes(Klass::layout_helper_offset());
          Address klass_lh_addr(tmp, lh_offset);
          jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
          __ ldr(rscratch1, klass_lh_addr);
          __ mov(rscratch2, objArray_lh);
          __ eor(rscratch1, rscratch1, rscratch2);
          __ cbnz(rscratch1, *stub->entry());
        }

       // Spill because stubs can use any register they like and it's
       // easier to restore just those that we care about.
        __ str(dst,     Address(sp, 0*BytesPerWord));
        __ str(dst_pos, Address(sp, 1*BytesPerWord));
        __ str(length,  Address(sp, 2*BytesPerWord));
        __ str(src_pos, Address(sp, 3*BytesPerWord));
        __ str(src,     Address(sp, 4*BytesPerWord));

        assert(dst_pos == r0, "assumed in the code below");
        __ mov(rscratch1, dst_pos); // save dst_pos which is r0
        __ lea(c_rarg0, Address(src, src_pos, lsl(scale)));
        __ add(c_rarg0, c_rarg0, arrayOopDesc::base_offset_in_bytes(basic_type));
        assert_different_registers(c_rarg0, dst, length);
        __ lea(c_rarg1, Address(dst, rscratch1, lsl(scale)));
        __ add(c_rarg1, c_rarg1, arrayOopDesc::base_offset_in_bytes(basic_type));
        assert_different_registers(c_rarg1, dst, length);

        __ load_klass(c_rarg2, dst);
        __ ldr(c_rarg2, Address(c_rarg2, ObjArrayKlass::element_klass_offset()));
        __ ldr(c_rarg3, Address(c_rarg2, Klass::super_check_offset_offset()));
        __ far_call(RuntimeAddress(copyfunc_addr));

#ifndef PRODUCT
        if (PrintC1Statistics) {
          Label failed;
          __ cbnz(r0, failed);
          __ increment(ExternalAddress((address)&Runtime1::_arraycopy_checkcast_cnt));
          __ bind(failed);
        }
#endif

        __ cbz(r0, *stub->continuation());

#ifndef PRODUCT
        if (PrintC1Statistics) {
          __ increment(ExternalAddress((address)&Runtime1::_arraycopy_checkcast_attempt_cnt));
        }
#endif
        assert_different_registers(dst, dst_pos, length, src_pos, src, rscratch1);
        __ mov(rscratch1, r0);

        // Restore previously spilled arguments
        __ ldr(dst,     Address(sp, 0*BytesPerWord));
        __ ldr(dst_pos, Address(sp, 1*BytesPerWord));
        __ ldr(length,  Address(sp, 2*BytesPerWord));
        __ ldr(src_pos, Address(sp, 3*BytesPerWord));
        __ ldr(src,     Address(sp, 4*BytesPerWord));

        // return value is -1^K where K is partial copied count
        __ mvn(rscratch1, rscratch1);
        // adjust length down and src/end pos up by partial copied count
        __ sub(length, length, rscratch1);
        __ add(src_pos, src_pos, rscratch1);
        __ add(dst_pos, dst_pos, rscratch1);
      }

      __ b(*stub->entry());

      __ bind(cont);
      __ pop(RegSet::of(src, dst), sp);
    }
  }

#ifdef ASSERT
  if (basic_type != T_OBJECT || !(flags & LIR_OpArrayCopy::type_check)) {
    // Sanity check the known type with the incoming class.  For the
    // primitive case the types must match exactly with src.klass and
    // dst.klass each exactly matching the default type.  For the
    // object array case, if no type check is needed then either the
    // dst type is exactly the expected type and the src type is a
    // subtype which we can't check or src is the same array as dst
    // but not necessarily exactly of type default_type.
    Label known_ok, halt;
    __ mov_metadata(tmp, default_type->constant_encoding());

    if (basic_type != T_OBJECT) {

      __ ldr(rscratch1, dst_klass_addr);
      __ cmp(tmp, rscratch1);
      __ b(halt, Assembler::NE);
      __ ldr(rscratch1, src_klass_addr);
      __ cmp(tmp, rscratch1);
      __ b(known_ok, Assembler::EQ);
    } else {
      __ ldr(rscratch1, dst_klass_addr);
      __ cmp(tmp, rscratch1);
      __ b(known_ok, Assembler::EQ);
      __ cmp(src, dst);
      __ b(known_ok, Assembler::EQ);
    }
    __ bind(halt);
    __ stop("incorrect type information in arraycopy");
    __ bind(known_ok);
  }
#endif

  assert(dst_pos == r0, "assumed in the code below");
  __ mov(rscratch1, dst_pos); // save r0
  __ lea(c_rarg0, Address(src, src_pos, lsl(scale)));
  __ add(c_rarg0, c_rarg0, arrayOopDesc::base_offset_in_bytes(basic_type));
  assert_different_registers(c_rarg0, dst, rscratch1, length);
  __ lea(c_rarg1, Address(dst, rscratch1, lsl(scale)));
  __ add(c_rarg1, c_rarg1, arrayOopDesc::base_offset_in_bytes(basic_type));
  assert_different_registers(c_rarg1, dst, length);
  __ mov(c_rarg2, length);

  bool disjoint = (flags & LIR_OpArrayCopy::overlapping) == 0;
  bool aligned = (flags & LIR_OpArrayCopy::unaligned) == 0;
  const char *name;
  address entry = StubRoutines::select_arraycopy_function(basic_type, aligned, disjoint, name, false);

  CodeBlob *cb = CodeCache::find_blob(entry);
  if (cb) {
    __ far_call(RuntimeAddress(entry));
  } else {
    __ call_VM_leaf(entry, 3);
  }

  __ bind(*stub->continuation());
}

void LIR_Assembler::emit_lock(LIR_OpLock* op) {
  Register obj = op->obj_opr()->as_register();  // may not be an oop
  Register hdr = op->hdr_opr()->as_register();
  Register lock = op->lock_opr()->as_register();
  if (!UseFastLocking) {
    __ b(*op->stub()->entry());
  } else if (op->code() == lir_lock) {
    Register scratch = noreg;
    if (UseBiasedLocking) {
      scratch = op->scratch_opr()->as_register();
    }
    assert(BasicLock::displaced_header_offset_in_bytes() == 0, "lock_reg must point to the displaced header");
    // add debug info for NullPointerException only if one is possible
    int null_check_offset = __ lock_object(hdr, obj, lock, scratch, *op->stub()->entry());
    if (op->info() != NULL) {
      add_debug_info_for_null_check(null_check_offset, op->info());
    }
    // done
  } else if (op->code() == lir_unlock) {
    assert(BasicLock::displaced_header_offset_in_bytes() == 0, "lock_reg must point to the displaced header");
    __ unlock_object(hdr, obj, lock, *op->stub()->entry());
  } else {
    Unimplemented();
  }
  __ bind(*op->stub()->continuation());
}


void LIR_Assembler::emit_profile_call(LIR_OpProfileCall* op) {
  ciMethod* method = op->profiled_method();
  int bci          = op->profiled_bci();
  ciMethod* callee = op->profiled_callee();

  // Update counter for all call types
  ciMethodData* md = method->method_data_or_null();
  assert(md != NULL, "Sanity");
  ciProfileData* data = md->bci_to_data(bci);
  assert(data->is_CounterData(), "need CounterData for calls");
  assert(op->mdo()->is_single_cpu(),  "mdo must be allocated");
  Register mdo  = op->mdo()->as_register();
  __ mov_metadata(mdo, md->constant_encoding());
  Address counter_addr(mdo, md->byte_offset_of_slot(data, CounterData::count_offset()));
  Bytecodes::Code bc = method->java_code_at_bci(bci);
  const bool callee_is_static = callee->is_loaded() && callee->is_static();
  // Perform additional virtual call profiling for invokevirtual and
  // invokeinterface bytecodes
  if ((bc == Bytecodes::_invokevirtual || bc == Bytecodes::_invokeinterface) &&
      !callee_is_static &&  // required for optimized MH invokes
      C1ProfileVirtualCalls) {
    assert(op->recv()->is_single_cpu(), "recv must be allocated");
    Register recv = op->recv()->as_register();
    assert_different_registers(mdo, recv);
    assert(data->is_VirtualCallData(), "need VirtualCallData for virtual calls");
    ciKlass* known_klass = op->known_holder();
    if (C1OptimizeVirtualCallProfiling && known_klass != NULL) {
      // We know the type that will be seen at this call site; we can
      // statically update the MethodData* rather than needing to do
      // dynamic tests on the receiver type

      // NOTE: we should probably put a lock around this search to
      // avoid collisions by concurrent compilations
      ciVirtualCallData* vc_data = (ciVirtualCallData*) data;
      uint i;
      for (i = 0; i < VirtualCallData::row_limit(); i++) {
        ciKlass* receiver = vc_data->receiver(i);
        if (known_klass->equals(receiver)) {
          Address data_addr(mdo, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i)));
          __ addptr(data_addr, DataLayout::counter_increment);
          return;
        }
      }

      // Receiver type not found in profile data; select an empty slot

      // Note that this is less efficient than it should be because it
      // always does a write to the receiver part of the
      // VirtualCallData rather than just the first time
      for (i = 0; i < VirtualCallData::row_limit(); i++) {
        ciKlass* receiver = vc_data->receiver(i);
        if (receiver == NULL) {
          Address recv_addr(mdo, md->byte_offset_of_slot(data, VirtualCallData::receiver_offset(i)));
          __ mov_metadata(rscratch1, known_klass->constant_encoding());
          __ lea(rscratch2, recv_addr);
          __ str(rscratch1, Address(rscratch2));
          Address data_addr(mdo, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i)));
          __ addptr(data_addr, DataLayout::counter_increment);
          return;
        }
      }
    } else {
      __ load_klass(recv, recv);
      Label update_done;
      type_profile_helper(mdo, md, data, recv, &update_done);
      // Receiver did not match any saved receiver and there is no empty row for it.
      // Increment total counter to indicate polymorphic case.
      __ addptr(counter_addr, DataLayout::counter_increment);

      __ bind(update_done);
    }
  } else {
    // Static call
    __ addptr(counter_addr, DataLayout::counter_increment);
  }
}


void LIR_Assembler::emit_delay(LIR_OpDelay*) {
  Unimplemented();
}


void LIR_Assembler::monitor_address(int monitor_no, LIR_Opr dst) {
  __ lea(dst->as_register(), frame_map()->address_for_monitor_lock(monitor_no));
}

void LIR_Assembler::emit_updatecrc32(LIR_OpUpdateCRC32* op) {
  assert(op->crc()->is_single_cpu(),  "crc must be register");
  assert(op->val()->is_single_cpu(),  "byte value must be register");
  assert(op->result_opr()->is_single_cpu(), "result must be register");
  Register crc = op->crc()->as_register();
  Register val = op->val()->as_register();
  Register res = op->result_opr()->as_register();

  assert_different_registers(val, crc, res);
  __ lea(res, ExternalAddress(StubRoutines::crc_table_addr()));

  __ inv(crc, crc);
  __ update_byte_crc32(crc, val, res);
  __ inv(res, crc);
}

void LIR_Assembler::emit_profile_type(LIR_OpProfileType* op) {
  COMMENT("emit_profile_type {");
  Register obj = op->obj()->as_register();
  Register tmp = op->tmp()->as_pointer_register();
  Address mdo_addr = as_Address(op->mdp()->as_address_ptr(), noreg, Address::IDT_INT);
  ciKlass* exact_klass = op->exact_klass();
  intptr_t current_klass = op->current_klass();
  bool not_null = op->not_null();
  bool no_conflict = op->no_conflict();

  Label update, next, none;

  bool do_null = !not_null;
  bool exact_klass_set = exact_klass != NULL && ciTypeEntries::valid_ciklass(current_klass) == exact_klass;
  bool do_update = !TypeEntries::is_type_unknown(current_klass) && !exact_klass_set;

  assert(do_null || do_update, "why are we here?");
  assert(!TypeEntries::was_null_seen(current_klass) || do_update, "why are we here?");
  assert(mdo_addr.base() != rscratch1, "wrong register");

  __ verify_oop(obj);

  if (tmp != obj) {
    __ mov(tmp, obj);
  }
  if (do_null) {
    __ cbnz(tmp, update);
    if (!TypeEntries::was_null_seen(current_klass)) {
      __ ldr(rscratch2, mdo_addr);
      __ orr(rscratch2, rscratch2, TypeEntries::null_seen);
      __ str(rscratch2, mdo_addr);
    }
    if (do_update) {
#ifndef ASSERT
      __ b(next);
    }
#else
      __ b(next);
    }
  } else {
    __ cbnz(tmp, update);
    __ stop("unexpected null obj");
#endif
  }

  __ bind(update);

  if (do_update) {
#ifdef ASSERT
    if (exact_klass != NULL) {
      Label ok;
      __ load_klass(tmp, tmp);
      __ mov_metadata(rscratch1, exact_klass->constant_encoding());
      __ eor(rscratch1, tmp, rscratch1);
      __ cbz(rscratch1, ok);
      __ stop("exact klass and actual klass differ");
      __ bind(ok);
    }
#endif
    if (!no_conflict) {
      if (exact_klass == NULL || TypeEntries::is_type_none(current_klass)) {
        if (exact_klass != NULL) {
          __ mov_metadata(tmp, exact_klass->constant_encoding());
        } else {
          __ load_klass(tmp, tmp);
        }

        __ ldr(rscratch2, mdo_addr);
        __ eor(tmp, tmp, rscratch2);
        __ andr(rscratch1, tmp, TypeEntries::type_klass_mask);
        // klass seen before, nothing to do. The unknown bit may have been
        // set already but no need to check.
        __ cbz(rscratch1, next);

        __ andr(rscratch1, tmp, TypeEntries::type_unknown);
        __ cbnz(rscratch1, next); // already unknown. Nothing to do anymore.

        if (TypeEntries::is_type_none(current_klass)) {
          __ cbz(rscratch2, none);
          __ cmp(rscratch2, TypeEntries::null_seen);
          __ b(none, Assembler::EQ);
          // There is a chance that the checks above (re-reading profiling
          // data from memory) fail if another thread has just set the
          // profiling to this obj's klass
          __ dmb(Assembler::ISH);
          __ ldr(rscratch2, mdo_addr);
          __ eor(tmp, tmp, rscratch2);
          __ andr(rscratch1, tmp, TypeEntries::type_klass_mask);
          __ cbz(rscratch1, next);
        }
      } else {
        assert(ciTypeEntries::valid_ciklass(current_klass) != NULL &&
               ciTypeEntries::valid_ciklass(current_klass) != exact_klass, "conflict only");

        __ ldr(tmp, mdo_addr);
        __ andr(rscratch1, tmp, TypeEntries::type_unknown);
        __ cbnz(rscratch1, next); // already unknown. Nothing to do anymore.
      }

      // different than before. Cannot keep accurate profile.
      __ ldr(rscratch2, mdo_addr);
      __ orr(rscratch2, rscratch2, TypeEntries::type_unknown);
      __ str(rscratch2, mdo_addr);

      if (TypeEntries::is_type_none(current_klass)) {
        __ b(next);

        __ bind(none);
        // first time here. Set profile type.
        __ str(tmp, mdo_addr);
      }
    } else {
      // There's a single possible klass at this profile point
      assert(exact_klass != NULL, "should be");
      if (TypeEntries::is_type_none(current_klass)) {
        __ mov_metadata(tmp, exact_klass->constant_encoding());
        __ ldr(rscratch2, mdo_addr);
        __ eor(tmp, tmp, rscratch2);
        __ andr(rscratch1, tmp, TypeEntries::type_klass_mask);
        __ cbz(rscratch1, next);
#ifdef ASSERT
        {
          Label ok;
          __ ldr(rscratch1, mdo_addr);
          __ cbz(rscratch1, ok);
          __ cmp(rscratch1, TypeEntries::null_seen);
          __ b(ok, Assembler::EQ);
          // may have been set by another thread
          __ dmb(Assembler::ISH);
          __ mov_metadata(rscratch1, exact_klass->constant_encoding());
          __ ldr(rscratch2, mdo_addr);
          __ eor(rscratch2, rscratch1, rscratch2);
          __ andr(rscratch2, rscratch2, TypeEntries::type_mask);
          __ cbz(rscratch2, ok);

          __ stop("unexpected profiling mismatch");
          __ bind(ok);
        }
#endif
        // first time here. Set profile type.
        __ ldr(tmp, mdo_addr);
      } else {
        assert(ciTypeEntries::valid_ciklass(current_klass) != NULL &&
               ciTypeEntries::valid_ciklass(current_klass) != exact_klass, "inconsistent");

        __ ldr(tmp, mdo_addr);
        __ andr(rscratch1, tmp, TypeEntries::type_unknown);
        __ cbnz(rscratch1, next); // already unknown. Nothing to do anymore.

        __ orr(tmp, tmp, TypeEntries::type_unknown);
        __ str(tmp, mdo_addr);
        // FIXME: Write barrier needed here?
      }
    }

    __ bind(next);
  }
  COMMENT("} emit_profile_type");
}


void LIR_Assembler::align_backward_branch_target() {
}


void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest) {
  if (left->is_single_cpu()) {
    assert(dest->is_single_cpu(), "expect single result reg");
    __ neg(dest->as_register(), left->as_register());
  } else if (left->is_double_cpu()) {
    assert(dest->is_double_cpu(), "expect double result reg");
    const Register l_lo = left->as_register_lo();
    Register l_hi = left->as_register_hi();
    check_register_collision(dest->as_register_lo(), &l_hi);
    __ rsbs(dest->as_register_lo(), l_lo, 0);
    __ rsc(dest->as_register_hi(), l_hi, 0);
  } else if (left->is_single_fpu()) {
    assert(dest->is_single_fpu(), "expect single float result reg");
    __ vneg_f32(dest->as_float_reg(), left->as_float_reg());
  } else {
    assert(left->is_double_fpu(), "expect double float operand reg");
    assert(dest->is_double_fpu(), "expect double float result reg");
    __ vneg_f64(dest->as_double_reg(), left->as_double_reg());
  }
}


void LIR_Assembler::leal(LIR_Opr addr, LIR_Opr dest) {
  __ lea(dest->as_register(), as_Address(addr->as_address_ptr(), noreg, Address::IDT_LEA));
}


void LIR_Assembler::rt_call(LIR_Opr result, address dest, const LIR_OprList* args, LIR_Opr tmp, CodeEmitInfo* info) {
  assert(!tmp->is_valid(), "don't need temporary");
  CodeBlob *cb = CodeCache::find_blob(dest);
  if (cb) {
    __ far_call(RuntimeAddress(dest));
  } else {
    __ lea(rscratch1, RuntimeAddress(dest));
    __ bl(rscratch1);
  }
  if (info != NULL) {
    add_call_info_here(info);
  }
  __ maybe_isb();
}

void LIR_Assembler::volatile_move_op(LIR_Opr src, LIR_Opr dest, BasicType type, CodeEmitInfo* info) {
  if (type == T_LONG || type == T_DOUBLE) {
    const LIR_Opr long_val = FrameMap::long0_opr;

    int null_check_offset = -1;

    if (src->is_register() && dest->is_address()) {
      // long1 reserved as temp by LinearScan::pd_add_temps
      const LIR_Opr long_tmp = FrameMap::long1_opr;
      __ lea(rscratch1, as_Address_lo(dest->as_address_ptr(), Address::IDT_LEA));

      if (type == T_DOUBLE) {
        // long0 reserved as temp by LinearScan::pd_add_temps
        __ vmov_f64(long_val->as_register_lo(), long_val->as_register_hi(), src->as_double_reg());
      } else {
        assert(type == T_LONG && src->is_same_register(long_val), "T_LONG src should be in long0 (by LIRGenerator)");
      }

      null_check_offset = __ offset();
      __ atomic_strd(long_val->as_register_lo(), long_val->as_register_hi(), rscratch1,
          long_tmp->as_register_lo(), long_tmp->as_register_hi());

    } else if (src->is_address() && dest->is_register()) {
      __ lea(rscratch1, as_Address_lo(src->as_address_ptr(), Address::IDT_LEA));

      null_check_offset = __ offset();
      __ atomic_ldrd(long_val->as_register_lo(), long_val->as_register_hi(), rscratch1);

      if (type == T_DOUBLE) {
        __ vmov_f64(dest->as_double_reg(), long_val->as_register_lo(), long_val->as_register_hi());
      } else {
        assert(type != T_LONG || dest->is_same_register(long_val), "T_LONG dest should be in long0 (by LIRGenerator)");
      }
    } else {
      Unimplemented();
    }

    if (info != NULL) {
      add_debug_info_for_null_check(null_check_offset, info);
    }

  } else {
    move_op(src, dest, type, lir_patch_none, info,
            /*pop_fpu_stack*/false, /*unaligned*/false, /*wide*/false);
  }
}

#ifdef ASSERT
// emit run-time assertion
void LIR_Assembler::emit_assert(LIR_OpAssert* op) {
  assert(op->code() == lir_assert, "must be");

  if (op->in_opr1()->is_valid()) {
    assert(op->in_opr2()->is_valid(), "both operands must be valid");
    comp_op(op->condition(), op->in_opr1(), op->in_opr2(), op);
  } else {
    assert(op->in_opr2()->is_illegal(), "both operands must be illegal");
    assert(op->condition() == lir_cond_always, "no other conditions allowed");
  }

  Label ok;
  if (op->condition() != lir_cond_always) {
    Assembler::Condition acond = Assembler::AL;
    switch (op->condition()) {
      case lir_cond_equal:        acond = Assembler::EQ;  break;
      case lir_cond_notEqual:     acond = Assembler::NE;  break;
      case lir_cond_less:         acond = Assembler::LT;  break;
      case lir_cond_greaterEqual: acond = Assembler::GE;  break;
      case lir_cond_lessEqual:    acond = Assembler::LE;  break;
      case lir_cond_greater:      acond = Assembler::GT;  break;
      case lir_cond_belowEqual:   acond = Assembler::LS;  break;
      case lir_cond_aboveEqual:   acond = Assembler::HS;  break;
      default:                    ShouldNotReachHere();
    }
    if (op->in_opr1()->type() == T_LONG) {
      // a special trick here to be able to effectively compare jlongs
      // for the lessEqual and greater conditions the jlong operands are swapped
      // during comparison and hence should use mirror condition in conditional
      // instruction
      // see LIR_Assembler::comp_op and LIR_Assembler::cmove
      switch (op->condition()) {
        case lir_cond_lessEqual:    acond = Assembler::GE;  break;
        case lir_cond_greater:      acond = Assembler::LT;  break;
      }
    }
    __ b(ok, acond);
  }
  if (op->halt()) {
    const char* str = __ code_string(op->msg());
    __ stop(str);
  } else {
    breakpoint();
  }
  __ bind(ok);
}
#endif

#ifndef PRODUCT
#define COMMENT(x)   do { __ block_comment(x); } while (0)
#else
#define COMMENT(x)
#endif

void LIR_Assembler::membar() {
  COMMENT("membar");
  __ membar(MacroAssembler::AnyAny);
}

void LIR_Assembler::membar_acquire() {
  __ membar(Assembler::LoadLoad|Assembler::LoadStore);
}

void LIR_Assembler::membar_release() {
  __ membar(Assembler::LoadStore|Assembler::StoreStore);
}

void LIR_Assembler::membar_loadload() {
  __ membar(Assembler::LoadLoad);
}

void LIR_Assembler::membar_storestore() {
  __ membar(MacroAssembler::StoreStore);
}

void LIR_Assembler::membar_loadstore() { __ membar(MacroAssembler::LoadStore); }

void LIR_Assembler::membar_storeload() { __ membar(MacroAssembler::StoreLoad); }

void LIR_Assembler::get_thread(LIR_Opr result_reg) {
  __ mov(result_reg->as_register(), rthread);
}


void LIR_Assembler::peephole(LIR_List *lir) {
#if 0
  if (tableswitch_count >= max_tableswitches)
    return;

  /*
    This finite-state automaton recognizes sequences of compare-and-
    branch instructions.  We will turn them into a tableswitch.  You
    could argue that C1 really shouldn't be doing this sort of
    optimization, but without it the code is really horrible.
  */

  enum { start_s, cmp1_s, beq_s, cmp_s } state;
  int first_key, last_key = -2147483648;
  int next_key = 0;
  int start_insn = -1;
  int last_insn = -1;
  Register reg = noreg;
  LIR_Opr reg_opr;
  state = start_s;

  LIR_OpList* inst = lir->instructions_list();
  for (int i = 0; i < inst->length(); i++) {
    LIR_Op* op = inst->at(i);
    switch (state) {
    case start_s:
      first_key = -1;
      start_insn = i;
      switch (op->code()) {
      case lir_cmp:
        LIR_Opr opr1 = op->as_Op2()->in_opr1();
        LIR_Opr opr2 = op->as_Op2()->in_opr2();
        if (opr1->is_cpu_register() && opr1->is_single_cpu()
            && opr2->is_constant()
            && opr2->type() == T_INT) {
          reg_opr = opr1;
          reg = opr1->as_register();
          first_key = opr2->as_constant_ptr()->as_jint();
          next_key = first_key + 1;
          state = cmp_s;
          goto next_state;
        }
        break;
      }
      break;
    case cmp_s:
      switch (op->code()) {
      case lir_branch:
        if (op->as_OpBranch()->cond() == lir_cond_equal) {
          state = beq_s;
          last_insn = i;
          goto next_state;
        }
      }
      state = start_s;
      break;
    case beq_s:
      switch (op->code()) {
      case lir_cmp: {
        LIR_Opr opr1 = op->as_Op2()->in_opr1();
        LIR_Opr opr2 = op->as_Op2()->in_opr2();
        if (opr1->is_cpu_register() && opr1->is_single_cpu()
            && opr1->as_register() == reg
            && opr2->is_constant()
            && opr2->type() == T_INT
            && opr2->as_constant_ptr()->as_jint() == next_key) {
          last_key = next_key;
          next_key++;
          state = cmp_s;
          goto next_state;
        }
      }
      }
      last_key = next_key;
      state = start_s;
      break;
    default:
      assert(false, "impossible state");
    }
    if (state == start_s) {
      if (first_key < last_key - 5L && reg != noreg) {
        {
          // printf("found run register %d starting at insn %d low value %d high value %d\n",
          //        reg->encoding(),
          //        start_insn, first_key, last_key);
          //   for (int i = 0; i < inst->length(); i++) {
          //     inst->at(i)->print();
          //     tty->print("\n");
          //   }
          //   tty->print("\n");
        }

        struct tableswitch *sw = &switches[tableswitch_count];
        sw->_insn_index = start_insn, sw->_first_key = first_key,
          sw->_last_key = last_key, sw->_reg = reg;
        inst->insert_before(last_insn + 1, new LIR_OpLabel(&sw->_after));
        {
          // Insert the new table of branches
          int offset = last_insn;
          for (int n = first_key; n < last_key; n++) {
            inst->insert_before
              (last_insn + 1,
               new LIR_OpBranch(lir_cond_always, T_ILLEGAL,
                                inst->at(offset)->as_OpBranch()->label()));
            offset -= 2, i++;
          }
        }
        // Delete all the old compare-and-branch instructions
        for (int n = first_key; n < last_key; n++) {
          inst->remove_at(start_insn);
          inst->remove_at(start_insn);
        }
        // Insert the tableswitch instruction
        inst->insert_before(start_insn,
                            new LIR_Op2(lir_cmp, lir_cond_always,
                                        LIR_OprFact::intConst(tableswitch_count),
                                        reg_opr));
        inst->insert_before(start_insn + 1, new LIR_OpLabel(&sw->_branches));
        tableswitch_count++;
      }
      reg = noreg;
      last_key = -2147483648;
    }
  next_state:
    ;
  }
#endif
}

void LIR_Assembler::atomic_op(LIR_Code code, LIR_Opr src, LIR_Opr data, LIR_Opr dest, LIR_Opr tmp_op) {
  BasicType type = src->type();
  Address addr = as_Address(src->as_address_ptr(), Address::toInsnDataType(type));

  bool is_long = false;

  switch(type) {
  case T_INT:
  case T_OBJECT:
  case T_ARRAY:
    break;
  case T_LONG:
    is_long = true;
    break;
  default:
    ShouldNotReachHere();
  }

  switch (code) {
  case lir_xadd:
    {
      Register tmp = tmp_op->as_register();
      Register dst = as_reg(dest);
      Label again;
      __ lea(tmp, addr);
      __ bind(again);
      if(is_long) {
          assert(dest->as_register_lo()->successor() == dest->as_register_hi(), "must be contiguous");
          assert((dest->as_register_lo()->encoding() & 1) == 0, "must be even");
          _masm->ldrexd(dst, tmp);
      } else {
          _masm->ldrex(dst, tmp);
      }
      arith_op(lir_add, dest, data, dest, NULL, false);
      if (is_long) {
        _masm->strexd(rscratch1, dst, tmp);
      } else {
        _masm->strex(rscratch1, dst, tmp);
      }
      __ cbnz(rscratch1, again);
      arith_op(lir_sub, dest, data, dest, NULL, false);
      break;
    }
  case lir_xchg:
    {
      Register tmp = tmp_op->as_register();
      Register obj = as_reg(data);
      Register dst = as_reg(dest);
      assert_different_registers(obj, addr.base(), tmp, rscratch1, dst);
      Label again;
      __ lea(tmp, addr);
      __ bind(again);
      if(is_long) {
          assert(dest->as_register_lo()->successor() == dest->as_register_hi(), "must be contiguous");
          assert((dest->as_register_lo()->encoding() & 1) == 0, "must be even");

          assert(data->is_double_cpu(), "should be double register");
          assert(data->as_register_lo()->successor() == data->as_register_hi(), "must be contiguous");
          assert((data->as_register_lo()->encoding() & 1) == 0, "must be even");

          _masm->ldrexd(dst, tmp);
          _masm->strexd(rscratch1, obj, tmp);
      } else {
         _masm->ldrex(dst, tmp);
         _masm->strex(rscratch1, obj, tmp);
      }
      __ cbnz(rscratch1, again);
    }
    break;
  default:
    ShouldNotReachHere();
  }
  __ membar(__ AnyAny);
}

void LIR_Assembler::check_register_collision(Register d, Register *s1, Register *s2, Register tmp) {
  // use a temp if any of the registers used as a source of operation
  // collide with result register of the prerequisite operation
  if (d == *s1) {
    __ mov(tmp, d);
    *s1 = tmp;
  } else if (s2 && d == *s2) {
    __ mov(tmp, d);
    *s2 = tmp;
  }
}

#undef __